diff options
Diffstat (limited to 'usr/src')
201 files changed, 58700 insertions, 91699 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c index 96bcec530c..1919d21356 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c @@ -196,6 +196,7 @@ static void ire_report(const mib_item_t *item); static void tcp_report(const mib_item_t *item); static void udp_report(const mib_item_t *item); static void group_report(mib_item_t *item); +static void dce_report(mib_item_t *item); static void print_ip_stats(mib2_ip_t *ip); static void print_icmp_stats(mib2_icmp_t *icmp); static void print_ip6_stats(mib2_ipv6IfStatsEntry_t *ip6); @@ -236,7 +237,7 @@ static void fatal(int errcode, char *str1, ...); static boolean_t Aflag = B_FALSE; /* All sockets/ifs/rtng-tbls */ -static boolean_t Dflag = B_FALSE; /* Debug Info */ +static boolean_t Dflag = B_FALSE; /* DCE info */ static boolean_t Iflag = B_FALSE; /* IP Traffic Interfaces */ static boolean_t Mflag = B_FALSE; /* STREAMS Memory Statistics */ static boolean_t Nflag = B_FALSE; /* Numeric Network Addresses */ @@ -248,6 +249,7 @@ static boolean_t Pflag = B_FALSE; /* Net to Media Tables */ static boolean_t Gflag = B_FALSE; /* Multicast group membership */ static boolean_t MMflag = B_FALSE; /* Multicast routing table */ static boolean_t DHCPflag = B_FALSE; /* DHCP statistics */ +static boolean_t Xflag = B_FALSE; /* Debug Info */ static int v4compat = 0; /* Compatible printing format for status */ @@ -276,6 +278,8 @@ static int ipv6NetToMediaEntrySize; static int ipv6MemberEntrySize; static int ipv6GroupSourceEntrySize; +static int ipDestEntrySize; + static int transportMLPSize; static int tcpConnEntrySize; static int tcp6ConnEntrySize; @@ -298,7 +302,7 @@ static m_label_t *zone_security_label = NULL; /* Flags on routes */ #define FLF_A 0x00000001 -#define FLF_B 0x00000002 +#define FLF_b 0x00000002 #define FLF_D 0x00000004 #define FLF_G 0x00000008 #define FLF_H 0x00000010 @@ -306,7 +310,12 @@ static m_label_t 
*zone_security_label = NULL; #define FLF_U 0x00000040 #define FLF_M 0x00000080 #define FLF_S 0x00000100 -static const char flag_list[] = "ABDGHLUMS"; +#define FLF_C 0x00000200 /* IRE_IF_CLONE */ +#define FLF_I 0x00000400 /* RTF_INDIRECT */ +#define FLF_R 0x00000800 /* RTF_REJECT */ +#define FLF_B 0x00001000 /* RTF_BLACKHOLE */ + +static const char flag_list[] = "AbDGHLUMSCIRB"; typedef struct filter_rule filter_t; @@ -379,14 +388,15 @@ main(int argc, char **argv) (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); - while ((c = getopt(argc, argv, "adimnrspMgvf:P:I:DRT:")) != -1) { + while ((c = getopt(argc, argv, "adimnrspMgvxf:P:I:DRT:")) != -1) { switch ((char)c) { case 'a': /* all connections */ Aflag = B_TRUE; break; - case 'd': /* turn on debugging */ + case 'd': /* DCE info */ Dflag = B_TRUE; + IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */ break; case 'i': /* interface (ill/ipif report) */ @@ -438,6 +448,10 @@ main(int argc, char **argv) IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */ break; + case 'x': /* turn on debugging */ + Xflag = B_TRUE; + break; + case 'f': process_filter(optarg); break; @@ -603,7 +617,7 @@ main(int argc, char **argv) mib_item_destroy(&previtem); } - if (!(Iflag || Rflag || Sflag || Mflag || + if (!(Dflag || Iflag || Rflag || Sflag || Mflag || MMflag || Pflag || Gflag || DHCPflag)) { if (protocol_selected(IPPROTO_UDP)) udp_report(item); @@ -634,12 +648,14 @@ main(int argc, char **argv) if (family_selected(AF_INET6)) ndp_report(item); } + if (Dflag) + dce_report(item); mib_item_destroy(&curritem); } /* netstat: AF_UNIX behaviour */ if (family_selected(AF_UNIX) && - (!(Iflag || Rflag || Sflag || Mflag || + (!(Dflag || Iflag || Rflag || Sflag || Mflag || MMflag || Pflag || Gflag))) unixpr(kc); (void) kstat_close(kc); @@ -729,7 +745,7 @@ mibget(int sd) * us information concerning IRE_MARK_TESTHIDDEN routes. 
*/ req = (struct opthdr *)&tor[1]; - req->level = EXPER_IP_AND_TESTHIDDEN; + req->level = EXPER_IP_AND_ALL_IRES; req->name = 0; req->len = 0; @@ -755,7 +771,7 @@ mibget(int sd) getcode = getmsg(sd, &ctlbuf, (struct strbuf *)0, &flags); if (getcode == -1) { perror("mibget getmsg(ctl) failed"); - if (Dflag) { + if (Xflag) { (void) fputs("# level name len\n", stderr); i = 0; @@ -774,7 +790,7 @@ mibget(int sd) toa->PRIM_type == T_OPTMGMT_ACK && toa->MGMT_flags == T_SUCCESS && req->len == 0) { - if (Dflag) + if (Xflag) (void) printf("mibget getmsg() %d returned " "EOD (level %ld, name %ld)\n", j, req->level, req->name); @@ -826,7 +842,7 @@ mibget(int sd) last_item->valp = malloc((int)req->len); if (last_item->valp == NULL) goto error_exit; - if (Dflag) + if (Xflag) (void) printf("msg %d: group = %4d mib_id = %5d" "length = %d\n", j, last_item->group, last_item->mib_id, @@ -1754,6 +1770,7 @@ mib_get_constants(mib_item_t *item) ipGroupSourceEntrySize = ip->ipGroupSourceEntrySize; ipRouteAttributeSize = ip->ipRouteAttributeSize; transportMLPSize = ip->transportMLPSize; + ipDestEntrySize = ip->ipDestEntrySize; assert(IS_P2ALIGNED(ipAddrEntrySize, sizeof (mib2_ipAddrEntry_t *))); assert(IS_P2ALIGNED(ipRouteEntrySize, @@ -1850,7 +1867,7 @@ mib_get_constants(mib_item_t *item) } } /* 'for' loop 1 ends */ - if (Dflag) { + if (Xflag) { (void) puts("mib_get_constants:"); (void) printf("\tipv6IfStatsEntrySize %d\n", ipv6IfStatsEntrySize); @@ -1872,6 +1889,7 @@ mib_get_constants(mib_item_t *item) ipv6MemberEntrySize); (void) printf("\tipv6IfIcmpEntrySize %d\n", ipv6IfIcmpEntrySize); + (void) printf("\tipDestEntrySize %d\n", ipDestEntrySize); (void) printf("\ttransportMLPSize %d\n", transportMLPSize); (void) printf("\ttcpConnEntrySize %d\n", tcpConnEntrySize); (void) printf("\ttcp6ConnEntrySize %d\n", tcp6ConnEntrySize); @@ -1895,7 +1913,7 @@ stat_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- 
Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2542,7 +2560,7 @@ mrt_stat_report(mib_item_t *curritem) for (tempitem = curritem; tempitem; tempitem = tempitem->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2603,7 +2621,7 @@ if_report(mib_item_t *item, char *matchname, /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2632,7 +2650,7 @@ if_report(mib_item_t *item, char *matchname, boolean_t first = B_TRUE; uint32_t new_ifindex; - if (Dflag) + if (Xflag) (void) printf("if_report: %d items\n", (item->length) / sizeof (mib2_ipAddrEntry_t)); @@ -2944,7 +2962,7 @@ if_report(mib_item_t *item, char *matchname, boolean_t first = B_TRUE; uint32_t new_ifindex; - if (Dflag) + if (Xflag) (void) printf("if_report: %d items\n", (item->length) / sizeof (mib2_ipv6AddrEntry_t)); @@ -3287,10 +3305,10 @@ if_report_ip4(mib2_ipAddrEntry_t *ap, (void) pr_netaddr(ap->ipAdEntAddr, ap->ipAdEntNetMask, abuf, sizeof (abuf)); - (void) printf("%-13s %-14s %-6llu %-5s %-6llu " + (void) printf("%-13s %-14s %-6llu %-5s %-6s " "%-5s %-6s %-6llu\n", abuf, pr_addr(ap->ipAdEntAddr, dstbuf, sizeof (dstbuf)), - statptr->ipackets, "N/A", statptr->opackets, "N/A", "N/A", + statptr->ipackets, "N/A", "N/A", "N/A", "N/A", 0LL); } } @@ -3337,11 +3355,10 @@ if_report_ip6(mib2_ipv6AddrEntry_t *ap6, else (void) pr_prefix6(&ap6->ipv6AddrAddress, ap6->ipv6AddrPfxLength, abuf, sizeof (abuf)); - (void) printf("%-27s %-27s %-6llu %-5s %-6llu %-5s %-6s\n", + (void) printf("%-27s %-27s %-6llu %-5s %-6s %-5s %-6s\n", abuf, pr_addr6(&ap6->ipv6AddrAddress, dstbuf, sizeof (dstbuf)), - statptr->ipackets, "N/A", - statptr->opackets, "N/A", "N/A"); + statptr->ipackets, "N/A", "N/A", "N/A", 
"N/A"); } } @@ -3490,7 +3507,7 @@ group_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3501,12 +3518,12 @@ group_report(mib_item_t *item) switch (item->mib_id) { case EXPER_IP_GROUP_MEMBERSHIP: v4grp = item; - if (Dflag) + if (Xflag) (void) printf("item is v4grp info\n"); break; case EXPER_IP_GROUP_SOURCES: v4src = item; - if (Dflag) + if (Xflag) (void) printf("item is v4src info\n"); break; default: @@ -3518,12 +3535,12 @@ group_report(mib_item_t *item) switch (item->mib_id) { case EXPER_IP6_GROUP_MEMBERSHIP: v6grp = item; - if (Dflag) + if (Xflag) (void) printf("item is v6grp info\n"); break; case EXPER_IP6_GROUP_SOURCES: v6src = item; - if (Dflag) + if (Xflag) (void) printf("item is v6src info\n"); break; default: @@ -3533,7 +3550,7 @@ group_report(mib_item_t *item) } if (family_selected(AF_INET) && v4grp != NULL) { - if (Dflag) + if (Xflag) (void) printf("%u records for ipGroupMember:\n", v4grp->length / sizeof (ip_member_t)); @@ -3564,7 +3581,7 @@ group_report(mib_item_t *item) if (!Vflag || v4src == NULL) continue; - if (Dflag) + if (Xflag) (void) printf("scanning %u ipGroupSource " "records...\n", v4src->length/sizeof (ip_grpsrc_t)); @@ -3609,7 +3626,7 @@ group_report(mib_item_t *item) } if (family_selected(AF_INET6) && v6grp != NULL) { - if (Dflag) + if (Xflag) (void) printf("%u records for ipv6GroupMember:\n", v6grp->length / sizeof (ipv6_member_t)); @@ -3638,7 +3655,7 @@ group_report(mib_item_t *item) if (!Vflag || v6src == NULL) continue; - if (Dflag) + if (Xflag) (void) printf("scanning %u ipv6GroupSource " "records...\n", v6src->length/sizeof (ipv6_grpsrc_t)); @@ -3683,6 +3700,126 @@ group_report(mib_item_t *item) (void) fflush(stdout); } +/* --------------------- DCE_REPORT (netstat -d) ------------------------- */ + +#define FLBUFSIZE 8 + +/* Assumes flbuf is at 
least 5 characters; callers use FLBUFSIZE */ +static char * +dceflags2str(uint32_t flags, char *flbuf) +{ + char *str = flbuf; + + if (flags & DCEF_DEFAULT) + *str++ = 'D'; + if (flags & DCEF_PMTU) + *str++ = 'P'; + if (flags & DCEF_UINFO) + *str++ = 'U'; + if (flags & DCEF_TOO_SMALL_PMTU) + *str++ = 'S'; + *str++ = '\0'; + return (flbuf); +} + +static void +dce_report(mib_item_t *item) +{ + mib_item_t *v4dce = NULL; + mib_item_t *v6dce = NULL; + int jtemp = 0; + char ifname[LIFNAMSIZ + 1]; + char abuf[MAXHOSTNAMELEN + 1]; + char flbuf[FLBUFSIZE]; + boolean_t first; + dest_cache_entry_t *dce; + + /* 'for' loop 1: */ + for (; item; item = item->next_item) { + if (Xflag) { + (void) printf("\n--- Entry %d ---\n", ++jtemp); + (void) printf("Group = %d, mib_id = %d, " + "length = %d, valp = 0x%p\n", + item->group, item->mib_id, item->length, + item->valp); + } + if (item->group == MIB2_IP && family_selected(AF_INET) && + item->mib_id == EXPER_IP_DCE) { + v4dce = item; + if (Xflag) + (void) printf("item is v4dce info\n"); + } + if (item->group == MIB2_IP6 && family_selected(AF_INET6) && + item->mib_id == EXPER_IP_DCE) { + v6dce = item; + if (Xflag) + (void) printf("item is v6dce info\n"); + } + } + + if (family_selected(AF_INET) && v4dce != NULL) { + if (Xflag) + (void) printf("%u records for DestCacheEntry:\n", + v4dce->length / ipDestEntrySize); + + first = B_TRUE; + for (dce = (dest_cache_entry_t *)v4dce->valp; + (char *)dce < (char *)v4dce->valp + v4dce->length; + /* LINTED: (note 1) */ + dce = (dest_cache_entry_t *)((char *)dce + + ipDestEntrySize)) { + if (first) { + (void) putchar('\n'); + (void) puts("Destination Cache Entries: IPv4"); + (void) puts( + "Address PMTU Age Flags"); + (void) puts( + "-------------------- ------ ----- -----"); + first = B_FALSE; + } + + (void) printf("%-20s %6u %5u %-5s\n", + pr_addr(dce->DestIpv4Address, abuf, sizeof (abuf)), + dce->DestPmtu, dce->DestAge, + dceflags2str(dce->DestFlags, flbuf)); + } + } + + if 
(family_selected(AF_INET6) && v6dce != NULL) { + if (Xflag) + (void) printf("%u records for DestCacheEntry:\n", + v6dce->length / ipDestEntrySize); + + first = B_TRUE; + for (dce = (dest_cache_entry_t *)v6dce->valp; + (char *)dce < (char *)v6dce->valp + v6dce->length; + /* LINTED: (note 1) */ + dce = (dest_cache_entry_t *)((char *)dce + + ipDestEntrySize)) { + if (first) { + (void) putchar('\n'); + (void) puts("Destination Cache Entries: IPv6"); + (void) puts( + "Address PMTU " + " Age Flags If "); + (void) puts( + "--------------------------- ------ " + "----- ----- ---"); + first = B_FALSE; + } + + (void) printf("%-27s %6u %5u %-5s %s\n", + pr_addr6(&dce->DestIpv6Address, abuf, + sizeof (abuf)), + dce->DestPmtu, dce->DestAge, + dceflags2str(dce->DestFlags, flbuf), + dce->DestIfindex == 0 ? "" : + ifindex2str(dce->DestIfindex, ifname)); + } + } + (void) fflush(stdout); +} + /* --------------------- ARP_REPORT (netstat -p) -------------------------- */ static void @@ -3703,7 +3840,7 @@ arp_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3713,7 +3850,7 @@ arp_report(mib_item_t *item) if (!(item->group == MIB2_IP && item->mib_id == MIB2_IP_MEDIA)) continue; /* 'for' loop 1 */ - if (Dflag) + if (Xflag) (void) printf("%u records for " "ipNetToMediaEntryTable:\n", item->length/sizeof (mib2_ipNetToMediaEntry_t)); @@ -3798,7 +3935,7 @@ ndp_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3973,7 +4110,7 @@ ire_report(const mib_item_t *item) v4a = v4_attrs; v6a = v6_attrs; for (; item != NULL; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) 
printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3991,7 +4128,7 @@ ire_report(const mib_item_t *item) else if (item->group == MIB2_IP6 && !family_selected(AF_INET6)) continue; /* 'for' loop 1 */ - if (Dflag) { + if (Xflag) { if (item->group == MIB2_IP) { (void) printf("%u records for " "ipRouteEntryTable:\n", @@ -4161,29 +4298,29 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags) flag_b = FLF_U; (void) strcpy(flags, "U"); - if (rp->ipRouteInfo.re_ire_type == IRE_DEFAULT || - rp->ipRouteInfo.re_ire_type == IRE_PREFIX || - rp->ipRouteInfo.re_ire_type == IRE_HOST || - rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (rp->ipRouteInfo.re_flags & RTF_INDIRECT) { + (void) strcat(flags, "I"); + flag_b |= FLF_I; + } else if (rp->ipRouteInfo.re_ire_type & IRE_OFFLINK) { (void) strcat(flags, "G"); flag_b |= FLF_G; } - if (rp->ipRouteMask == IP_HOST_MASK) { + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (rp->ipRouteInfo.re_ire_type & IRE_IF_CLONE) { + (void) strcat(flags, "C"); + flag_b |= FLF_C; + } else if (rp->ipRouteMask == IP_HOST_MASK) { (void) strcat(flags, "H"); flag_b |= FLF_H; } - if (rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + if (rp->ipRouteInfo.re_flags & RTF_DYNAMIC) { (void) strcat(flags, "D"); flag_b |= FLF_D; } - if (rp->ipRouteInfo.re_ire_type == IRE_CACHE) { - /* Address resolution */ - (void) strcat(flags, "A"); - flag_b |= FLF_A; - } if (rp->ipRouteInfo.re_ire_type == IRE_BROADCAST) { /* Broadcast */ - (void) strcat(flags, "B"); - flag_b |= FLF_B; + (void) strcat(flags, "b"); + flag_b |= FLF_b; } if (rp->ipRouteInfo.re_ire_type == IRE_LOCAL) { /* Local */ (void) strcat(flags, "L"); @@ -4197,6 +4334,14 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags) (void) strcat(flags, "S"); /* Setsrc */ flag_b |= FLF_S; } + if (rp->ipRouteInfo.re_flags & RTF_REJECT) { + (void) strcat(flags, "R"); + flag_b |= FLF_R; + } 
+ if (rp->ipRouteInfo.re_flags & RTF_BLACKHOLE) { + (void) strcat(flags, "B"); + flag_b |= FLF_B; + } return (flag_b); } @@ -4205,9 +4350,9 @@ static const char ire_hdr_v4[] = static const char ire_hdr_v4_compat[] = "\n%s Table:\n"; static const char ire_hdr_v4_verbose[] = -" Destination Mask Gateway Device Mxfrg " -"Rtt Ref Flg Out In/Fwd %s\n" -"-------------------- --------------- -------------------- ------ ----- " +" Destination Mask Gateway Device " +" MTU Ref Flg Out In/Fwd %s\n" +"-------------------- --------------- -------------------- ------ " "----- --- --- ----- ------ %s\n"; static const char ire_hdr_v4_normal[] = @@ -4226,8 +4371,10 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first, char flags[10]; /* RTF_ flags */ uint_t flag_b; - if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_CACHE && + if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_IF_CLONE && rp->ipRouteInfo.re_ire_type != IRE_BROADCAST && + rp->ipRouteInfo.re_ire_type != IRE_MULTICAST && + rp->ipRouteInfo.re_ire_type != IRE_NOROUTE && rp->ipRouteInfo.re_ire_type != IRE_LOCAL))) { return (first); } @@ -4253,15 +4400,13 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first, dstbuf, sizeof (dstbuf)); } if (Vflag) { - (void) printf("%-20s %-15s %-20s %-6s %5u%c %4u %3u " + (void) printf("%-20s %-15s %-20s %-6s %5u %3u " "%-4s%6u %6u %s\n", dstbuf, pr_mask(rp->ipRouteMask, maskbuf, sizeof (maskbuf)), pr_addrnz(rp->ipRouteNextHop, gwbuf, sizeof (gwbuf)), octetstr(&rp->ipRouteIfIndex, 'a', ifname, sizeof (ifname)), rp->ipRouteInfo.re_max_frag, - rp->ipRouteInfo.re_frag_flag ? 
'*' : ' ', - rp->ipRouteInfo.re_rtt, rp->ipRouteInfo.re_ref, flags, rp->ipRouteInfo.re_obpkt, @@ -4391,58 +4536,39 @@ ire_filter_match_v6(const mib2_ipv6RouteEntry_t *rp6, uint_t flag_b) return (B_TRUE); } -static const char ire_hdr_v6[] = -"\n%s Table: IPv6\n"; -static const char ire_hdr_v6_verbose[] = -" Destination/Mask Gateway If PMTU Rtt " -"Ref Flags Out In/Fwd %s\n" -"--------------------------- --------------------------- ----- ------ ----- " -"--- ----- ------ ------ %s\n"; -static const char ire_hdr_v6_normal[] = -" Destination/Mask Gateway Flags Ref Use " -" If %s\n" -"--------------------------- --------------------------- ----- --- ------- " -"----- %s\n"; - -static boolean_t -ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, - const sec_attr_list_t *attrs) +/* + * Given an IPv6 MIB2 route entry, form the list of flags for the + * route. + */ +static uint_t +form_v6_route_flags(const mib2_ipv6RouteEntry_t *rp6, char *flags) { - char dstbuf[MAXHOSTNAMELEN + 1]; - char gwbuf[MAXHOSTNAMELEN + 1]; - char ifname[LIFNAMSIZ + 1]; - char flags[10]; /* RTF_ flags */ - uint_t flag_b; - - if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_CACHE && - rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) { - return (first); - } + uint_t flag_b; flag_b = FLF_U; (void) strcpy(flags, "U"); - if (rp6->ipv6RouteInfo.re_ire_type == IRE_DEFAULT || - rp6->ipv6RouteInfo.re_ire_type == IRE_PREFIX || - rp6->ipv6RouteInfo.re_ire_type == IRE_HOST || - rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (rp6->ipv6RouteInfo.re_flags & RTF_INDIRECT) { + (void) strcat(flags, "I"); + flag_b |= FLF_I; + } else if (rp6->ipv6RouteInfo.re_ire_type & IRE_OFFLINK) { (void) strcat(flags, "G"); flag_b |= FLF_G; } - if (rp6->ipv6RoutePfxLength == IPV6_ABITS) { + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (rp6->ipv6RouteInfo.re_ire_type & IRE_IF_CLONE) { + (void) strcat(flags, "C"); + 
flag_b |= FLF_C; + } else if (rp6->ipv6RoutePfxLength == IPV6_ABITS) { (void) strcat(flags, "H"); flag_b |= FLF_H; } - if (rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + if (rp6->ipv6RouteInfo.re_flags & RTF_DYNAMIC) { (void) strcat(flags, "D"); flag_b |= FLF_D; } - if (rp6->ipv6RouteInfo.re_ire_type == IRE_CACHE) { - /* Address resolution */ - (void) strcat(flags, "A"); - flag_b |= FLF_A; - } if (rp6->ipv6RouteInfo.re_ire_type == IRE_LOCAL) { /* Local */ (void) strcat(flags, "L"); flag_b |= FLF_L; @@ -4455,6 +4581,48 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, (void) strcat(flags, "S"); /* Setsrc */ flag_b |= FLF_S; } + if (rp6->ipv6RouteInfo.re_flags & RTF_REJECT) { + (void) strcat(flags, "R"); + flag_b |= FLF_R; + } + if (rp6->ipv6RouteInfo.re_flags & RTF_BLACKHOLE) { + (void) strcat(flags, "B"); + flag_b |= FLF_B; + } + return (flag_b); +} + +static const char ire_hdr_v6[] = +"\n%s Table: IPv6\n"; +static const char ire_hdr_v6_verbose[] = +" Destination/Mask Gateway If MTU " +"Ref Flags Out In/Fwd %s\n" +"--------------------------- --------------------------- ----- ----- " +"--- ----- ------ ------ %s\n"; +static const char ire_hdr_v6_normal[] = +" Destination/Mask Gateway Flags Ref Use " +" If %s\n" +"--------------------------- --------------------------- ----- --- ------- " +"----- %s\n"; + +static boolean_t +ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, + const sec_attr_list_t *attrs) +{ + char dstbuf[MAXHOSTNAMELEN + 1]; + char gwbuf[MAXHOSTNAMELEN + 1]; + char ifname[LIFNAMSIZ + 1]; + char flags[10]; /* RTF_ flags */ + uint_t flag_b; + + if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_IF_CLONE && + rp6->ipv6RouteInfo.re_ire_type != IRE_MULTICAST && + rp6->ipv6RouteInfo.re_ire_type != IRE_NOROUTE && + rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) { + return (first); + } + + flag_b = form_v6_route_flags(rp6, flags); if (!ire_filter_match_v6(rp6, flag_b)) return (first); @@ -4468,7 
+4636,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, } if (Vflag) { - (void) printf("%-27s %-27s %-5s %5u%c %5u %3u " + (void) printf("%-27s %-27s %-5s %5u %3u " "%-5s %6u %6u %s\n", pr_prefix6(&rp6->ipv6RouteDest, rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), @@ -4478,8 +4646,6 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, octetstr(&rp6->ipv6RouteIfIndex, 'a', ifname, sizeof (ifname)), rp6->ipv6RouteInfo.re_max_frag, - rp6->ipv6RouteInfo.re_frag_flag ? '*' : ' ', - rp6->ipv6RouteInfo.re_rtt, rp6->ipv6RouteInfo.re_ref, flags, rp6->ipv6RouteInfo.re_obpkt, @@ -4617,7 +4783,7 @@ tcp_report(const mib_item_t *item) v4a = v4_attrs; v6a = v6_attrs; for (; item != NULL; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -4841,7 +5007,7 @@ udp_report(const mib_item_t *item) v6a = v6_attrs; /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -4916,10 +5082,7 @@ udp_report_item_v4(const mib2_udpEntry_t *ude, boolean_t first, "", miudp_state(ude->udpEntryInfo.ue_state, attr)); - /* - * UDP sockets don't have remote attributes, so there's no need to - * print them here. - */ + print_transport_label(attr); return (first); } @@ -4956,10 +5119,7 @@ udp_report_item_v6(const mib2_udp6Entry_t *ude6, boolean_t first, miudp_state(ude6->udp6EntryInfo.ue_state, attr), ifnamep == NULL ? "" : ifnamep); - /* - * UDP sockets don't have remote attributes, so there's no need to - * print them here. 
- */ + print_transport_label(attr); return (first); } @@ -5321,7 +5481,7 @@ mrt_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -5334,7 +5494,7 @@ mrt_report(mib_item_t *item) switch (item->mib_id) { case EXPER_DVMRP_VIF: - if (Dflag) + if (Xflag) (void) printf("%u records for ipVifTable:\n", item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { @@ -5377,7 +5537,7 @@ mrt_report(mib_item_t *item) break; case EXPER_DVMRP_MRT: - if (Dflag) + if (Xflag) (void) printf("%u records for ipMfcTable:\n", item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c index 28416c4d7f..c0621996d3 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c @@ -2875,7 +2875,7 @@ mibwalk(void (*proc)(mib_item_t *)) * us information concerning IRE_MARK_TESTHIDDEN routes. */ req = (struct opthdr *)&tor[1]; - req->level = EXPER_IP_AND_TESTHIDDEN; + req->level = EXPER_IP_AND_ALL_IRES; req->name = 0; req->len = 0; diff --git a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c index b76341e303..2cea11b454 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c +++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c @@ -407,6 +407,15 @@ select_src_ifi_info_solaris(int sockfd, int numifs, if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE)) continue; + /* A DHCP client will have IFF_UP set yet the address is zero. 
Ignore */ + if (lifr->lifr_addr.ss_family == AF_INET) { + struct sockaddr_in *sinptr; + + sinptr = (struct sockaddr_in *) &lifr->lifr_addr; + if (sinptr->sin_addr.s_addr == INADDR_ANY) + continue; + } + if (*best_lifr != NULL) { /* * Check if we found a better interface by checking diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c index 506b15a307..868f9ab5e2 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c @@ -3541,18 +3541,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) Perror2_exit("I_PUSH", IP_MOD_NAME); /* - * Push the ARP module onto the interface stream. IP uses - * this to send resolution requests up to ARP. We need to - * do this before the SLIFNAME ioctl is sent down because - * the interface becomes publicly known as soon as the SLIFNAME - * ioctl completes. Thus some other process trying to bring up - * the interface after SLIFNAME but before we have pushed ARP - * could hang. We pop the module again later if it is not needed. - */ - if (ioctl(ip_fd, I_PUSH, ARP_MOD_NAME) == -1) - Perror2_exit("I_PUSH", ARP_MOD_NAME); - - /* * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME. * (At this point in time the kernel also allows an override of the * IFF_CANTCHANGE flags.) @@ -3679,12 +3667,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) (void) putchar('\n'); } - /* Check if arp is not actually needed */ - if (lifr.lifr_flags & (IFF_NOARP|IFF_IPV6)) { - if (ioctl(ip_fd, I_POP, 0) == -1) - Perror2_exit("I_POP", ARP_MOD_NAME); - } - /* * Open "/dev/udp" for use as a multiplexor to PLINK the * interface stream under. 
We use "/dev/udp" instead of "/dev/ip" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 2a4ff60d57..d851dce613 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -159,6 +159,7 @@ static int moptions; /* multicast options */ int npackets; /* number of packets to send */ static ushort_t tos; /* type-of-service value */ static int hoplimit = -1; /* time-to-live value */ +static int dontfrag; /* IP*_DONTFRAG */ static int timeout = TIMEOUT; /* timeout value (sec) for probes */ static struct if_entry out_if; /* interface argument */ int ident; /* ID for this ping run */ @@ -268,7 +269,7 @@ main(int argc, char *argv[]) setbuf(stdout, (char *)0); while ((c = getopt(argc, argv, - "abA:c:dF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) { + "abA:c:dDF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) { switch ((char)c) { case 'A': if (strcmp(optarg, "inet") == 0) { @@ -301,6 +302,10 @@ main(int argc, char *argv[]) options |= SO_DEBUG; break; + case 'D': + dontfrag = 1; + break; + case 'b': bypass = _B_TRUE; break; @@ -1303,8 +1308,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } - if (nexthop != NULL && !use_udp) - set_nexthop(family, ai_nexthop, recv_sock); /* * We always receive on raw icmp socket. But the sending socket can be * raw icmp or udp, depending on the use of -U flag. @@ -1332,9 +1335,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } - if (nexthop != NULL) - set_nexthop(family, ai_nexthop, send_sock); - /* * In order to distinguish replies to our UDP probes from * other pings', we need to know our source port number. 
@@ -1368,6 +1368,9 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, send_sock = recv_sock; } + if (nexthop != NULL) + set_nexthop(family, ai_nexthop, send_sock); + int_op = 48 * 1024; if (int_op < datalen) int_op = datalen; @@ -1431,6 +1434,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, if (moptions & MULTICAST_TTL) { char_op = hoplimit; + /* Applies to unicast and multicast. */ if (family == AF_INET) { if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_TTL, (char *)&char_op, sizeof (char)) == -1) { @@ -1454,7 +1458,10 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, */ } - /* did the user specify an interface? */ + /* + * did the user specify an interface? + * Applies to unicast, broadcast and multicast. + */ if (moptions & MULTICAST_IF) { struct ifaddrlist *al = NULL; /* interface list */ struct ifaddrlist *my_if; @@ -1496,6 +1503,8 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } if (family == AF_INET) { + struct in_pktinfo pktinfo; + if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_IF, (char *)&my_if->addr.addr, sizeof (struct in_addr)) == -1) { @@ -1504,6 +1513,15 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, strerror(errno)); exit(EXIT_FAILURE); } + bzero(&pktinfo, sizeof (pktinfo)); + pktinfo.ipi_ifindex = my_if->index; + if (setsockopt(send_sock, IPPROTO_IP, IP_PKTINFO, + (char *)&pktinfo, sizeof (pktinfo)) == -1) { + Fprintf(stderr, "%s: setsockopt " + "IP_PKTINFO %s\n", progname, + strerror(errno)); + exit(EXIT_FAILURE); + } } else { /* * the outgoing interface is set in set_ancillary_data() @@ -1525,6 +1543,23 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } + /* We enable or disable to not depend on the kernel default */ + if (family == AF_INET) { + if (setsockopt(send_sock, IPPROTO_IP, IP_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: 
setsockopt IP_DONTFRAG %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); + } + } else { + if (setsockopt(send_sock, IPPROTO_IPV6, IPV6_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: setsockopt IPV6_DONTFRAG %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); + } + } + /* receiving IPv6 extension headers in verbose mode */ if (verbose && family == AF_INET6) { if (setsockopt(recv_sock, IPPROTO_IPV6, IPV6_RECVHOPOPTS, @@ -2336,7 +2371,7 @@ usage(char *cmdname) Fprintf(stderr, "usage: %s host [timeout]\n", cmdname); Fprintf(stderr, /* CSTYLED */ -"usage: %s -s [-l | U] [abdLnRrv] [-A addr_family] [-c traffic_class]\n\t" +"usage: %s -s [-l | U] [abdDLnRrv] [-A addr_family] [-c traffic_class]\n\t" "[-g gateway [-g gateway ...]] [-N nexthop] [-F flow_label] [-I interval]\n\t" "[-i interface] [-P tos] [-p port] [-t ttl] host [data_size] [npackets]\n", cmdname); diff --git a/usr/src/cmd/cmd-inet/usr.sbin/route.c b/usr/src/cmd/cmd-inet/usr.sbin/route.c index b4b16d6755..aedef45409 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/route.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/route.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,8 +45,6 @@ * @(#)linkaddr.c 8.1 (Berkeley) 6/4/93 */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/file.h> #include <sys/socket.h> @@ -175,6 +173,8 @@ static struct keytab { {"show", K_SHOW}, #define K_SECATTR 43 {"secattr", K_SECATTR}, +#define K_INDIRECT 44 + {"indirect", K_INDIRECT}, {0, 0} }; @@ -655,7 +655,7 @@ flushroutes(int argc, char *argv[]) (char *)rp < (char *)item->valp + item->length; /* LINTED */ rp = (mib2_ipRouteEntry_t *) - ((char *)rp + ipRouteEntrySize)) { + ((char *)rp + ipRouteEntrySize)) { delRouteEntry(rp, NULL, seqno); seqno++; } @@ -670,7 +670,7 @@ flushroutes(int argc, char *argv[]) if (item->group == MIB2_IP6) { ipv6RouteEntrySize = ((mib2_ipv6IfStatsEntry_t *)item->valp)-> - ipv6RouteEntrySize; + ipv6RouteEntrySize; assert(IS_P2ALIGNED(ipv6RouteEntrySize, sizeof (mib2_ipv6RouteEntry_t *))); break; @@ -692,7 +692,7 @@ flushroutes(int argc, char *argv[]) (char *)rp6 < (char *)item->valp + item->length; /* LINTED */ rp6 = (mib2_ipv6RouteEntry_t *) - ((char *)rp6 + ipv6RouteEntrySize)) { + ((char *)rp6 + ipv6RouteEntrySize)) { delRouteEntry(NULL, rp6, seqno); seqno++; } @@ -812,7 +812,7 @@ delRouteEntry(mib2_ipRouteEntry_t *rp, mib2_ipv6RouteEntry_t *rp6, int seqno) (void) printf("%-20.20s ", rtm->rtm_flags & RTF_HOST ? 
routename(sa) : - netname(sa)); + netname(sa)); /* LINTED */ sa = (struct sockaddr *)(salen(sa) + (char *)sa); (void) printf("%-20.20s ", routename(sa)); @@ -861,7 +861,7 @@ routename(const struct sockaddr *sa) cp = "default"; if (cp == NULL && !nflag) { hp = gethostbyaddr((char *)&in, sizeof (struct in_addr), - AF_INET); + AF_INET); if (hp != NULL) { if (((cp = strchr(hp->h_name, '.')) != NULL) && (strcmp(cp + 1, domain) == 0)) @@ -892,7 +892,7 @@ routename(const struct sockaddr *sa) cp = "default"; if (cp == NULL && !nflag) { hp = getipnodebyaddr((char *)&in6, - sizeof (struct in6_addr), AF_INET6, &error_num); + sizeof (struct in6_addr), AF_INET6, &error_num); if (hp != NULL) { if (((cp = strchr(hp->h_name, '.')) != NULL) && (strcmp(cp + 1, domain) == 0)) @@ -1120,8 +1120,8 @@ print_rtcmd_short(FILE *to, rtcmd_irep_t *rcip, boolean_t gw_good, break; case AF_INET6: if (inet_ntop(AF_INET6, - &rcip->ri_gate.sin6.sin6_addr, obuf, - INET6_ADDRSTRLEN) != NULL) { + &rcip->ri_gate.sin6.sin6_addr, obuf, + INET6_ADDRSTRLEN) != NULL) { if (nflag) { (void) fprintf(to, ": gateway %s", obuf); @@ -1405,6 +1405,9 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string) return (B_FALSE); } break; + case K_INDIRECT: + rcip->ri_flags |= RTF_INDIRECT; + break; default: if (dash_keyword) { syntax_bad_keyword(tok + 1); @@ -1479,8 +1482,8 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string) } if (rcip->ri_af == AF_INET6 && memcmp(&rcip->ri_mask.sin6.sin6_addr, - &in6_host_mask, - sizeof (struct in6_addr)) == 0) { + &in6_host_mask, + sizeof (struct in6_addr)) == 0) { rcip->ri_flags |= RTF_HOST; } } else { @@ -1853,8 +1856,8 @@ newroute(char **argv) break; case AF_INET6: if (inet_ntop(AF_INET6, - (void *)&newrt->ri_dst.sin6.sin6_addr, - obuf, INET6_ADDRSTRLEN) != NULL) { + (void *)&newrt->ri_dst.sin6.sin6_addr, + obuf, INET6_ADDRSTRLEN) != NULL) { (void) printf(" %s", obuf); break; } @@ -2236,7 +2239,7 @@ in_getaddr(char *s, struct sockaddr_in *sin, int *plenp, 
int which, inet_lnaof(sin->sin_addr) == INADDR_ANY)) { /* This looks like a network address. */ inet_makenetandmask(rcip, ntohl(val), - sin); + sin); } } return (B_TRUE); @@ -2562,7 +2565,7 @@ static char metricnames[] = static char routeflags[] = "\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT" "\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE" - "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC"; + "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC\023INDIRECT"; static char ifnetflags[] = "\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6NOTRAILERS\7RUNNING\010NOARP" "\011PPROMISC\012ALLMULTI\013INTELLIGENT\014MULTICAST" @@ -2623,7 +2626,7 @@ print_rtmsg(struct rt_msghdr *rtm, int msglen) break; default: (void) printf("pid: %ld, seq %d, errno %d, flags:", - rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno); + rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno); bprintf(stdout, rtm->rtm_flags, routeflags); pmsg_common(rtm, msglen); break; @@ -2649,7 +2652,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen) if (rtm->rtm_msglen > (ushort_t)msglen) { (void) fprintf(stderr, gettext("message length mismatch, in packet %d, " - "returned %d\n"), rtm->rtm_msglen, msglen); + "returned %d\n"), rtm->rtm_msglen, msglen); } if (rtm->rtm_errno) { (void) fprintf(stderr, "RTM_GET: %s (errno %d)\n", @@ -2675,7 +2678,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen) case RTA_IFP: if (sa->sa_family == AF_LINK && ((struct sockaddr_dl *)sa)-> - sdl_nlen != 0) + sdl_nlen != 0) ifp = (struct sockaddr_dl *)sa; break; case RTA_SRC: @@ -3122,8 +3125,8 @@ mibget(int sd) (void) fprintf(stderr, gettext("mibget %d gives " "T_ERROR_ACK: TLI_error = 0x%lx, UNIX_error = " "0x%lx\n"), j, tea->TLI_error, tea->UNIX_error); - errno = (tea->TLI_error == TSYSERR) - ? tea->UNIX_error : EPROTO; + errno = (tea->TLI_error == TSYSERR) ? 
+ tea->UNIX_error : EPROTO; break; } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c index cae75df60d..b8b56259ad 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c @@ -166,6 +166,7 @@ boolean_t useicmp = _B_FALSE; /* use icmp echo instead of udp packets */ boolean_t docksum = _B_TRUE; /* calculate checksums */ static boolean_t collect_stat = _B_FALSE; /* print statistics */ boolean_t settos = _B_FALSE; /* set type-of-service field */ +int dontfrag = 0; /* IP*_DONTFRAG */ static int max_timeout = 5; /* quit after this consecutive timeouts */ static boolean_t probe_all = _B_FALSE; /* probe all the IFs of the target */ static boolean_t pick_src = _B_FALSE; /* traceroute picks the src address */ @@ -315,6 +316,7 @@ main(int argc, char **argv) case 'F': off = IP_DF; + dontfrag = 1; break; case 'g': @@ -1361,6 +1363,24 @@ setup_socket(struct pr_set *pr, int packet_len) exit(EXIT_FAILURE); } } + + /* We enable or disable to not depend on the kernel default */ + if (pr->family == AF_INET) { + if (setsockopt(ssock, IPPROTO_IP, IP_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: IP_DONTFRAG %s\n", prog, + strerror(errno)); + exit(EXIT_FAILURE); + } + } else { + if (setsockopt(ssock, IPPROTO_IPV6, IPV6_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: IPV6_DONTFRAG %s\n", prog, + strerror(errno)); + exit(EXIT_FAILURE); + } + } + if (pr->family == AF_INET) { rcvsock4 = rsock; sndsock4 = ssock; diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index 222699e479..84cdb42377 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -104,8 +104,7 @@ static devfsadm_create_t misc_cbt[] = { "(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|" "(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|" 
"(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|" - "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^iptunq)|" - "(^bpf$)", + "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^bpf$)", TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name }, { "pseudo", "ddi_pseudo", diff --git a/usr/src/cmd/mdb/common/modules/arp/arp.c b/usr/src/cmd/mdb/common/modules/arp/arp.c index f36a81170e..f97cdaab42 100644 --- a/usr/src/cmd/mdb/common/modules/arp/arp.c +++ b/usr/src/cmd/mdb/common/modules/arp/arp.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <sys/types.h> #include <sys/stropts.h> @@ -36,7 +34,6 @@ #include <inet/common.h> #include <inet/mi.h> #include <inet/arp.h> -#include <inet/arp_impl.h> #include <inet/ip.h> #include <netinet/arp.h> @@ -50,541 +47,10 @@ typedef struct { } arp_cmd_tbl; /* - * Table of ARP commands and structure types used for messages between ARP and - * IP. 
- */ -static const arp_cmd_tbl act_list[] = { - { AR_ENTRY_ADD, "AR_ENTRY_ADD", "arp`area_t" }, - { AR_ENTRY_DELETE, "AR_ENTRY_DELETE", "arp`ared_t" }, - { AR_ENTRY_QUERY, "AR_ENTRY_QUERY", "arp`areq_t" }, - { AR_ENTRY_SQUERY, "AR_ENTRY_SQUERY", "arp`area_t" }, - { AR_MAPPING_ADD, "AR_MAPPING_ADD", "arp`arma_t" }, - { AR_CLIENT_NOTIFY, "AR_CLIENT_NOTIFY", "arp`arcn_t" }, - { AR_INTERFACE_UP, "AR_INTERFACE_UP", "arp`arc_t" }, - { AR_INTERFACE_DOWN, "AR_INTERFACE_DOWN", "arp`arc_t" }, - { AR_INTERFACE_ON, "AR_INTERFACE_ON", "arp`arc_t" }, - { AR_INTERFACE_OFF, "AR_INTERFACE_OFF", "arp`arc_t" }, - { AR_DLPIOP_DONE, "AR_DLPIOP_DONE", "arp`arc_t" }, - { AR_ARP_CLOSING, "AR_ARP_CLOSING", "arp`arc_t" }, - { AR_ARP_EXTEND, "AR_ARP_EXTEND", "arp`arc_t" }, - { 0, "unknown command", "arp`arc_t" } -}; - -/* - * State information kept during walk over ACE hash table and unhashed mask - * list. - */ -typedef struct ace_walk_data { - ace_t *awd_hash_tbl[ARP_HASH_SIZE]; - ace_t *awd_masks; - int awd_idx; -} ace_walk_data_t; - -/* - * Given the kernel address of an arl_t, return the stackid + * removed all the ace/arl related stuff. The only thing that remains + * is code for dealing with ioctls and printing out arp header that + * should probably be moved into the ip/mdb module. 
*/ -static int -arl_to_stackid(uintptr_t addr) -{ - arl_t arl; - queue_t rq; - ar_t ar; - arp_stack_t ass; - netstack_t nss; - - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t %p", addr); - return (0); - } - - addr = (uintptr_t)arl.arl_rq; - if (mdb_vread(&rq, sizeof (rq), addr) == -1) { - mdb_warn("failed to read queue_t %p", addr); - return (0); - } - - addr = (uintptr_t)rq.q_ptr; - if (mdb_vread(&ar, sizeof (ar), addr) == -1) { - mdb_warn("failed to read ar_t %p", addr); - return (0); - } - - addr = (uintptr_t)ar.ar_as; - if (mdb_vread(&ass, sizeof (ass), addr) == -1) { - mdb_warn("failed to read arp_stack_t %p", addr); - return (0); - } - addr = (uintptr_t)ass.as_netstack; - if (mdb_vread(&nss, sizeof (nss), addr) == -1) { - mdb_warn("failed to read netstack_t %p", addr); - return (0); - } - return (nss.netstack_stackid); -} - -static int -arp_stacks_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("netstack", wsp) == -1) { - mdb_warn("can't walk 'netstack'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -static int -arp_stacks_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - netstack_t nss; - - if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) { - mdb_warn("can't read netstack at %p", wsp->walk_addr); - return (WALK_ERR); - } - addr = (uintptr_t)nss.netstack_modules[NS_ARP]; - - return (wsp->walk_callback(addr, wsp->walk_layer, wsp->walk_cbdata)); -} - -static int -arl_stack_walk_init(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - - if (wsp->walk_addr == NULL) { - mdb_warn("arl_stack supports only local walks\n"); - return (WALK_ERR); - } - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_arl_head); - if (mdb_vread(&wsp->walk_addr, sizeof (wsp->walk_addr), - addr) == -1) { - mdb_warn("failed to read 'arl_g_head'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -static int -arl_stack_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr = wsp->walk_addr; - arl_t arl; - - if (wsp->walk_addr 
== NULL) - return (WALK_DONE); - - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t at %p", addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)arl.arl_next; - - return ((*wsp->walk_callback)(addr, &arl, wsp->walk_cbdata)); -} - -static int -arl_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("arp_stacks", wsp) == -1) { - mdb_warn("can't walk 'arp_stacks'"); - return (WALK_ERR); - } - - return (WALK_NEXT); -} - -static int -arl_walk_step(mdb_walk_state_t *wsp) -{ - if (mdb_pwalk("arl_stack", wsp->walk_callback, - wsp->walk_cbdata, wsp->walk_addr) == -1) { - mdb_warn("couldn't walk 'arl_stack' at %p", wsp->walk_addr); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -/* - * Called with walk_addr being the address of arp_stack_t - */ -static int -ace_stack_walk_init(mdb_walk_state_t *wsp) -{ - ace_walk_data_t *aw; - uintptr_t addr; - - if (wsp->walk_addr == NULL) { - mdb_warn("ace_stack supports only local walks\n"); - return (WALK_ERR); - } - - aw = mdb_alloc(sizeof (ace_walk_data_t), UM_SLEEP); - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_hash_tbl); - if (mdb_vread(aw->awd_hash_tbl, sizeof (aw->awd_hash_tbl), - addr) == -1) { - mdb_warn("failed to read 'as_ce_hash_tbl'"); - mdb_free(aw, sizeof (ace_walk_data_t)); - return (WALK_ERR); - } - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_mask_entries); - if (mdb_vread(&aw->awd_masks, sizeof (aw->awd_masks), - addr) == -1) { - mdb_warn("failed to read 'as_ce_mask_entries'"); - mdb_free(aw, sizeof (ace_walk_data_t)); - return (WALK_ERR); - } - - /* The step routine will start off by incrementing to index 0 */ - aw->awd_idx = -1; - wsp->walk_addr = 0; - wsp->walk_data = aw; - - return (WALK_NEXT); -} - -static int -ace_stack_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - ace_walk_data_t *aw = wsp->walk_data; - ace_t ace; - - /* - * If we're at the end of the previous list, then find the start of the - * next list to process. 
- */ - while (wsp->walk_addr == NULL) { - if (aw->awd_idx == ARP_HASH_SIZE) - return (WALK_DONE); - if (++aw->awd_idx == ARP_HASH_SIZE) { - wsp->walk_addr = (uintptr_t)aw->awd_masks; - } else { - wsp->walk_addr = - (uintptr_t)aw->awd_hash_tbl[aw->awd_idx]; - } - } - - addr = wsp->walk_addr; - if (mdb_vread(&ace, sizeof (ace), addr) == -1) { - mdb_warn("failed to read ace_t at %p", addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)ace.ace_next; - - return (wsp->walk_callback(addr, &ace, wsp->walk_cbdata)); -} - -static void -ace_stack_walk_fini(mdb_walk_state_t *wsp) -{ - mdb_free(wsp->walk_data, sizeof (ace_walk_data_t)); -} - -static int -ace_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("arp_stacks", wsp) == -1) { - mdb_warn("can't walk 'arp_stacks'"); - return (WALK_ERR); - } - - return (WALK_NEXT); -} - -static int -ace_walk_step(mdb_walk_state_t *wsp) -{ - if (mdb_pwalk("ace_stack", wsp->walk_callback, - wsp->walk_cbdata, wsp->walk_addr) == -1) { - mdb_warn("couldn't walk 'ace_stack' at %p", wsp->walk_addr); - return (WALK_ERR); - } - return (WALK_NEXT); -} - - -/* Common routine to produce an 'ar' text description */ -static void -ar_describe(const ar_t *ar, char *buf, size_t nbytes, boolean_t addmac) -{ - if (ar->ar_arl == NULL) { - queue_t wq, ipq; - ill_t ill; - char name[LIFNAMSIZ]; - GElf_Sym sym; - boolean_t nextip; - - if (mdb_vread(&wq, sizeof (wq), (uintptr_t)ar->ar_wq) == -1 || - mdb_vread(&ipq, sizeof (ipq), (uintptr_t)wq.q_next) == -1) - return; - - nextip = - (mdb_lookup_by_obj("ip", "ipwinit", &sym) == 0 && - (uintptr_t)sym.st_value == (uintptr_t)ipq.q_qinfo); - - if (!ar->ar_on_ill_stream) { - (void) strcpy(buf, nextip ? 
"Client" : "Unknown"); - return; - } - - if (!nextip || - mdb_vread(&ill, sizeof (ill), (uintptr_t)ipq.q_ptr) == -1 || - mdb_readstr(name, sizeof (name), - (uintptr_t)ill.ill_name) == -1) { - return; - } - (void) mdb_snprintf(buf, nbytes, "IP %s", name); - } else { - arl_t arl; - arlphy_t ap; - ssize_t retv; - uint32_t alen; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - - if (mdb_vread(&arl, sizeof (arl), (uintptr_t)ar->ar_arl) == -1) - return; - retv = mdb_snprintf(buf, nbytes, "ARP %s ", arl.arl_name); - if (retv >= nbytes || !addmac) - return; - if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl.arl_phy) == -1) - return; - alen = ap.ap_hw_addrlen; - if (ap.ap_hw_addr == NULL || alen == 0 || - alen > sizeof (macaddr)) - return; - if (mdb_vread(macaddr, alen, (uintptr_t)ap.ap_hw_addr) == -1) - return; - mdb_mac_addr(macaddr, alen, buf + retv, nbytes - retv); - } -} - -/* ARGSUSED2 */ -static int -ar_cb(uintptr_t addr, const void *arptr, void *dummy) -{ - const ar_t *ar = arptr; - char ardesc[sizeof ("ARP ") + LIFNAMSIZ]; - - ar_describe(ar, ardesc, sizeof (ardesc), B_FALSE); - mdb_printf("%?p %?p %?p %s\n", addr, ar->ar_wq, ar->ar_arl, ardesc); - return (WALK_NEXT); -} - -/* - * Print out ARP client structures. 
- */ -/* ARGSUSED2 */ -static int -ar_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - ar_t ar; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %?s %?s %s%</u>\n", - "AR", "WQ", "ARL", "TYPE"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&ar, sizeof (ar), addr) == -1) { - mdb_warn("failed to read ar_t at %p", addr); - return (DCMD_ERR); - } - (void) ar_cb(addr, &ar, NULL); - } else { - if (mdb_walk("ar", ar_cb, NULL) == -1) { - mdb_warn("cannot walk ar_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} - -/* ARGSUSED2 */ -static int -arl_cb(uintptr_t addr, const void *arlptr, void *dummy) -{ - const arl_t *arl = arlptr; - arlphy_t ap; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - char macstr[ARP_MAX_ADDR_LEN*3]; - char flags[4]; - const char *primstr; - - mdb_printf("%?p ", addr); - if (arl->arl_dlpi_pending == DL_PRIM_INVAL) - mdb_printf("%16s", "--"); - else if ((primstr = mdb_dlpi_prim(arl->arl_dlpi_pending)) != NULL) - mdb_printf("%16s", primstr); - else - mdb_printf("%16x", arl->arl_dlpi_pending); - - if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl->arl_phy) == -1 || - ap.ap_hw_addrlen == 0 || ap.ap_hw_addrlen > sizeof (macaddr)) { - (void) strcpy(macstr, "--"); - } else if (mdb_vread(macaddr, ap.ap_hw_addrlen, - (uintptr_t)ap.ap_hw_addr) == -1) { - (void) strcpy(macstr, "?"); - } else { - mdb_mac_addr(macaddr, ap.ap_hw_addrlen, macstr, - sizeof (macstr)); - } - - /* Print both the link-layer state and the NOARP flag */ - flags[0] = '\0'; - if (arl->arl_flags & ARL_F_NOARP) - (void) strcat(flags, "N"); - switch (arl->arl_state) { - case ARL_S_DOWN: - (void) strcat(flags, "d"); - break; - case ARL_S_PENDING: - (void) strcat(flags, "P"); - break; - case ARL_S_UP: - (void) strcat(flags, "U"); - break; - default: - (void) strcat(flags, "?"); - break; - } - mdb_printf(" %8d %-3s %-9s %-17s %5d\n", - mdb_mblk_count(arl->arl_dlpi_deferred), flags, arl->arl_name, - macstr, 
arl_to_stackid((uintptr_t)addr)); - return (WALK_NEXT); -} - -/* - * Print out ARP link-layer elements. - */ -/* ARGSUSED2 */ -static int -arl_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - arl_t arl; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %16s %8s %3s %9s %-17s %5s%</u>\n", - "ARL", "DLPI REQ", "DLPI CNT", "FLG", "INTERFACE", - "HWADDR", "STACK"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t at %p", addr); - return (DCMD_ERR); - } - (void) arl_cb(addr, &arl, NULL); - } else { - if (mdb_walk("arl", arl_cb, NULL) == -1) { - mdb_warn("cannot walk arl_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} - -/* ARGSUSED2 */ -static int -ace_cb(uintptr_t addr, const void *aceptr, void *dummy) -{ - const ace_t *ace = aceptr; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - char macstr[ARP_MAX_ADDR_LEN*3]; - /* The %b format isn't compact enough for long listings */ - static const char ace_flags[] = "SPDRMLdA ofya"; - const char *cp; - char flags[sizeof (ace_flags)], *fp; - int flg; - in_addr_t inaddr, mask; - char addrstr[sizeof ("255.255.255.255/32")]; - - /* Walk the list of flags and produce a string */ - cp = ace_flags; - fp = flags; - for (flg = 1; *cp != '\0'; flg <<= 1, cp++) { - if ((flg & ace->ace_flags) && *cp != ' ') - *fp++ = *cp; - } - *fp = '\0'; - - /* If it's not resolved, then it has no hardware address */ - if (!(ace->ace_flags & ACE_F_RESOLVED) || - ace->ace_hw_addr_length == 0 || - ace->ace_hw_addr_length > sizeof (macaddr)) { - (void) strcpy(macstr, "--"); - } else if (mdb_vread(macaddr, ace->ace_hw_addr_length, - (uintptr_t)ace->ace_hw_addr) == -1) { - (void) strcpy(macstr, "?"); - } else { - mdb_mac_addr(macaddr, ace->ace_hw_addr_length, macstr, - sizeof (macstr)); - } - - /* - * Nothing other than IP uses ARP these days, so we don't try very hard - * here to switch out on ARP protocol type. 
(Note that ARP protocol - * types are roughly Ethertypes, but are allocated separately at IANA.) - */ - if (ace->ace_proto != IP_ARP_PROTO_TYPE) { - (void) mdb_snprintf(addrstr, sizeof (addrstr), - "Unknown proto %x", ace->ace_proto); - } else if (mdb_vread(&inaddr, sizeof (inaddr), - (uintptr_t)ace->ace_proto_addr) != -1 && - mdb_vread(&mask, sizeof (mask), (uintptr_t)ace->ace_proto_mask) != - -1) { - /* - * If it's the standard host mask, then print it normally. - * Otherwise, use "/n" notation. - */ - if (mask == (in_addr_t)~0) { - (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I", - inaddr); - } else { - (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I/%d", - inaddr, mask == 0 ? 0 : 33 - mdb_ffs(mask)); - } - } else { - (void) strcpy(addrstr, "?"); - } - mdb_printf("%?p %-18s %-8s %-17s %5d\n", addr, addrstr, flags, - macstr, arl_to_stackid((uintptr_t)ace->ace_arl)); - return (WALK_NEXT); -} - -/* - * Print out ARP cache entry (ace_t) elements. - */ -/* ARGSUSED2 */ -static int -ace_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - ace_t ace; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %-18s %-8s %-17s %5s%</u>\n", - "ACE", "PROTOADDR", "FLAGS", "HWADDR", "STACK"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&ace, sizeof (ace), addr) == -1) { - mdb_warn("failed to read ace_t at %p", addr); - return (DCMD_ERR); - } - (void) ace_cb(addr, &ace, NULL); - } else { - if (mdb_walk("ace", ace_cb, NULL) == -1) { - mdb_warn("cannot walk ace_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} /* * Print an ARP hardware and protocol address pair; used when printing an ARP @@ -696,148 +162,25 @@ arphdr_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } -/* - * Print out an arp command formatted in a reasonable manner. This implements - * the type switch used by ARP. 
- * - * It could also dump the data that follows the header (using offset and length - * in the various structures), but it currently does not. - */ -/* ARGSUSED2 */ -static int -arpcmd_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - arc_t arc; - const arp_cmd_tbl *tp; - mdb_arg_t subargv; - - if (!(flags & DCMD_ADDRSPEC)) { - mdb_warn("address required to print ARP command\n"); - return (DCMD_ERR); - } - if (mdb_vread(&arc, sizeof (arc), addr) == -1) { - mdb_warn("unable to read arc_t at %p", addr); - return (DCMD_ERR); - } - for (tp = act_list; tp->act_cmd != 0; tp++) - if (tp->act_cmd == arc.arc_cmd) - break; - mdb_printf("%p %s (%s) = ", addr, tp->act_name, tp->act_type); - subargv.a_type = MDB_TYPE_STRING; - subargv.a_un.a_str = tp->act_type; - if (mdb_call_dcmd("print", addr, DCMD_ADDRSPEC, 1, &subargv) == -1) - return (DCMD_ERR); - else - return (DCMD_OK); -} - -static size_t -mi_osize(const queue_t *q) -{ - /* - * The code in common/inet/mi.c allocates an extra word to store the - * size of the allocation. An mi_o_s is thus a size_t plus an mi_o_s. - */ - struct mi_block { - size_t mi_nbytes; - struct mi_o_s mi_o; - } m; - - if (mdb_vread(&m, sizeof (m), (uintptr_t)q->q_ptr - sizeof (m)) != -1) - return (m.mi_nbytes - sizeof (m)); - - return (0); -} - -/* - * This is called when ::stream is used and an ARP module is seen on the - * stream. Determine what sort of ARP usage is involved and show an - * appropriate message. 
- */ -static void -arp_qinfo(const queue_t *qp, char *buf, size_t nbytes) -{ - size_t size = mi_osize(qp); - ar_t ar; - - if (size != sizeof (ar_t)) - return; - if (mdb_vread(&ar, sizeof (ar), (uintptr_t)qp->q_ptr) == -1) - return; - ar_describe(&ar, buf, nbytes, B_TRUE); -} - -static uintptr_t -arp_rnext(const queue_t *q) -{ - size_t size = mi_osize(q); - ar_t ar; - - if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar), - (uintptr_t)q->q_ptr) != -1) - return ((uintptr_t)ar.ar_rq); - - return (NULL); -} - -static uintptr_t -arp_wnext(const queue_t *q) -{ - size_t size = mi_osize(q); - ar_t ar; - - if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar), - (uintptr_t)q->q_ptr) != -1) - return ((uintptr_t)ar.ar_wq); - - return (NULL); -} - static const mdb_dcmd_t dcmds[] = { - { "ar", "?", "display ARP client streams for all stacks", - ar_cmd, NULL }, - { "arl", "?", "display ARP link layers for all stacks", arl_cmd, NULL }, - { "ace", "?", "display ARP cache entries for all stacks", - ace_cmd, NULL }, { "arphdr", ":", "display an ARP header", arphdr_cmd, NULL }, - { "arpcmd", ":", "display an ARP command", arpcmd_cmd, NULL }, { NULL } }; /* Note: ar_t walker is in genunix.c and net.c; generic MI walker */ static const mdb_walker_t walkers[] = { - { "arl", "walk list of arl_t links for all stacks", - arl_walk_init, arl_walk_step, NULL }, - { "arl_stack", "walk list of arl_t links", - arl_stack_walk_init, arl_stack_walk_step, NULL }, - { "ace", "walk list of ace_t entries for all stacks", - ace_walk_init, ace_walk_step, NULL }, - { "ace_stack", "walk list of ace_t entries", - ace_stack_walk_init, ace_stack_walk_step, ace_stack_walk_fini }, - { "arp_stacks", "walk all the arp_stack_t", - arp_stacks_walk_init, arp_stacks_walk_step, NULL }, { NULL } }; -static const mdb_qops_t arp_qops = { arp_qinfo, arp_rnext, arp_wnext }; static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers }; const mdb_modinfo_t * _mdb_init(void) { - GElf_Sym sym; - - if 
(mdb_lookup_by_obj("arp", "winit", &sym) == 0) - mdb_qops_install(&arp_qops, (uintptr_t)sym.st_value); - return (&modinfo); } void _mdb_fini(void) { - GElf_Sym sym; - - if (mdb_lookup_by_obj("arp", "winit", &sym) == 0) - mdb_qops_remove(&arp_qops, (uintptr_t)sym.st_value); } diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index 3e49d9a99c..e6fe3f7dcf 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -4770,8 +4770,6 @@ static const mdb_walker_t walkers[] = { NULL, modchain_walk_step, NULL }, /* from net.c */ - { "ar", "walk ar_t structures using MI for all stacks", - mi_payload_walk_init, mi_payload_walk_step, NULL, &mi_ar_arg }, { "icmp", "walk ICMP control structures using MI for all stacks", mi_payload_walk_init, mi_payload_walk_step, NULL, &mi_icmp_arg }, @@ -4779,8 +4777,6 @@ static const mdb_walker_t walkers[] = { mi_walk_init, mi_walk_step, mi_walk_fini, NULL }, { "sonode", "given a sonode, walk its children", sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL }, - { "ar_stacks", "walk all the ar_stack_t", - ar_stacks_walk_init, ar_stacks_walk_step, NULL }, { "icmp_stacks", "walk all the icmp_stack_t", icmp_stacks_walk_init, icmp_stacks_walk_step, NULL }, { "tcp_stacks", "walk all the tcp_stack_t", diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c index d9f4717d7e..23d6202fff 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.c +++ b/usr/src/cmd/mdb/common/modules/genunix/net.c @@ -45,7 +45,6 @@ #include <sys/socketvar.h> #include <sys/cred_impl.h> #include <inet/udp_impl.h> -#include <inet/arp_impl.h> #include <inet/rawip_impl.h> #include <inet/mi.h> #include <fs/sockfs/socktpi_impl.h> @@ -71,31 +70,6 @@ typedef struct netstat_cb_data_s { int af; } netstat_cb_data_t; -/* Walkers for various *_stack_t */ -int -ar_stacks_walk_init(mdb_walk_state_t *wsp) -{ - if 
(mdb_layered_walk("netstack", wsp) == -1) { - mdb_warn("can't walk 'netstack'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -int -ar_stacks_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t kaddr; - netstack_t nss; - - if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) { - mdb_warn("can't read netstack at %p", wsp->walk_addr); - return (WALK_ERR); - } - kaddr = (uintptr_t)nss.netstack_modules[NS_ARP]; - return (wsp->walk_callback(kaddr, wsp->walk_layer, wsp->walk_cbdata)); -} - int icmp_stacks_walk_init(mdb_walk_state_t *wsp) { @@ -201,15 +175,15 @@ net_tcp_active(const tcp_t *tcp) static int net_tcp_ipv4(const tcp_t *tcp) { - return ((tcp->tcp_ipversion == IPV4_VERSION) || - (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6) && + return ((tcp->tcp_connp->conn_ipversion == IPV4_VERSION) || + (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_connp->conn_laddr_v6) && (tcp->tcp_state <= TCPS_LISTEN))); } static int net_tcp_ipv6(const tcp_t *tcp) { - return (tcp->tcp_ipversion == IPV6_VERSION); + return (tcp->tcp_connp->conn_ipversion == IPV6_VERSION); } static int @@ -222,15 +196,15 @@ net_udp_active(const udp_t *udp) static int net_udp_ipv4(const udp_t *udp) { - return ((udp->udp_ipversion == IPV4_VERSION) || - (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src) && + return ((udp->udp_connp->conn_ipversion == IPV4_VERSION) || + (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_connp->conn_laddr_v6) && (udp->udp_state <= TS_IDLE))); } static int net_udp_ipv6(const udp_t *udp) { - return (udp->udp_ipversion == IPV6_VERSION); + return (udp->udp_connp->conn_ipversion == IPV6_VERSION); } int @@ -399,11 +373,6 @@ mi_payload_walk_step(mdb_walk_state_t *wsp) return (WALK_NEXT); } -const mi_payload_walk_arg_t mi_ar_arg = { - "ar_stacks", OFFSETOF(arp_stack_t, as_head), sizeof (ar_t), - MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE -}; - const mi_payload_walk_arg_t mi_icmp_arg = { "icmp_stacks", OFFSETOF(icmp_stack_t, is_head), sizeof (icmp_t), MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE @@ -632,7 +601,7 @@ 
netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) tcp_kaddr = (uintptr_t)connp->conn_tcp; if (mdb_vread(&tcps, sizeof (tcp_t), tcp_kaddr) == -1) { - mdb_warn("failed to read tcp_t at %p", kaddr); + mdb_warn("failed to read tcp_t at %p", tcp_kaddr); return (WALK_ERR); } @@ -648,13 +617,13 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %2i ", tcp_kaddr, tcp->tcp_state); if (af == AF_INET) { - net_ipv4addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv4addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - net_ipv6addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport); + net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv6addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); mdb_printf(" %4i\n", connp->conn_zoneid); @@ -687,6 +656,9 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) return (WALK_ERR); } + connp->conn_udp = &udp; + udp.udp_connp = connp; + if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) || (af == AF_INET && !net_udp_ipv4(&udp)) || (af == AF_INET6 && !net_udp_ipv6(&udp))) { @@ -704,13 +676,13 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_udp, state); if (af == AF_INET) { - net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port); + net_ipv6addrport_pr(&connp->conn_laddr_v6, 
connp->conn_lport); mdb_printf(" "); - net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); mdb_printf(" %4i\n", connp->conn_zoneid); @@ -740,8 +712,11 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) return (WALK_ERR); } - if ((af == AF_INET && icmp.icmp_ipversion != IPV4_VERSION) || - (af == AF_INET6 && icmp.icmp_ipversion != IPV6_VERSION)) { + connp->conn_icmp = &icmp; + icmp.icmp_connp = connp; + + if ((af == AF_INET && connp->conn_ipversion != IPV4_VERSION) || + (af == AF_INET6 && connp->conn_ipversion != IPV6_VERSION)) { return (WALK_NEXT); } @@ -756,16 +731,16 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_icmp, state); if (af == AF_INET) { - mdb_printf("%*I ", ADDR_V4_WIDTH, - V4_PART_OF_V6((icmp.icmp_v6src))); - mdb_printf("%*I ", ADDR_V4_WIDTH, - V4_PART_OF_V6((icmp.icmp_v6dst.sin6_addr))); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); + mdb_printf(" "); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6src); - mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6dst); + net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); + mdb_printf(" "); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); - mdb_printf(" %4i\n", icmp.icmp_zoneid); + mdb_printf(" %4i\n", connp->conn_zoneid); return (WALK_NEXT); } @@ -881,57 +856,57 @@ get_ifname(const ire_t *ire, char *intf) ill_t ill; *intf = '\0'; - if (ire->ire_type == IRE_CACHE) { - queue_t stq; - - if (mdb_vread(&stq, sizeof (stq), (uintptr_t)ire->ire_stq) == - -1) - return; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)stq.q_ptr) == -1) + if (ire->ire_ill != NULL) { + if 
(mdb_vread(&ill, sizeof (ill), + (uintptr_t)ire->ire_ill) == -1) return; (void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length), (uintptr_t)ill.ill_name); - } else if (ire->ire_ipif != NULL) { - ipif_t ipif; - char *cp; - - if (mdb_vread(&ipif, sizeof (ipif), - (uintptr_t)ire->ire_ipif) == -1) - return; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ipif.ipif_ill) == - -1) - return; - (void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length), - (uintptr_t)ill.ill_name); - if (ipif.ipif_id != 0) { - cp = intf + strlen(intf); - (void) mdb_snprintf(cp, LIFNAMSIZ + 1 - (cp - intf), - ":%u", ipif.ipif_id); - } } } +const in6_addr_t ipv6_all_ones = + { 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU }; + static void -get_v4flags(const ire_t *ire, char *flags) +get_ireflags(const ire_t *ire, char *flags) { (void) strcpy(flags, "U"); - if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX || - ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT) + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (ire->ire_flags & RTF_INDIRECT) + (void) strcat(flags, "I"); + else if (ire->ire_type & IRE_OFFLINK) (void) strcat(flags, "G"); - if (ire->ire_mask == IP_HOST_MASK) - (void) strcat(flags, "H"); - if (ire->ire_type == IRE_HOST_REDIRECT) + + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (ire->ire_type & IRE_IF_CLONE) + (void) strcat(flags, "C"); + else if (ire->ire_ipversion == IPV4_VERSION) { + if (ire->ire_mask == IP_HOST_MASK) + (void) strcat(flags, "H"); + } else { + if (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)) + (void) strcat(flags, "H"); + } + + if (ire->ire_flags & RTF_DYNAMIC) (void) strcat(flags, "D"); - if (ire->ire_type == IRE_CACHE) - (void) strcat(flags, "A"); if (ire->ire_type == IRE_BROADCAST) - (void) strcat(flags, "B"); + (void) strcat(flags, "b"); + if (ire->ire_type == IRE_MULTICAST) + (void) strcat(flags, "m"); if (ire->ire_type == IRE_LOCAL) (void) strcat(flags, "L"); + if 
(ire->ire_type == IRE_NOROUTE) + (void) strcat(flags, "N"); if (ire->ire_flags & RTF_MULTIRT) (void) strcat(flags, "M"); if (ire->ire_flags & RTF_SETSRC) (void) strcat(flags, "S"); + if (ire->ire_flags & RTF_REJECT) + (void) strcat(flags, "R"); + if (ire->ire_flags & RTF_BLACKHOLE) + (void) strcat(flags, "B"); } static int @@ -945,8 +920,10 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) if (ire->ire_ipversion != IPV4_VERSION) return (WALK_NEXT); - if (!(*opts & NETSTAT_ALL) && (ire->ire_type == IRE_CACHE || - ire->ire_type == IRE_BROADCAST || ire->ire_type == IRE_LOCAL)) + /* Skip certain IREs by default */ + if (!(*opts & NETSTAT_ALL) && + (ire->ire_type & + (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE))) return (WALK_NEXT); if (*opts & NETSTAT_FIRST) { @@ -966,10 +943,9 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) } } - gate = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) ? - ire->ire_src_addr : ire->ire_gateway_addr; + gate = ire->ire_gateway_addr; - get_v4flags(ire, flags); + get_ireflags(ire, flags); get_ifname(ire, intf); @@ -977,8 +953,8 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%?p %-*I %-*I %-*I %-6s %5u%c %4u %3u %-3s %5u " "%u\n", kaddr, ADDR_V4_WIDTH, ire->ire_addr, ADDR_V4_WIDTH, ire->ire_mask, ADDR_V4_WIDTH, gate, intf, - ire->ire_max_frag, ire->ire_frag_flag ? 
'*' : ' ', - ire->ire_uinfo.iulp_rtt, ire->ire_refcnt, flags, + 0, ' ', + ire->ire_metrics.iulp_rtt, ire->ire_refcnt, flags, ire->ire_ob_pkt_count, ire->ire_ib_pkt_count); } else { mdb_printf("%?p %-*I %-*I %-5s %4u %5u %s\n", kaddr, @@ -1025,7 +1001,10 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) if (ire->ire_ipversion != IPV6_VERSION) return (WALK_NEXT); - if (!(*opts & NETSTAT_ALL) && ire->ire_type == IRE_CACHE) + /* Skip certain IREs by default */ + if (!(*opts & NETSTAT_ALL) && + (ire->ire_type & + (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE))) return (WALK_NEXT); if (*opts & NETSTAT_FIRST) { @@ -1045,37 +1024,21 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) } } - gatep = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) ? - &ire->ire_src_addr_v6 : &ire->ire_gateway_addr_v6; + gatep = &ire->ire_gateway_addr_v6; masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); (void) mdb_snprintf(deststr, sizeof (deststr), "%N/%d", &ire->ire_addr_v6, masklen); - (void) strcpy(flags, "U"); - if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX || - ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT) - (void) strcat(flags, "G"); - if (masklen == IPV6_ABITS) - (void) strcat(flags, "H"); - if (ire->ire_type == IRE_HOST_REDIRECT) - (void) strcat(flags, "D"); - if (ire->ire_type == IRE_CACHE) - (void) strcat(flags, "A"); - if (ire->ire_type == IRE_LOCAL) - (void) strcat(flags, "L"); - if (ire->ire_flags & RTF_MULTIRT) - (void) strcat(flags, "M"); - if (ire->ire_flags & RTF_SETSRC) - (void) strcat(flags, "S"); + get_ireflags(ire, flags); get_ifname(ire, intf); if (*opts & NETSTAT_VERBOSE) { mdb_printf("%?p %-*s %-*N %-5s %5u%c %5u %3u %-5s %6u %u\n", kaddr, ADDR_V6_WIDTH+4, deststr, ADDR_V6_WIDTH, gatep, - intf, ire->ire_max_frag, ire->ire_frag_flag ? 
'*' : ' ', - ire->ire_uinfo.iulp_rtt, ire->ire_refcnt, + intf, 0, ' ', + ire->ire_metrics.iulp_rtt, ire->ire_refcnt, flags, ire->ire_ob_pkt_count, ire->ire_ib_pkt_count); } else { mdb_printf("%?p %-*s %-*N %-5s %3u %6u %s\n", kaddr, diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.h b/usr/src/cmd/mdb/common/modules/genunix/net.h index f2d441e78c..f72d75f75a 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.h +++ b/usr/src/cmd/mdb/common/modules/genunix/net.h @@ -30,7 +30,6 @@ extern "C" { #endif -extern struct mi_payload_walk_arg_s mi_ar_arg; extern struct mi_payload_walk_arg_s mi_icmp_arg; extern struct mi_payload_walk_arg_s mi_ill_arg; @@ -42,8 +41,6 @@ extern int mi_walk_step(mdb_walk_state_t *); extern void mi_walk_fini(mdb_walk_state_t *); extern int mi_payload_walk_init(mdb_walk_state_t *); extern int mi_payload_walk_step(mdb_walk_state_t *); -extern int ar_stacks_walk_init(mdb_walk_state_t *); -extern int ar_stacks_walk_step(mdb_walk_state_t *); extern int icmp_stacks_walk_init(mdb_walk_state_t *); extern int icmp_stacks_walk_step(mdb_walk_state_t *); extern int tcp_stacks_walk_init(mdb_walk_state_t *); diff --git a/usr/src/cmd/mdb/common/modules/genunix/streams.c b/usr/src/cmd/mdb/common/modules/genunix/streams.c index 0458589309..d0095c7752 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/streams.c +++ b/usr/src/cmd/mdb/common/modules/genunix/streams.c @@ -172,7 +172,6 @@ static const struct str_flags mbf[] = { { SF(0x08), "unused" }, { SF(MSGMARKNEXT), "Private: b_next's first byte marked" }, { SF(MSGNOTMARKNEXT), "Private: ... 
not marked" }, - { SF(MSGHASREF), "Private: msg has reference to owner" }, { 0, NULL, NULL } }; diff --git a/usr/src/cmd/mdb/common/modules/genunix/vfs.c b/usr/src/cmd/mdb/common/modules/genunix/vfs.c index 45dc27af23..8001c41b3c 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/vfs.c +++ b/usr/src/cmd/mdb/common/modules/genunix/vfs.c @@ -572,8 +572,9 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) sin_t *sin4; int scanned = 0; boolean_t skip_lback = B_FALSE; + conn_t *connp = sctp->sctp_connp; - addr->sa_family = sctp->sctp_family; + addr->sa_family = connp->conn_family; if (sctp->sctp_nsaddrs == 0) goto done; @@ -636,18 +637,18 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) continue; } - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: /* LINTED: alignment */ sin4 = (sin_t *)addr; if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { sin4->sin_addr.s_addr = INADDR_ANY; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; } else { sin4 += added; sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; IN6_V4MAPPED_TO_INADDR(&laddr, &sin4->sin_addr); } @@ -660,15 +661,14 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) sctp->sctp_bound_to_all) { bzero(&sin6->sin6_addr, sizeof (sin6->sin6_addr)); - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; } else { sin6 += added; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; sin6->sin6_addr = laddr; } - sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; + sin6->sin6_flowinfo = connp->conn_flowinfo; sin6->sin6_scope_id = 0; sin6->__sin6_src_id = 0; break; @@ -712,11 +712,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) struct sockaddr_in6 *sin6; sctp_faddr_t sctp_primary; in6_addr_t faddr; + conn_t *connp = sctp->sctp_connp; if (sctp->sctp_faddrs == NULL) return (-1); - 
addr->sa_family = sctp->sctp_family; + addr->sa_family = connp->conn_family; if (mdb_vread(&sctp_primary, sizeof (sctp_faddr_t), (uintptr_t)sctp->sctp_primary) == -1) { mdb_warn("failed to read sctp primary faddr"); @@ -724,12 +725,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) } faddr = sctp_primary.faddr; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: /* LINTED: alignment */ sin4 = (struct sockaddr_in *)addr; IN6_V4MAPPED_TO_INADDR(&faddr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; @@ -737,7 +738,7 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) /* LINTED: alignment */ sin6 = (struct sockaddr_in6 *)addr; sin6->sin6_addr = faddr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; @@ -797,7 +798,7 @@ tcpip_sock_print(struct sonode *socknode) mdb_printf("socket: "); mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); - mdb_printf("AF_INET %I %d ", conn_t.conn_src, port); + mdb_printf("AF_INET %I %d ", conn_t.conn_laddr_v4, port); /* * If this is a listening socket, we don't print @@ -807,7 +808,8 @@ tcpip_sock_print(struct sonode *socknode) IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { mdb_printf("remote: "); mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); - mdb_printf("AF_INET %I %d ", conn_t.conn_rem, port); + mdb_printf("AF_INET %I %d ", conn_t.conn_faddr_v4, + port); } break; @@ -826,7 +828,7 @@ tcpip_sock_print(struct sonode *socknode) mdb_printf("socket: "); mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); - mdb_printf("AF_INET6 %N %d ", &conn_t.conn_srcv6, port); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_laddr_v4, port); /* * If this is a listening socket, we don't print @@ -836,7 +838,8 @@ tcpip_sock_print(struct sonode *socknode) IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { 
mdb_printf("remote: "); mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); - mdb_printf("AF_INET6 %N %d ", &conn_t.conn_remv6, port); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_faddr_v6, + port); } break; @@ -854,6 +857,7 @@ static int sctp_sock_print(struct sonode *socknode) { sctp_t sctp_t; + conn_t conns; struct sockaddr *laddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP); struct sockaddr *faddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP); @@ -864,6 +868,14 @@ sctp_sock_print(struct sonode *socknode) return (-1); } + if (mdb_vread(&conns, sizeof (conn_t), + (uintptr_t)sctp_t.sctp_connp) == -1) { + mdb_warn("failed to read conn_t at %p", + (uintptr_t)sctp_t.sctp_connp); + return (-1); + } + sctp_t.sctp_connp = &conns; + if (sctp_getsockaddr(&sctp_t, laddr) == 0) { mdb_printf("socket:"); pfiles_print_addr(laddr); diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c index 28f21efe1f..da94942eae 100644 --- a/usr/src/cmd/mdb/common/modules/ip/ip.c +++ b/usr/src/cmd/mdb/common/modules/ip/ip.c @@ -52,6 +52,7 @@ #include <ilb/ilb_nat.h> #include <ilb/ilb_conn.h> #include <sys/dlpi.h> +#include <sys/zone.h> #include <mdb/mdb_modapi.h> #include <mdb/mdb_ks.h> @@ -84,15 +85,20 @@ typedef struct illif_walk_data { ill_if_t ill_if; } illif_walk_data_t; -typedef struct nce_walk_data_s { - struct ndp_g_s nce_ip_ndp; - int nce_hash_tbl_index; - nce_t nce; -} nce_walk_data_t; +typedef struct ncec_walk_data_s { + struct ndp_g_s ncec_ip_ndp; + int ncec_hash_tbl_index; + ncec_t ncec; +} ncec_walk_data_t; + +typedef struct ncec_cbdata_s { + uintptr_t ncec_addr; + int ncec_ipversion; +} ncec_cbdata_t; typedef struct nce_cbdata_s { - uintptr_t nce_addr; - int nce_ipversion; + int nce_ipversion; + char nce_ill_name[LIFNAMSIZ]; } nce_cbdata_t; typedef struct ire_cbdata_s { @@ -100,6 +106,12 @@ typedef struct ire_cbdata_s { boolean_t verbose; } ire_cbdata_t; +typedef struct zi_cbdata_s { + const char *zone_name; + ip_stack_t *ipst; + 
boolean_t shared_ip_zone; +} zi_cbdata_t; + typedef struct th_walk_data { uint_t thw_non_zero_only; boolean_t thw_match; @@ -122,6 +134,7 @@ typedef struct ill_walk_data_s { typedef struct ill_cbdata_s { uintptr_t ill_addr; int ill_ipversion; + ip_stack_t *ill_ipst; boolean_t verbose; } ill_cbdata_t; @@ -156,7 +169,7 @@ static hash_walk_arg_t bind_hash_arg = { }; static hash_walk_arg_t proto_hash_arg = { - OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout), + OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4), 0 }; @@ -210,13 +223,15 @@ static void ip_list_walk_fini(mdb_walk_state_t *); static int srcid_walk_step(mdb_walk_state_t *); static int ire_format(uintptr_t addr, const void *, void *); -static int nce_format(uintptr_t addr, const nce_t *nce, int ipversion); -static int nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); -static int nce_walk_step(mdb_walk_state_t *wsp); -static int nce_stack_walk_init(mdb_walk_state_t *wsp); -static int nce_stack_walk_step(mdb_walk_state_t *wsp); -static void nce_stack_walk_fini(mdb_walk_state_t *wsp); -static int nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id); +static int ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion); +static int ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); +static int ncec_walk_step(mdb_walk_state_t *wsp); +static int ncec_stack_walk_init(mdb_walk_state_t *wsp); +static int ncec_stack_walk_step(mdb_walk_state_t *wsp); +static void ncec_stack_walk_fini(mdb_walk_state_t *wsp); +static int ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw, + ncec_cbdata_t *id); +static char *nce_l2_addr(const nce_t *, const ill_t *); static int ipcl_hash_walk_init(mdb_walk_state_t *); static int ipcl_hash_walk_step(mdb_walk_state_t *); @@ -262,6 +277,69 @@ ips_to_stackid(uintptr_t kaddr) return (nss.netstack_stackid); } +/* ARGSUSED */ +static int +zone_to_ips_cb(uintptr_t addr, const void *zi_arg, void *zi_cb_arg) +{ + zi_cbdata_t *zi_cb = zi_cb_arg; + 
zone_t zone; + char zone_name[ZONENAME_MAX]; + netstack_t ns; + + if (mdb_vread(&zone, sizeof (zone_t), addr) == -1) { + mdb_warn("can't read zone at %p", addr); + return (WALK_ERR); + } + + (void) mdb_readstr(zone_name, ZONENAME_MAX, (uintptr_t)zone.zone_name); + + if (strcmp(zi_cb->zone_name, zone_name) != 0) + return (WALK_NEXT); + + zi_cb->shared_ip_zone = (!(zone.zone_flags & ZF_NET_EXCL) && + (strcmp(zone_name, "global") != 0)); + + if (mdb_vread(&ns, sizeof (netstack_t), (uintptr_t)zone.zone_netstack) + == -1) { + mdb_warn("can't read netstack at %p", zone.zone_netstack); + return (WALK_ERR); + } + + zi_cb->ipst = ns.netstack_ip; + return (WALK_DONE); +} + +static ip_stack_t * +zone_to_ips(const char *zone_name) +{ + zi_cbdata_t zi_cb; + + if (zone_name == NULL) + return (NULL); + + zi_cb.zone_name = zone_name; + zi_cb.ipst = NULL; + zi_cb.shared_ip_zone = B_FALSE; + + if (mdb_walk("zone", (mdb_walk_cb_t)zone_to_ips_cb, &zi_cb) == -1) { + mdb_warn("failed to walk zone"); + return (NULL); + } + + if (zi_cb.shared_ip_zone) { + mdb_warn("%s is a Shared-IP zone, try '-s global' instead\n", + zone_name); + return (NULL); + } + + if (zi_cb.ipst == NULL) { + mdb_warn("failed to find zone %s\n", zone_name); + return (NULL); + } + + return (zi_cb.ipst); +} + int ip_stacks_walk_init(mdb_walk_state_t *wsp) { @@ -529,10 +607,10 @@ illif_help(void) } int -ire_walk_init(mdb_walk_state_t *wsp) +nce_walk_init(mdb_walk_state_t *wsp) { - if (mdb_layered_walk("ire_cache", wsp) == -1) { - mdb_warn("can't walk 'ire_cache'"); + if (mdb_layered_walk("nce_cache", wsp) == -1) { + mdb_warn("can't walk 'nce_cache'"); return (WALK_ERR); } @@ -540,60 +618,129 @@ ire_walk_init(mdb_walk_state_t *wsp) } int -ire_walk_step(mdb_walk_state_t *wsp) +nce_walk_step(mdb_walk_state_t *wsp) { - ire_t ire; + nce_t nce; - if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) { - mdb_warn("can't read ire at %p", wsp->walk_addr); + if (mdb_vread(&nce, sizeof (nce), wsp->walk_addr) == -1) { + 
mdb_warn("can't read nce at %p", wsp->walk_addr); return (WALK_ERR); } - return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata)); + return (wsp->walk_callback(wsp->walk_addr, &nce, wsp->walk_cbdata)); } +static int +nce_format(uintptr_t addr, const nce_t *ncep, void *nce_cb_arg) +{ + nce_cbdata_t *nce_cb = nce_cb_arg; + ill_t ill; + char ill_name[LIFNAMSIZ]; + ncec_t ncec; + + if (mdb_vread(&ncec, sizeof (ncec), + (uintptr_t)ncep->nce_common) == -1) { + mdb_warn("can't read ncec at %p", ncep->nce_common); + return (WALK_NEXT); + } + if (nce_cb->nce_ipversion != 0 && + ncec.ncec_ipversion != nce_cb->nce_ipversion) + return (WALK_NEXT); + + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncep->nce_ill) == -1) { + mdb_snprintf(ill_name, sizeof (ill_name), "--"); + } else { + (void) mdb_readstr(ill_name, + MIN(LIFNAMSIZ, ill.ill_name_length), + (uintptr_t)ill.ill_name); + } + + if (nce_cb->nce_ill_name[0] != '\0' && + strncmp(nce_cb->nce_ill_name, ill_name, LIFNAMSIZ) != 0) + return (WALK_NEXT); + + if (ncec.ncec_ipversion == IPV6_VERSION) { + + mdb_printf("%?p %5s %-18s %?p %6d %N\n", + addr, ill_name, + nce_l2_addr(ncep, &ill), + ncep->nce_fp_mp, + ncep->nce_refcnt, + &ncep->nce_addr); + + } else { + struct in_addr nceaddr; + + IN6_V4MAPPED_TO_INADDR(&ncep->nce_addr, &nceaddr); + mdb_printf("%?p %5s %-18s %?p %6d %I\n", + addr, ill_name, + nce_l2_addr(ncep, &ill), + ncep->nce_fp_mp, + ncep->nce_refcnt, + nceaddr.s_addr); + } + + return (WALK_NEXT); +} int -ire_ctable_walk_step(mdb_walk_state_t *wsp) +dce_walk_init(mdb_walk_state_t *wsp) { - uintptr_t kaddr; - irb_t *irb; - uint32_t cache_table_size; - int i; - ire_cbdata_t ire_cb; + wsp->walk_data = (void *)wsp->walk_addr; - ire_cb.verbose = B_FALSE; - ire_cb.ire_ipversion = 0; + if (mdb_layered_walk("dce_cache", wsp) == -1) { + mdb_warn("can't walk 'dce_cache'"); + return (WALK_ERR); + } + return (WALK_NEXT); +} - kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table_size); +int 
+dce_walk_step(mdb_walk_state_t *wsp) +{ + dce_t dce; - if (mdb_vread(&cache_table_size, sizeof (uint32_t), kaddr) == -1) { - mdb_warn("can't read ips_ip_cache_table at %p", kaddr); + if (mdb_vread(&dce, sizeof (dce), wsp->walk_addr) == -1) { + mdb_warn("can't read dce at %p", wsp->walk_addr); return (WALK_ERR); } - kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table); - if (mdb_vread(&kaddr, sizeof (kaddr), kaddr) == -1) { - mdb_warn("can't read ips_ip_cache_table at %p", kaddr); + /* If ip_stack_t is specified, skip DCEs that don't belong to it. */ + if ((wsp->walk_data != NULL) && (wsp->walk_data != dce.dce_ipst)) + return (WALK_NEXT); + + return (wsp->walk_callback(wsp->walk_addr, &dce, wsp->walk_cbdata)); +} + +int +ire_walk_init(mdb_walk_state_t *wsp) +{ + wsp->walk_data = (void *)wsp->walk_addr; + + if (mdb_layered_walk("ire_cache", wsp) == -1) { + mdb_warn("can't walk 'ire_cache'"); return (WALK_ERR); } - irb = mdb_alloc(sizeof (irb_t) * cache_table_size, UM_SLEEP|UM_GC); - if (mdb_vread(irb, sizeof (irb_t) * cache_table_size, kaddr) == -1) { - mdb_warn("can't read irb at %p", kaddr); + return (WALK_NEXT); +} + +int +ire_walk_step(mdb_walk_state_t *wsp) +{ + ire_t ire; + + if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) { + mdb_warn("can't read ire at %p", wsp->walk_addr); return (WALK_ERR); } - for (i = 0; i < cache_table_size; i++) { - kaddr = (uintptr_t)irb[i].irb_ire; - if (mdb_pwalk("ire_next", ire_format, &ire_cb, - kaddr) == -1) { - mdb_warn("can't walk 'ire_next' for ire %p", kaddr); - return (WALK_ERR); - } - } - return (WALK_NEXT); + /* If ip_stack_t is specified, skip IREs that don't belong to it. 
*/ + if ((wsp->walk_data != NULL) && (wsp->walk_data != ire.ire_ipst)) + return (WALK_NEXT); + + return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata)); } /* ARGSUSED */ @@ -633,6 +780,9 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) const ire_t *irep = ire_arg; ire_cbdata_t *ire_cb = ire_cb_arg; boolean_t verbose = ire_cb->verbose; + ill_t ill; + char ill_name[LIFNAMSIZ]; + boolean_t condemned = irep->ire_generation == IRE_GENERATION_CONDEMNED; static const mdb_bitmask_t tmasks[] = { { "BROADCAST", IRE_BROADCAST, IRE_BROADCAST }, @@ -640,22 +790,12 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) { "LOCAL", IRE_LOCAL, IRE_LOCAL }, { "LOOPBACK", IRE_LOOPBACK, IRE_LOOPBACK }, { "PREFIX", IRE_PREFIX, IRE_PREFIX }, - { "CACHE", IRE_CACHE, IRE_CACHE }, + { "MULTICAST", IRE_MULTICAST, IRE_MULTICAST }, + { "NOROUTE", IRE_NOROUTE, IRE_NOROUTE }, { "IF_NORESOLVER", IRE_IF_NORESOLVER, IRE_IF_NORESOLVER }, { "IF_RESOLVER", IRE_IF_RESOLVER, IRE_IF_RESOLVER }, + { "IF_CLONE", IRE_IF_CLONE, IRE_IF_CLONE }, { "HOST", IRE_HOST, IRE_HOST }, - { "HOST_REDIRECT", IRE_HOST_REDIRECT, IRE_HOST_REDIRECT }, - { NULL, 0, 0 } - }; - - static const mdb_bitmask_t mmasks[] = { - { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED }, - { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN }, - { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD }, - { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY }, - { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK }, - { "PRIVATE", IRE_MARK_PRIVATE_ADDR, IRE_MARK_PRIVATE_ADDR }, - { "UNCACHED", IRE_MARK_UNCACHED, IRE_MARK_UNCACHED }, { NULL, 0, 0 } }; @@ -678,6 +818,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) { "PROTO1", RTF_PROTO1, RTF_PROTO1 }, { "MULTIRT", RTF_MULTIRT, RTF_MULTIRT }, { "SETSRC", RTF_SETSRC, RTF_SETSRC }, + { "INDIRECT", RTF_INDIRECT, RTF_INDIRECT }, { NULL, 0, 0 } }; @@ -685,40 +826,53 @@ ire_format(uintptr_t addr, const void *ire_arg, void 
*ire_cb_arg) irep->ire_ipversion != ire_cb->ire_ipversion) return (WALK_NEXT); + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)irep->ire_ill) == -1) { + mdb_snprintf(ill_name, sizeof (ill_name), "--"); + } else { + (void) mdb_readstr(ill_name, + MIN(LIFNAMSIZ, ill.ill_name_length), + (uintptr_t)ill.ill_name); + } + if (irep->ire_ipversion == IPV6_VERSION && verbose) { - mdb_printf("%<b>%?p%</b> %40N <%hb>\n" - "%?s %40N <%hb>\n" - "%?s %40d %4d <%hb>\n", - addr, &irep->ire_src_addr_v6, irep->ire_type, tmasks, - "", &irep->ire_addr_v6, (ushort_t)irep->ire_marks, mmasks, + mdb_printf("%<b>%?p%</b>%3s %40N <%hb%s>\n" + "%?s %40N\n" + "%?s %40d %4d <%hb> %s\n", + addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6, + irep->ire_type, tmasks, + (irep->ire_testhidden ? ", HIDDEN" : ""), + "", &irep->ire_addr_v6, "", ips_to_stackid((uintptr_t)irep->ire_ipst), irep->ire_zoneid, - irep->ire_flags, fmasks); + irep->ire_flags, fmasks, ill_name); } else if (irep->ire_ipversion == IPV6_VERSION) { - mdb_printf("%?p %30N %30N %5d %4d\n", - addr, &irep->ire_src_addr_v6, + mdb_printf("%?p%3s %30N %30N %5d %4d %s\n", + addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6, &irep->ire_addr_v6, ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid); + irep->ire_zoneid, ill_name); } else if (verbose) { - mdb_printf("%<b>%?p%</b> %40I <%hb>\n" - "%?s %40I <%hb>\n" - "%?s %40d %4d <%hb>\n", - addr, irep->ire_src_addr, irep->ire_type, tmasks, - "", irep->ire_addr, (ushort_t)irep->ire_marks, mmasks, + mdb_printf("%<b>%?p%</b>%3s %40I <%hb%s>\n" + "%?s %40I\n" + "%?s %40d %4d <%hb> %s\n", + addr, condemned ? "(C)" : "", irep->ire_setsrc_addr, + irep->ire_type, tmasks, + (irep->ire_testhidden ? 
", HIDDEN" : ""), + "", irep->ire_addr, "", ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid, irep->ire_flags, fmasks); + irep->ire_zoneid, irep->ire_flags, fmasks, ill_name); } else { - mdb_printf("%?p %30I %30I %5d %4d\n", addr, irep->ire_src_addr, + mdb_printf("%?p%3s %30I %30I %5d %4d %s\n", addr, + condemned ? "(C)" : "", irep->ire_setsrc_addr, irep->ire_addr, ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid); + irep->ire_zoneid, ill_name); } return (WALK_NEXT); @@ -1040,6 +1194,140 @@ ip6hdr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } int +nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + nce_t nce; + nce_cbdata_t nce_cb; + int ipversion = 0; + const char *opt_P = NULL, *opt_ill; + + if (mdb_getopts(argc, argv, + 'i', MDB_OPT_STR, &opt_ill, + 'P', MDB_OPT_STR, &opt_P, NULL) != argc) + return (DCMD_USAGE); + + if (opt_P != NULL) { + if (strcmp("v4", opt_P) == 0) { + ipversion = IPV4_VERSION; + } else if (strcmp("v6", opt_P) == 0) { + ipversion = IPV6_VERSION; + } else { + mdb_warn("invalid protocol '%s'\n", opt_P); + return (DCMD_USAGE); + } + } + + if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { + mdb_printf("%<u>%?s %5s %18s %?s %s %s %</u>\n", + "ADDR", "INTF", "LLADDR", "FP_MP", "REFCNT", + "NCE_ADDR"); + } + + bzero(&nce_cb, sizeof (nce_cb)); + if (opt_ill != NULL) { + strcpy(nce_cb.nce_ill_name, opt_ill); + } + nce_cb.nce_ipversion = ipversion; + + if (flags & DCMD_ADDRSPEC) { + (void) mdb_vread(&nce, sizeof (nce_t), addr); + (void) nce_format(addr, &nce, &nce_cb); + } else if (mdb_walk("nce", (mdb_walk_cb_t)nce_format, &nce_cb) == -1) { + mdb_warn("failed to walk ire table"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +/* ARGSUSED */ +static int +dce_format(uintptr_t addr, const dce_t *dcep, void *dce_cb_arg) +{ + static const mdb_bitmask_t dmasks[] = { + { "D", DCEF_DEFAULT, DCEF_DEFAULT }, + { "P", DCEF_PMTU, DCEF_PMTU }, + { "U", DCEF_UINFO, DCEF_UINFO }, + { "S", 
DCEF_TOO_SMALL_PMTU, DCEF_TOO_SMALL_PMTU }, + { NULL, 0, 0 } + }; + char flagsbuf[2 * A_CNT(dmasks)]; + int ipversion = *(int *)dce_cb_arg; + boolean_t condemned = dcep->dce_generation == DCE_GENERATION_CONDEMNED; + + if (ipversion != 0 && ipversion != dcep->dce_ipversion) + return (WALK_NEXT); + + mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%b", dcep->dce_flags, + dmasks); + + switch (dcep->dce_ipversion) { + case IPV4_VERSION: + mdb_printf("%<u>%?p%3s %8s %8d %30I %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v4addr); + break; + case IPV6_VERSION: + mdb_printf("%<u>%?p%3s %8s %8d %30N %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v6addr); + break; + default: + mdb_printf("%<u>%?p%3s %8s %8d %30s %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, ""); + } + + return (WALK_NEXT); +} + +int +dce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + dce_t dce; + const char *opt_P = NULL; + const char *zone_name = NULL; + ip_stack_t *ipst = NULL; + int ipversion = 0; + + if (mdb_getopts(argc, argv, + 's', MDB_OPT_STR, &zone_name, + 'P', MDB_OPT_STR, &opt_P, NULL) != argc) + return (DCMD_USAGE); + + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + + if (opt_P != NULL) { + if (strcmp("v4", opt_P) == 0) { + ipversion = IPV4_VERSION; + } else if (strcmp("v6", opt_P) == 0) { + ipversion = IPV6_VERSION; + } else { + mdb_warn("invalid protocol '%s'\n", opt_P); + return (DCMD_USAGE); + } + } + + if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { + mdb_printf("%<u>%?s%3s %8s %8s %30s %</u>\n", + "ADDR", "", "FLAGS", "PMTU", "DST_ADDR"); + } + + if (flags & DCMD_ADDRSPEC) { + (void) mdb_vread(&dce, sizeof (dce_t), addr); + (void) dce_format(addr, &dce, &ipversion); + } else if (mdb_pwalk("dce", (mdb_walk_cb_t)dce_format, &ipversion, + (uintptr_t)ipst) == -1) { + mdb_warn("failed to walk dce cache"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +int ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { uint_t verbose = FALSE; @@ -1047,12 +1335,22 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) ire_cbdata_t ire_cb; int ipversion = 0; const char *opt_P = NULL; + const char *zone_name = NULL; + ip_stack_t *ipst = NULL; if (mdb_getopts(argc, argv, 'v', MDB_OPT_SETBITS, TRUE, &verbose, + 's', MDB_OPT_STR, &zone_name, 'P', MDB_OPT_STR, &opt_P, NULL) != argc) return (DCMD_USAGE); + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + if (opt_P != NULL) { if (strcmp("v4", opt_P) == 0) { ipversion = IPV4_VERSION; @@ -1069,13 +1367,13 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (verbose) { mdb_printf("%?s %40s %-20s%\n" "%?s %40s %-20s%\n" - "%<u>%?s %40s %4s %-20s%</u>\n", + "%<u>%?s %40s %4s %-20s %s%</u>\n", "ADDR", "SRC", "TYPE", "", "DST", "MARKS", - "", "STACK", "ZONE", "FLAGS"); + "", "STACK", "ZONE", "FLAGS", "INTF"); } else { - mdb_printf("%<u>%?s %30s %30s %5s %4s%</u>\n", - "ADDR", "SRC", "DST", "STACK", "ZONE"); + mdb_printf("%<u>%?s %30s %30s %5s %4s %s%</u>\n", + "ADDR", "SRC", "DST", "STACK", "ZONE", "INTF"); } } @@ -1085,7 +1383,8 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (flags & DCMD_ADDRSPEC) { (void) mdb_vread(&ire, sizeof (ire_t), addr); (void) ire_format(addr, &ire, &ire_cb); - } else if (mdb_walk("ire", (mdb_walk_cb_t)ire_format, &ire_cb) == -1) { + } else if (mdb_pwalk("ire", (mdb_walk_cb_t)ire_format, &ire_cb, + (uintptr_t)ipst) == -1) { mdb_warn("failed to walk ire table"); return (DCMD_ERR); } @@ -1338,7 +1637,7 @@ th_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) static void th_trace_help(void) { - mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or nce_t, " + mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or ncec_t, " "print the\n" "corresponding th_trace_t structure in detail. 
Otherwise, if no " "address is\n" @@ -1354,8 +1653,8 @@ static const mdb_dcmd_t dcmds[] = { { "srcid_status", ":", "display connection structures from ipcl hash tables", srcid_status }, - { "ill", "?[-v] [-P v4 | v6]", "display ill_t structures", - ill, ill_help }, + { "ill", "?[-v] [-P v4 | v6] [-s exclusive-ip-zone-name]", + "display ill_t structures", ill, ill_help }, { "illif", "?[-P v4 | v6]", "display or filter IP Lower Level InterFace structures", illif, illif_help }, @@ -1363,10 +1662,14 @@ static const mdb_dcmd_t dcmds[] = { { "ip6hdr", ":[-vf]", "display an IPv6 header", ip6hdr }, { "ipif", "?[-v] [-P v4 | v6]", "display ipif structures", ipif, ipif_help }, - { "ire", "?[-v] [-P v4|v6]", + { "ire", "?[-v] [-P v4|v6] [-s exclusive-ip-zone-name]", "display Internet Route Entry structures", ire }, - { "nce", "?[-P v4 | v6]", "display Neighbor Cache Entry structures", - nce }, + { "nce", "?[-P v4|v6] [-i <interface>]", + "display interface-specific Neighbor Cache structures", nce }, + { "ncec", "?[-P v4 | v6]", "display Neighbor Cache Entry structures", + ncec }, + { "dce", "?[-P v4|v6] [-s exclusive-ip-zone-name]", + "display Destination Cache Entry structures", dce }, { "squeue", ":[-v]", "print core squeue_t info", squeue, ip_squeue_help }, { "tcphdr", ":", "display a TCP header", tcphdr }, @@ -1385,7 +1688,7 @@ static const mdb_walker_t walkers[] = { { "illif_stack", "walk list of ill interface types", illif_stack_walk_init, illif_stack_walk_step, illif_stack_walk_fini }, - { "ill", "walk list of nce structures for all stacks", + { "ill", "walk active ill_t structures for all stacks", ill_walk_init, ill_walk_step, NULL }, { "ipif", "walk list of ipif structures for all stacks", ipif_walk_init, ipif_walk_step, NULL }, @@ -1400,19 +1703,21 @@ static const mdb_walker_t walkers[] = { &srcid_walk_arg }, { "ire", "walk active ire_t structures", ire_walk_init, ire_walk_step, NULL }, - { "ire_ctable", "walk ire_t structures in the ctable", - 
ip_stacks_common_walk_init, ire_ctable_walk_step, NULL }, { "ire_next", "walk ire_t structures in the ctable", ire_next_walk_init, ire_next_walk_step, NULL }, + { "nce", "walk active nce_t structures", + nce_walk_init, nce_walk_step, NULL }, + { "dce", "walk active dce_t structures", + dce_walk_init, dce_walk_step, NULL }, { "ip_stacks", "walk all the ip_stack_t", ip_stacks_walk_init, ip_stacks_walk_step, NULL }, { "th_hash", "walk all the th_hash_t entries", th_hash_walk_init, th_hash_walk_step, NULL }, - { "nce", "walk list of nce structures for all stacks", - ip_stacks_common_walk_init, nce_walk_step, NULL }, - { "nce_stack", "walk list of nce structures", - nce_stack_walk_init, nce_stack_walk_step, - nce_stack_walk_fini}, + { "ncec", "walk list of ncec structures for all stacks", + ip_stacks_common_walk_init, ncec_walk_step, NULL }, + { "ncec_stack", "walk list of ncec structures", + ncec_stack_walk_init, ncec_stack_walk_step, + ncec_stack_walk_fini}, { "udp_hash", "walk list of conn_t structures in ips_ipcl_udp_fanout", ipcl_hash_walk_init, ipcl_hash_walk_step, ipcl_hash_walk_fini, &udp_hash_arg}, @@ -1471,9 +1776,9 @@ _mdb_fini(void) } static char * -nce_state(int nce_state) +ncec_state(int ncec_state) { - switch (nce_state) { + switch (ncec_state) { case ND_UNCHANGED: return ("unchanged"); case ND_INCOMPLETE: @@ -1496,36 +1801,61 @@ nce_state(int nce_state) } static char * -nce_l2_addr(const nce_t *nce, const ill_t *ill) +ncec_l2_addr(const ncec_t *ncec, const ill_t *ill) { uchar_t *h; static char addr_buf[L2MAXADDRSTRLEN]; - mblk_t mp; - size_t mblen; - if (ill->ill_flags & ILLF_XRESOLV) { - return ("XRESOLV"); + if (ncec->ncec_lladdr == NULL) { + return ("None"); } - if (nce->nce_res_mp == NULL) { + if (ill->ill_net_type == IRE_IF_RESOLVER) { + + if (ill->ill_phys_addr_length == 0) + return ("None"); + h = mdb_zalloc(ill->ill_phys_addr_length, UM_SLEEP); + if (mdb_vread(h, ill->ill_phys_addr_length, + (uintptr_t)ncec->ncec_lladdr) == -1) { + 
mdb_warn("failed to read hwaddr at %p", + ncec->ncec_lladdr); + return ("Unknown"); + } + mdb_mac_addr(h, ill->ill_phys_addr_length, + addr_buf, sizeof (addr_buf)); + } else { return ("None"); } + mdb_free(h, ill->ill_phys_addr_length); + return (addr_buf); +} - if (ill->ill_net_type == IRE_IF_RESOLVER) { +static char * +nce_l2_addr(const nce_t *nce, const ill_t *ill) +{ + uchar_t *h; + static char addr_buf[L2MAXADDRSTRLEN]; + mblk_t mp; + size_t mblen; + + if (nce->nce_dlur_mp == NULL) + return ("None"); + if (ill->ill_net_type == IRE_IF_RESOLVER) { if (mdb_vread(&mp, sizeof (mblk_t), - (uintptr_t)nce->nce_res_mp) == -1) { - mdb_warn("failed to read nce_res_mp at %p", - nce->nce_res_mp); + (uintptr_t)nce->nce_dlur_mp) == -1) { + mdb_warn("failed to read nce_dlur_mp at %p", + nce->nce_dlur_mp); + return ("None"); } - - if (ill->ill_nd_lla_len == 0) + if (ill->ill_phys_addr_length == 0) return ("None"); mblen = mp.b_wptr - mp.b_rptr; if (mblen > (sizeof (dl_unitdata_req_t) + MAX_SAP_LEN) || - ill->ill_nd_lla_len > MAX_SAP_LEN || - NCE_LL_ADDR_OFFSET(ill) + ill->ill_nd_lla_len > mblen) { - return ("Truncated"); + ill->ill_phys_addr_length > MAX_SAP_LEN || + (NCE_LL_ADDR_OFFSET(ill) + + ill->ill_phys_addr_length) > mblen) { + return ("Unknown"); } h = mdb_zalloc(mblen, UM_SLEEP); if (mdb_vread(h, mblen, (uintptr_t)(mp.b_rptr)) == -1) { @@ -1533,8 +1863,8 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill) mp.b_rptr + NCE_LL_ADDR_OFFSET(ill)); return ("Unknown"); } - mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len, - addr_buf, sizeof (addr_buf)); + mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill), + ill->ill_phys_addr_length, addr_buf, sizeof (addr_buf)); } else { return ("None"); } @@ -1543,7 +1873,7 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill) } static void -nce_header(uint_t flags) +ncec_header(uint_t flags) { if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { @@ -1553,10 +1883,10 @@ nce_header(uint_t flags) } int -nce(uintptr_t addr, uint_t flags, 
int argc, const mdb_arg_t *argv) +ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - nce_t nce; - nce_cbdata_t id; + ncec_t ncec; + ncec_cbdata_t id; int ipversion = 0; const char *opt_P = NULL; @@ -1577,23 +1907,23 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce at %p\n", addr); + if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec at %p\n", addr); return (DCMD_ERR); } - if (ipversion != 0 && nce.nce_ipversion != ipversion) { + if (ipversion != 0 && ncec.ncec_ipversion != ipversion) { mdb_printf("IP Version mismatch\n"); return (DCMD_ERR); } - nce_header(flags); - return (nce_format(addr, &nce, ipversion)); + ncec_header(flags); + return (ncec_format(addr, &ncec, ipversion)); } else { - id.nce_addr = addr; - id.nce_ipversion = ipversion; - nce_header(flags); - if (mdb_walk("nce", (mdb_walk_cb_t)nce_cb, &id) == -1) { - mdb_warn("failed to walk nce table\n"); + id.ncec_addr = addr; + id.ncec_ipversion = ipversion; + ncec_header(flags); + if (mdb_walk("ncec", (mdb_walk_cb_t)ncec_cb, &id) == -1) { + mdb_warn("failed to walk ncec table\n"); return (DCMD_ERR); } } @@ -1601,10 +1931,10 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } static int -nce_format(uintptr_t addr, const nce_t *nce, int ipversion) +ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion) { - static const mdb_bitmask_t nce_flags[] = { - { "P", NCE_F_PERMANENT, NCE_F_PERMANENT }, + static const mdb_bitmask_t ncec_flags[] = { + { "P", NCE_F_NONUD, NCE_F_NONUD }, { "R", NCE_F_ISROUTER, NCE_F_ISROUTER }, { "N", NCE_F_NONUD, NCE_F_NONUD }, { "A", NCE_F_ANYCAST, NCE_F_ANYCAST }, @@ -1613,15 +1943,15 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion) { "B", NCE_F_BCAST, NCE_F_BCAST }, { NULL, 0, 0 } }; -#define NCE_MAX_FLAGS (sizeof (nce_flags) / sizeof (mdb_bitmask_t)) +#define 
NCE_MAX_FLAGS (sizeof (ncec_flags) / sizeof (mdb_bitmask_t)) struct in_addr nceaddr; ill_t ill; char ill_name[LIFNAMSIZ]; char flagsbuf[NCE_MAX_FLAGS]; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)nce->nce_ill) == -1) { - mdb_warn("failed to read nce_ill at %p", - nce->nce_ill); + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncec->ncec_ill) == -1) { + mdb_warn("failed to read ncec_ill at %p", + ncec->ncec_ill); return (DCMD_ERR); } @@ -1629,33 +1959,33 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion) (uintptr_t)ill.ill_name); mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%hb", - nce->nce_flags, nce_flags); + ncec->ncec_flags, ncec_flags); - if (ipversion != 0 && nce->nce_ipversion != ipversion) + if (ipversion != 0 && ncec->ncec_ipversion != ipversion) return (DCMD_OK); - if (nce->nce_ipversion == IPV4_VERSION) { - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); + if (ncec->ncec_ipversion == IPV4_VERSION) { + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nceaddr); mdb_printf("%?p %-20s %-10s " "%-8s " "%-5s %I\n", - addr, nce_l2_addr(nce, &ill), - nce_state(nce->nce_state), + addr, ncec_l2_addr(ncec, &ill), + ncec_state(ncec->ncec_state), flagsbuf, ill_name, nceaddr.s_addr); } else { mdb_printf("%?p %-20s %-10s %-8s %-5s %N\n", - addr, nce_l2_addr(nce, &ill), - nce_state(nce->nce_state), + addr, ncec_l2_addr(ncec, &ill), + ncec_state(ncec->ncec_state), flagsbuf, - ill_name, &nce->nce_addr); + ill_name, &ncec->ncec_addr); } return (DCMD_OK); } static uintptr_t -nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) +ncec_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) { uintptr_t addr = start; int i = *index; @@ -1671,7 +2001,7 @@ nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) } static int -nce_walk_step(mdb_walk_state_t *wsp) +ncec_walk_step(mdb_walk_state_t *wsp) { uintptr_t kaddr4, kaddr6; @@ -1686,15 +2016,15 @@ nce_walk_step(mdb_walk_state_t *wsp) mdb_warn("can't read ips_ip_cache_table at 
%p", kaddr6); return (WALK_ERR); } - if (mdb_pwalk("nce_stack", wsp->walk_callback, wsp->walk_cbdata, + if (mdb_pwalk("ncec_stack", wsp->walk_callback, wsp->walk_cbdata, kaddr4) == -1) { - mdb_warn("couldn't walk 'nce_stack' for ips_ndp4 %p", + mdb_warn("couldn't walk 'ncec_stack' for ips_ndp4 %p", kaddr4); return (WALK_ERR); } - if (mdb_pwalk("nce_stack", wsp->walk_callback, + if (mdb_pwalk("ncec_stack", wsp->walk_callback, wsp->walk_cbdata, kaddr6) == -1) { - mdb_warn("couldn't walk 'nce_stack' for ips_ndp6 %p", + mdb_warn("couldn't walk 'ncec_stack' for ips_ndp6 %p", kaddr6); return (WALK_ERR); } @@ -1743,7 +2073,7 @@ ipcl_hash_walk_init(mdb_walk_state_t *wsp) mdb_free(iw, sizeof (ipcl_hash_walk_data_t)); return (WALK_ERR); } - if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout) || + if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4) || arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v6)) { iw->hash_tbl_size = IPPROTO_MAX; } else { @@ -1809,72 +2139,75 @@ ipcl_hash_walk_fini(mdb_walk_state_t *wsp) * Called with walk_addr being the address of ips_ndp{4,6} */ static int -nce_stack_walk_init(mdb_walk_state_t *wsp) +ncec_stack_walk_init(mdb_walk_state_t *wsp) { - nce_walk_data_t *nw; + ncec_walk_data_t *nw; if (wsp->walk_addr == NULL) { - mdb_warn("nce_stack requires ndp_g_s address\n"); + mdb_warn("ncec_stack requires ndp_g_s address\n"); return (WALK_ERR); } - nw = mdb_alloc(sizeof (nce_walk_data_t), UM_SLEEP); + nw = mdb_alloc(sizeof (ncec_walk_data_t), UM_SLEEP); - if (mdb_vread(&nw->nce_ip_ndp, sizeof (struct ndp_g_s), + if (mdb_vread(&nw->ncec_ip_ndp, sizeof (struct ndp_g_s), wsp->walk_addr) == -1) { mdb_warn("failed to read 'ip_ndp' at %p", wsp->walk_addr); - mdb_free(nw, sizeof (nce_walk_data_t)); + mdb_free(nw, sizeof (ncec_walk_data_t)); return (WALK_ERR); } - nw->nce_hash_tbl_index = 0; - wsp->walk_addr = nce_get_next_hash_tbl(NULL, - &nw->nce_hash_tbl_index, nw->nce_ip_ndp); + /* + * ncec_get_next_hash_tbl() starts 
at ++i , so initialize index to -1 + */ + nw->ncec_hash_tbl_index = -1; + wsp->walk_addr = ncec_get_next_hash_tbl(NULL, + &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp); wsp->walk_data = nw; return (WALK_NEXT); } static int -nce_stack_walk_step(mdb_walk_state_t *wsp) +ncec_stack_walk_step(mdb_walk_state_t *wsp) { uintptr_t addr = wsp->walk_addr; - nce_walk_data_t *nw = wsp->walk_data; + ncec_walk_data_t *nw = wsp->walk_data; if (addr == NULL) return (WALK_DONE); - if (mdb_vread(&nw->nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce_t at %p", addr); + if (mdb_vread(&nw->ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec_t at %p", addr); return (WALK_ERR); } - wsp->walk_addr = (uintptr_t)nw->nce.nce_next; + wsp->walk_addr = (uintptr_t)nw->ncec.ncec_next; - wsp->walk_addr = nce_get_next_hash_tbl(wsp->walk_addr, - &nw->nce_hash_tbl_index, nw->nce_ip_ndp); + wsp->walk_addr = ncec_get_next_hash_tbl(wsp->walk_addr, + &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp); return (wsp->walk_callback(addr, nw, wsp->walk_cbdata)); } static void -nce_stack_walk_fini(mdb_walk_state_t *wsp) +ncec_stack_walk_fini(mdb_walk_state_t *wsp) { - mdb_free(wsp->walk_data, sizeof (nce_walk_data_t)); + mdb_free(wsp->walk_data, sizeof (ncec_walk_data_t)); } /* ARGSUSED */ static int -nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id) +ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw, ncec_cbdata_t *id) { - nce_t nce; + ncec_t ncec; - if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce at %p", addr); + if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec at %p", addr); return (WALK_NEXT); } - (void) nce_format(addr, &nce, id->nce_ipversion); + (void) ncec_format(addr, &ncec, id->ncec_ipversion); return (WALK_NEXT); } @@ -1918,6 +2251,11 @@ ill_cb(uintptr_t addr, const ill_walk_data_t *iw, ill_cbdata_t *id) mdb_warn("failed to read ill at %p", addr); return (WALK_NEXT); } + + /* If ip_stack_t 
is specified, skip ILLs that don't belong to it. */ + if (id->ill_ipst != NULL && ill.ill_ipst != id->ill_ipst) + return (WALK_NEXT); + return (ill_format((uintptr_t)addr, &ill, id)); } @@ -2013,7 +2351,7 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg) break; } cnt = ill->ill_refcnt + ill->ill_ire_cnt + ill->ill_nce_cnt + - ill->ill_ilm_walker_cnt + ill->ill_ilm_cnt; + ill->ill_ilm_cnt + ill->ill_ncec_cnt; mdb_printf("%-?p %-8s %-3s ", addr, ill_name, ill->ill_isv6 ? "v6" : "v4"); if (typebuf != NULL) @@ -2035,11 +2373,10 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg) strlen(sbuf), "", ill->ill_ire_cnt, "ill_ire_cnt"); mdb_printf("%*s %7d %-18s nces referencing this ill\n", strlen(sbuf), "", ill->ill_nce_cnt, "ill_nce_cnt"); + mdb_printf("%*s %7d %-18s ncecs referencing this ill\n", + strlen(sbuf), "", ill->ill_ncec_cnt, "ill_ncec_cnt"); mdb_printf("%*s %7d %-18s ilms referencing this ill\n", strlen(sbuf), "", ill->ill_ilm_cnt, "ill_ilm_cnt"); - mdb_printf("%*s %7d %-18s active ilm walkers\n\n", - strlen(sbuf), "", ill->ill_ilm_walker_cnt, - "ill_ilm_walker_cnt"); } else { mdb_printf("%4d %-?p %-llb\n", cnt, ill->ill_wq, @@ -2054,14 +2391,24 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) ill_t ill_data; ill_cbdata_t id; int ipversion = 0; + const char *zone_name = NULL; const char *opt_P = NULL; uint_t verbose = FALSE; + ip_stack_t *ipst = NULL; if (mdb_getopts(argc, argv, 'v', MDB_OPT_SETBITS, TRUE, &verbose, + 's', MDB_OPT_STR, &zone_name, 'P', MDB_OPT_STR, &opt_P, NULL) != argc) return (DCMD_USAGE); + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + if (opt_P != NULL) { if (strcmp("v4", opt_P) == 0) { ipversion = IPV4_VERSION; @@ -2076,6 +2423,7 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) id.verbose = verbose; id.ill_addr = addr; id.ill_ipversion = ipversion; + id.ill_ipst = ipst; ill_header(verbose); if (flags & DCMD_ADDRSPEC) { @@ -2254,7 +2602,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) { "CO", IPIF_CONDEMNED, IPIF_CONDEMNED}, { "CH", IPIF_CHANGING, IPIF_CHANGING}, { "SL", IPIF_SET_LINKLOCAL, IPIF_SET_LINKLOCAL}, - { "ZS", IPIF_ZERO_SOURCE, IPIF_ZERO_SOURCE}, { NULL, 0, 0 } }; static const mdb_bitmask_t fmasks[] = { @@ -2299,16 +2646,14 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) } mdb_snprintf(bitfields, sizeof (bitfields), "%s", ipif->ipif_addr_ready ? ",ADR" : "", - ipif->ipif_multicast_up ? ",MU" : "", ipif->ipif_was_up ? ",WU" : "", - ipif->ipif_was_dup ? ",WD" : "", - ipif->ipif_joined_allhosts ? ",JA" : ""); + ipif->ipif_was_dup ? 
",WD" : ""); mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%llb%s", ipif->ipif_flags, fmasks, bitfields); mdb_snprintf(sflagsbuf, sizeof (sflagsbuf), "%b", ipif->ipif_state_flags, sfmasks); - cnt = ipif->ipif_refcnt + ipif->ipif_ire_cnt + ipif->ipif_ilm_cnt; + cnt = ipif->ipif_refcnt; if (ipifcb->ill.ill_isv6) { mdb_snprintf(addrstr, sizeof (addrstr), "%N", @@ -2329,12 +2674,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) mdb_printf("%s |\n%s +---> %4d %-15s " "Active consistent reader cnt\n", sbuf, sbuf, ipif->ipif_refcnt, "ipif_refcnt"); - mdb_printf("%*s %10d %-15s " - "Number of ire's referencing this ipif\n", - strlen(sbuf), "", ipif->ipif_ire_cnt, "ipif_ire_cnt"); - mdb_printf("%*s %10d %-15s " - "Number of ilm's referencing this ipif\n\n", - strlen(sbuf), "", ipif->ipif_ilm_cnt, "ipif_ilm_cnt"); mdb_printf("%-s/%d\n", addrstr, mask_to_prefixlen(af, &ipif->ipif_v6net_mask)); if (ipifcb->ill.ill_isv6) { @@ -2473,16 +2812,16 @@ conn_status_cb(uintptr_t addr, const void *walk_data, mdb_printf("%-?p %-?p %?d %?d\n", addr, conn->conn_wq, nss.netstack_stackid, conn->conn_zoneid); - if (conn->conn_af_isv6) { + if (conn->conn_family == AF_INET6) { mdb_snprintf(src_addrstr, sizeof (rem_addrstr), "%N", - &conn->conn_srcv6); + &conn->conn_laddr_v6); mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%N", - &conn->conn_remv6); + &conn->conn_faddr_v6); } else { mdb_snprintf(src_addrstr, sizeof (src_addrstr), "%I", - V4_PART_OF_V6((conn->conn_srcv6))); + V4_PART_OF_V6((conn->conn_laddr_v6))); mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%I", - V4_PART_OF_V6((conn->conn_remv6))); + V4_PART_OF_V6((conn->conn_faddr_v6))); } mdb_printf("%s:%-5d\n%s:%-5d\n", src_addrstr, conn->conn_lport, rem_addrstr, conn->conn_fport); @@ -2519,7 +2858,7 @@ conn_status_help(void) { mdb_printf("Prints conn_t structures from the following hash tables: " "\n\tips_ipcl_udp_fanout\n\tips_ipcl_bind_fanout" - "\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout" + 
"\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout_v4" "\n\tips_ipcl_proto_fanout_v6\n"); } diff --git a/usr/src/cmd/mdb/common/modules/sctp/sctp.c b/usr/src/cmd/mdb/common/modules/sctp/sctp.c index 05f0c385c8..4165a56ca4 100644 --- a/usr/src/cmd/mdb/common/modules/sctp/sctp.c +++ b/usr/src/cmd/mdb/common/modules/sctp/sctp.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/mdb_modapi.h> @@ -164,7 +162,7 @@ sctp_faddr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("lastactive\t%?ld\thb_secret\t%?#lx\n", fa->lastactive, fa->hb_secret); mdb_printf("rxt_unacked\t%?u\n", fa->rxt_unacked); - mdb_printf("timer_mp\t%?p\tire\t\t%?p\n", fa->timer_mp, fa->ire); + mdb_printf("timer_mp\t%?p\tixa\t\t%?p\n", fa->timer_mp, fa->ixa); mdb_printf("hb_enabled\t%?d\thb_pending\t%?d\n" "timer_running\t%?d\tdf\t\t%?d\n" "pmtu_discovered\t%?d\tisv4\t\t%?d\n" @@ -566,11 +564,12 @@ show_sctp_flags(sctp_t *sctp) { mdb_printf("\tunderstands_asconf\t%d\n", sctp->sctp_understands_asconf); - mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_debug); + mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_connp->conn_debug); mdb_printf("\tcchunk_pend\t\t%d\n", sctp->sctp_cchunk_pend); - mdb_printf("\tdgram_errind\t\t%d\n", sctp->sctp_dgram_errind); + mdb_printf("\tdgram_errind\t\t%d\n", + sctp->sctp_connp->conn_dgram_errind); - mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_linger); + mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_connp->conn_linger); if (sctp->sctp_lingering) return; mdb_printf("\tlingering\t\t%d\n", sctp->sctp_lingering); @@ -578,7 +577,8 @@ show_sctp_flags(sctp_t *sctp) mdb_printf("\tforce_sack\t\t%d\n", sctp->sctp_force_sack); mdb_printf("\tack_timer_runing\t%d\n", sctp->sctp_ack_timer_running); - mdb_printf("\trecvdstaddr\t\t%d\n", 
sctp->sctp_recvdstaddr); + mdb_printf("\trecvdstaddr\t\t%d\n", + sctp->sctp_connp->conn_recv_ancillary.crb_recvdstaddr); mdb_printf("\thwcksum\t\t\t%d\n", sctp->sctp_hwcksum); mdb_printf("\tunderstands_addip\t%d\n", sctp->sctp_understands_addip); @@ -654,8 +654,8 @@ print_saddr(uintptr_t ptr, const void *addr, void *cbdata) if (saddr->saddr_ipif_delete_pending == 1) mdb_printf("/DeletePending"); mdb_printf(")\n"); - mdb_printf("\t\t\tMTU %d id %d zoneid %d IPIF flags %x\n", - ipif.sctp_ipif_mtu, ipif.sctp_ipif_id, + mdb_printf("\t\t\tid %d zoneid %d IPIF flags %x\n", + ipif.sctp_ipif_id, ipif.sctp_ipif_zoneid, ipif.sctp_ipif_flags); return (WALK_NEXT); } @@ -682,8 +682,8 @@ print_faddr(uintptr_t ptr, const void *addr, void *cbdata) int sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - sctp_t sctp; - conn_t connp; + sctp_t sctps, *sctp; + conn_t conns, *connp; int i; uint_t opts = 0; uint_t paddr = 0; @@ -692,16 +692,23 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (!(flags & DCMD_ADDRSPEC)) return (DCMD_USAGE); - if (mdb_vread(&sctp, sizeof (sctp), addr) == -1) { + if (mdb_vread(&sctps, sizeof (sctps), addr) == -1) { mdb_warn("failed to read sctp_t at: %p\n", addr); return (DCMD_ERR); } - if (mdb_vread(&connp, sizeof (connp), - (uintptr_t)sctp.sctp_connp) == -1) { - mdb_warn("failed to read conn_t at: %p\n", sctp.sctp_connp); + sctp = &sctps; + + if (mdb_vread(&conns, sizeof (conns), + (uintptr_t)sctp->sctp_connp) == -1) { + mdb_warn("failed to read conn_t at: %p\n", sctp->sctp_connp); return (DCMD_ERR); } + connp = &conns; + + connp->conn_sctp = sctp; + sctp->sctp_connp = connp; + if (mdb_getopts(argc, argv, 'a', MDB_OPT_SETBITS, MDB_SCTP_SHOW_ALL, &opts, 'f', MDB_OPT_SETBITS, MDB_SCTP_SHOW_FLAGS, &opts, @@ -726,7 +733,7 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) /* non-verbose faddrs, suitable for pipelines to sctp_faddr */ if (paddr != 0) { sctp_faddr_t faddr, *fp; - for (fp = 
sctp.sctp_faddrs; fp != NULL; fp = faddr.next) { + for (fp = sctp->sctp_faddrs; fp != NULL; fp = faddr.next) { if (mdb_vread(&faddr, sizeof (faddr), (uintptr_t)fp) == -1) { mdb_warn("failed to read faddr at %p", @@ -738,16 +745,16 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } - mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport)); - mdb_nhconvert(&fport, &sctp.sctp_fport, sizeof (fport)); + mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport)); + mdb_nhconvert(&fport, &connp->conn_fport, sizeof (fport)); mdb_printf("%<u>%p% %22s S=%-6hu D=%-6hu% STACK=%d ZONE=%d%</u>", addr, - state2str(&sctp), lport, fport, - ns_to_stackid((uintptr_t)connp.conn_netstack), connp.conn_zoneid); + state2str(sctp), lport, fport, + ns_to_stackid((uintptr_t)connp->conn_netstack), connp->conn_zoneid); - if (sctp.sctp_faddrs) { + if (sctp->sctp_faddrs) { sctp_faddr_t faddr; if (mdb_vread(&faddr, sizeof (faddr), - (uintptr_t)sctp.sctp_faddrs) != -1) + (uintptr_t)sctp->sctp_faddrs) != -1) mdb_printf("%<u> %N%</u>", &faddr.faddr); } mdb_printf("\n"); @@ -756,78 +763,78 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%<b>Local and Peer Addresses%</b>\n"); /* Display source addresses */ - mdb_printf("nsaddrs\t\t%?d\n", sctp.sctp_nsaddrs); + mdb_printf("nsaddrs\t\t%?d\n", sctp->sctp_nsaddrs); (void) mdb_pwalk("sctp_walk_saddr", print_saddr, NULL, addr); /* Display peer addresses */ - mdb_printf("nfaddrs\t\t%?d\n", sctp.sctp_nfaddrs); + mdb_printf("nfaddrs\t\t%?d\n", sctp->sctp_nfaddrs); i = 1; (void) mdb_pwalk("sctp_walk_faddr", print_faddr, &i, addr); mdb_printf("lastfaddr\t%?p\tprimary\t\t%?p\n", - sctp.sctp_lastfaddr, sctp.sctp_primary); + sctp->sctp_lastfaddr, sctp->sctp_primary); mdb_printf("current\t\t%?p\tlastdata\t%?p\n", - sctp.sctp_current, sctp.sctp_lastdata); + sctp->sctp_current, sctp->sctp_lastdata); } if (opts & MDB_SCTP_SHOW_OUT) { mdb_printf("%<b>Outbound Data%</b>\n"); 
mdb_printf("xmit_head\t%?p\txmit_tail\t%?p\n", - sctp.sctp_xmit_head, sctp.sctp_xmit_tail); + sctp->sctp_xmit_head, sctp->sctp_xmit_tail); mdb_printf("xmit_unsent\t%?p\txmit_unsent_tail%?p\n", - sctp.sctp_xmit_unsent, sctp.sctp_xmit_unsent_tail); - mdb_printf("xmit_unacked\t%?p\n", sctp.sctp_xmit_unacked); + sctp->sctp_xmit_unsent, sctp->sctp_xmit_unsent_tail); + mdb_printf("xmit_unacked\t%?p\n", sctp->sctp_xmit_unacked); mdb_printf("unacked\t\t%?u\tunsent\t\t%?ld\n", - sctp.sctp_unacked, sctp.sctp_unsent); + sctp->sctp_unacked, sctp->sctp_unsent); mdb_printf("ltsn\t\t%?x\tlastack_rxd\t%?x\n", - sctp.sctp_ltsn, sctp.sctp_lastack_rxd); + sctp->sctp_ltsn, sctp->sctp_lastack_rxd); mdb_printf("recovery_tsn\t%?x\tadv_pap\t\t%?x\n", - sctp.sctp_recovery_tsn, sctp.sctp_adv_pap); + sctp->sctp_recovery_tsn, sctp->sctp_adv_pap); mdb_printf("num_ostr\t%?hu\tostrcntrs\t%?p\n", - sctp.sctp_num_ostr, sctp.sctp_ostrcntrs); + sctp->sctp_num_ostr, sctp->sctp_ostrcntrs); mdb_printf("pad_mp\t\t%?p\terr_chunks\t%?p\n", - sctp.sctp_pad_mp, sctp.sctp_err_chunks); - mdb_printf("err_len\t\t%?u\n", sctp.sctp_err_len); + sctp->sctp_pad_mp, sctp->sctp_err_chunks); + mdb_printf("err_len\t\t%?u\n", sctp->sctp_err_len); mdb_printf("%<b>Default Send Parameters%</b>\n"); mdb_printf("def_stream\t%?u\tdef_flags\t%?x\n", - sctp.sctp_def_stream, sctp.sctp_def_flags); + sctp->sctp_def_stream, sctp->sctp_def_flags); mdb_printf("def_ppid\t%?x\tdef_context\t%?x\n", - sctp.sctp_def_ppid, sctp.sctp_def_context); + sctp->sctp_def_ppid, sctp->sctp_def_context); mdb_printf("def_timetolive\t%?u\n", - sctp.sctp_def_timetolive); + sctp->sctp_def_timetolive); } if (opts & MDB_SCTP_SHOW_IN) { mdb_printf("%<b>Inbound Data%</b>\n"); mdb_printf("sack_info\t%?p\tsack_gaps\t%?d\n", - sctp.sctp_sack_info, sctp.sctp_sack_gaps); - dump_sack_info((uintptr_t)sctp.sctp_sack_info); + sctp->sctp_sack_info, sctp->sctp_sack_gaps); + dump_sack_info((uintptr_t)sctp->sctp_sack_info); mdb_printf("ftsn\t\t%?x\tlastacked\t%?x\n", - 
sctp.sctp_ftsn, sctp.sctp_lastacked); + sctp->sctp_ftsn, sctp->sctp_lastacked); mdb_printf("istr_nmsgs\t%?d\tsack_toggle\t%?d\n", - sctp.sctp_istr_nmsgs, sctp.sctp_sack_toggle); - mdb_printf("ack_mp\t\t%?p\n", sctp.sctp_ack_mp); + sctp->sctp_istr_nmsgs, sctp->sctp_sack_toggle); + mdb_printf("ack_mp\t\t%?p\n", sctp->sctp_ack_mp); mdb_printf("num_istr\t%?hu\tinstr\t\t%?p\n", - sctp.sctp_num_istr, sctp.sctp_instr); - mdb_printf("unord_reass\t%?p\n", sctp.sctp_uo_frags); + sctp->sctp_num_istr, sctp->sctp_instr); + mdb_printf("unord_reass\t%?p\n", sctp->sctp_uo_frags); } if (opts & MDB_SCTP_SHOW_RTT) { mdb_printf("%<b>RTT Tracking%</b>\n"); mdb_printf("rtt_tsn\t\t%?x\tout_time\t%?ld\n", - sctp.sctp_rtt_tsn, sctp.sctp_out_time); + sctp->sctp_rtt_tsn, sctp->sctp_out_time); } if (opts & MDB_SCTP_SHOW_FLOW) { mdb_printf("%<b>Flow Control%</b>\n"); - mdb_printf("txmit_hiwater\t%?d\n" - "xmit_lowater\t%?d\tfrwnd\t\t%?u\n" + mdb_printf("tconn_sndbuf\t%?d\n" + "conn_sndlowat\t%?d\tfrwnd\t\t%?u\n" "rwnd\t\t%?u\tinitial rwnd\t%?u\n" - "rxqueued\t%?u\tcwnd_max\t%?u\n", sctp.sctp_xmit_hiwater, - sctp.sctp_xmit_lowater, sctp.sctp_frwnd, - sctp.sctp_rwnd, sctp.sctp_irwnd, sctp.sctp_rxqueued, - sctp.sctp_cwnd_max); + "rxqueued\t%?u\tcwnd_max\t%?u\n", connp->conn_sndbuf, + connp->conn_sndlowat, sctp->sctp_frwnd, + sctp->sctp_rwnd, sctp->sctp_irwnd, sctp->sctp_rxqueued, + sctp->sctp_cwnd_max); } if (opts & MDB_SCTP_SHOW_HDR) { @@ -838,21 +845,21 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "ipha\t\t%?p\tip6h\t\t%?p\n" "ip_hdr_len\t%?d\tip_hdr6_len\t%?d\n" "sctph\t\t%?p\tsctph6\t\t%?p\n" - "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp.sctp_iphc, - sctp.sctp_iphc6, sctp.sctp_iphc_len, - sctp.sctp_iphc6_len, sctp.sctp_hdr_len, - sctp.sctp_hdr6_len, sctp.sctp_ipha, sctp.sctp_ip6h, - sctp.sctp_ip_hdr_len, sctp.sctp_ip_hdr6_len, - sctp.sctp_sctph, sctp.sctp_sctph6, sctp.sctp_lvtag, - sctp.sctp_fvtag); + "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp->sctp_iphc, + sctp->sctp_iphc6, 
sctp->sctp_iphc_len, + sctp->sctp_iphc6_len, sctp->sctp_hdr_len, + sctp->sctp_hdr6_len, sctp->sctp_ipha, sctp->sctp_ip6h, + sctp->sctp_ip_hdr_len, sctp->sctp_ip_hdr6_len, + sctp->sctp_sctph, sctp->sctp_sctph6, sctp->sctp_lvtag, + sctp->sctp_fvtag); } if (opts & MDB_SCTP_SHOW_PMTUD) { mdb_printf("%<b>PMTUd%</b>\n"); mdb_printf("last_mtu_probe\t%?ld\tmtu_probe_intvl\t%?ld\n" "mss\t\t%?u\n", - sctp.sctp_last_mtu_probe, sctp.sctp_mtu_probe_intvl, - sctp.sctp_mss); + sctp->sctp_last_mtu_probe, sctp->sctp_mtu_probe_intvl, + sctp->sctp_mss); } if (opts & MDB_SCTP_SHOW_RXT) { @@ -862,33 +869,33 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "pp_max_rxt\t%?d\trto_max\t\t%?u\n" "rto_min\t\t%?u\trto_initial\t%?u\n" "init_rto_max\t%?u\n" - "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp.sctp_cookie_mp, - sctp.sctp_strikes, sctp.sctp_max_init_rxt, - sctp.sctp_pa_max_rxt, sctp.sctp_pp_max_rxt, - sctp.sctp_rto_max, sctp.sctp_rto_min, - sctp.sctp_rto_initial, sctp.sctp_init_rto_max, - sctp.sctp_rxt_nxttsn, sctp.sctp_rxt_maxtsn); + "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp->sctp_cookie_mp, + sctp->sctp_strikes, sctp->sctp_max_init_rxt, + sctp->sctp_pa_max_rxt, sctp->sctp_pp_max_rxt, + sctp->sctp_rto_max, sctp->sctp_rto_min, + sctp->sctp_rto_initial, sctp->sctp_init_rto_max, + sctp->sctp_rxt_nxttsn, sctp->sctp_rxt_maxtsn); } if (opts & MDB_SCTP_SHOW_CONN) { mdb_printf("%<b>Connection State%</b>\n"); mdb_printf("last_secret_update%?ld\n", - sctp.sctp_last_secret_update); + sctp->sctp_last_secret_update); mdb_printf("secret\t\t"); for (i = 0; i < SCTP_SECRET_LEN; i++) { if (i % 2 == 0) - mdb_printf("0x%02x", sctp.sctp_secret[i]); + mdb_printf("0x%02x", sctp->sctp_secret[i]); else - mdb_printf("%02x ", sctp.sctp_secret[i]); + mdb_printf("%02x ", sctp->sctp_secret[i]); } mdb_printf("\n"); mdb_printf("old_secret\t"); for (i = 0; i < SCTP_SECRET_LEN; i++) { if (i % 2 == 0) - mdb_printf("0x%02x", sctp.sctp_old_secret[i]); + mdb_printf("0x%02x", 
sctp->sctp_old_secret[i]); else - mdb_printf("%02x ", sctp.sctp_old_secret[i]); + mdb_printf("%02x ", sctp->sctp_old_secret[i]); } mdb_printf("\n"); } @@ -901,40 +908,40 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "T2expire\t%?lu\tT3expire\t%?lu\n" "msgcount\t%?llu\tprsctpdrop\t%?llu\n" "AssocStartTime\t%?lu\n", - sctp.sctp_opkts, sctp.sctp_obchunks, - sctp.sctp_odchunks, sctp.sctp_oudchunks, - sctp.sctp_rxtchunks, sctp.sctp_T1expire, - sctp.sctp_T2expire, sctp.sctp_T3expire, - sctp.sctp_msgcount, sctp.sctp_prsctpdrop, - sctp.sctp_assoc_start_time); + sctp->sctp_opkts, sctp->sctp_obchunks, + sctp->sctp_odchunks, sctp->sctp_oudchunks, + sctp->sctp_rxtchunks, sctp->sctp_T1expire, + sctp->sctp_T2expire, sctp->sctp_T3expire, + sctp->sctp_msgcount, sctp->sctp_prsctpdrop, + sctp->sctp_assoc_start_time); mdb_printf("ipkts\t\t%?llu\tibchunks\t%?llu\n" "idchunks\t%?llu\tiudchunks\t%?llu\n" "fragdmsgs\t%?llu\treassmsgs\t%?llu\n", - sctp.sctp_ipkts, sctp.sctp_ibchunks, - sctp.sctp_idchunks, sctp.sctp_iudchunks, - sctp.sctp_fragdmsgs, sctp.sctp_reassmsgs); + sctp->sctp_ipkts, sctp->sctp_ibchunks, + sctp->sctp_idchunks, sctp->sctp_iudchunks, + sctp->sctp_fragdmsgs, sctp->sctp_reassmsgs); } if (opts & MDB_SCTP_SHOW_HASH) { mdb_printf("%<b>Hash Tables%</b>\n"); - mdb_printf("conn_hash_next\t%?p\t", sctp.sctp_conn_hash_next); - mdb_printf("conn_hash_prev\t%?p\n", sctp.sctp_conn_hash_prev); + mdb_printf("conn_hash_next\t%?p\t", sctp->sctp_conn_hash_next); + mdb_printf("conn_hash_prev\t%?p\n", sctp->sctp_conn_hash_prev); mdb_printf("listen_hash_next%?p\t", - sctp.sctp_listen_hash_next); + sctp->sctp_listen_hash_next); mdb_printf("listen_hash_prev%?p\n", - sctp.sctp_listen_hash_prev); - mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport)); + sctp->sctp_listen_hash_prev); + mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport)); mdb_printf("[ listen_hash bucket\t%?d ]\n", SCTP_LISTEN_HASH(lport)); - mdb_printf("conn_tfp\t%?p\t", sctp.sctp_conn_tfp); - 
mdb_printf("listen_tfp\t%?p\n", sctp.sctp_listen_tfp); + mdb_printf("conn_tfp\t%?p\t", sctp->sctp_conn_tfp); + mdb_printf("listen_tfp\t%?p\n", sctp->sctp_listen_tfp); mdb_printf("bind_hash\t%?p\tptpbhn\t\t%?p\n", - sctp.sctp_bind_hash, sctp.sctp_ptpbhn); + sctp->sctp_bind_hash, sctp->sctp_ptpbhn); mdb_printf("bind_lockp\t%?p\n", - sctp.sctp_bind_lockp); + sctp->sctp_bind_lockp); mdb_printf("[ bind_hash bucket\t%?d ]\n", SCTP_BIND_HASH(lport)); } @@ -943,8 +950,8 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%<b>Cleanup / Close%</b>\n"); mdb_printf("shutdown_faddr\t%?p\tclient_errno\t%?d\n" "lingertime\t%?d\trefcnt\t\t%?hu\n", - sctp.sctp_shutdown_faddr, sctp.sctp_client_errno, - sctp.sctp_lingertime, sctp.sctp_refcnt); + sctp->sctp_shutdown_faddr, sctp->sctp_client_errno, + connp->conn_lingertime, sctp->sctp_refcnt); } if (opts & MDB_SCTP_SHOW_MISC) { @@ -955,24 +962,25 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "active\t\t%?ld\ttx_adaptation_code%?x\n" "rx_adaptation_code%?x\ttimer_mp\t%?p\n" "partial_delivery_point\t%?d\n", - sctp.sctp_bound_if, sctp.sctp_heartbeat_mp, - sctp.sctp_family, sctp.sctp_ipversion, - sctp.sctp_hb_interval, sctp.sctp_autoclose, - sctp.sctp_active, sctp.sctp_tx_adaptation_code, - sctp.sctp_rx_adaptation_code, sctp.sctp_timer_mp, - sctp.sctp_pd_point); + connp->conn_bound_if, sctp->sctp_heartbeat_mp, + connp->conn_family, + connp->conn_ipversion, + sctp->sctp_hb_interval, sctp->sctp_autoclose, + sctp->sctp_active, sctp->sctp_tx_adaptation_code, + sctp->sctp_rx_adaptation_code, sctp->sctp_timer_mp, + sctp->sctp_pd_point); } if (opts & MDB_SCTP_SHOW_EXT) { mdb_printf("%<b>Extensions and Reliable Ctl Chunks%</b>\n"); mdb_printf("cxmit_list\t%?p\tlcsn\t\t%?x\n" - "fcsn\t\t%?x\n", sctp.sctp_cxmit_list, sctp.sctp_lcsn, - sctp.sctp_fcsn); + "fcsn\t\t%?x\n", sctp->sctp_cxmit_list, sctp->sctp_lcsn, + sctp->sctp_fcsn); } if (opts & MDB_SCTP_SHOW_FLAGS) { 
mdb_printf("%<b>Flags%</b>\n"); - show_sctp_flags(&sctp); + show_sctp_flags(sctp); } return (DCMD_OK); diff --git a/usr/src/common/net/patricia/radix.c b/usr/src/common/net/patricia/radix.c index 9a1d3f78ed..cf2085280f 100644 --- a/usr/src/common/net/patricia/radix.c +++ b/usr/src/common/net/patricia/radix.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1988, 1989, 1993 @@ -367,8 +367,9 @@ rn_match_args(v_arg, head, rn_leaf_fn, rn_leaf_arg) * is looking for some other criteria as well. Continue * looking as if the exact match failed. */ - if (t->rn_parent->rn_flags & RNF_ROOT) { - /* hit the top. have to give up */ + if (t->rn_dupedkey == NULL && + (t->rn_parent->rn_flags & RNF_ROOT)) { + /* no more dupedkeys and hit the top. have to give up */ return (NULL); } b = 0; @@ -486,56 +487,70 @@ rn_insert(v_arg, head, dupentry, nodes) { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; + struct radix_node *p, *x; int head_off = top->rn_offset, vlen = (int)LEN(v); struct radix_node *t = rn_search(v_arg, top); caddr_t cp = v + head_off; int b; struct radix_node *tt; + caddr_t cp2 = t->rn_key + head_off; + int cmp_res; + caddr_t cplim = v + vlen; /* * Find first bit at which v and t->rn_key differ */ - { - caddr_t cp2 = t->rn_key + head_off; - int cmp_res; - caddr_t cplim = v + vlen; - - while (cp < cplim) - if (*cp2++ != *cp++) - goto on1; - *dupentry = 1; - return (t); + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return (t); on1: - *dupentry = 0; - cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; - for (b = (cp - v) << 3; cmp_res; b--) - cmp_res >>= 1; - } - { - struct radix_node *p, *x = top; - cp = v; - do { - p = x; - if (cp[x->rn_offset] & x->rn_bmask) - x = x->rn_right; - else - x = x->rn_left; - } while (b > (unsigned)x->rn_bit); - /* x->rn_bit < b && x->rn_bit >= 0 */ - t = 
rn_newpair(v_arg, b, nodes); - tt = t->rn_left; - if ((cp[p->rn_offset] & p->rn_bmask) == 0) - p->rn_left = t; + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + /* + * (cp - v) is the number of bytes where the match is relevant. + * Multiply by 8 to get number of bits. Then reduce this number + * by the trailing bits in the last byte where we have a match + * by looking at (cmp_res >> 1) in each iteration below. + * Note that v starts at the beginning of the key, so, when key + * is a sockaddr structure, the preliminary len/family/port bytes + * are accounted for. + */ + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + cp = v; + x = top; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; else - p->rn_right = t; - x->rn_parent = t; - t->rn_parent = p; - if ((cp[t->rn_offset] & t->rn_bmask) == 0) { - t->rn_right = x; - } else { - t->rn_right = tt; - t->rn_left = x; - } + x = x->rn_left; + } while (b > (unsigned)x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ + /* + * now the rightmost bit where v and rn_key differ (b) is < + * x->rn_bit. + * + * We will add a new branch at p. b cannot equal x->rn_bit + * because we know we didn't find a duplicated key. + * The tree will be re-adjusted so that t is inserted between p + * and x. + */ + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; } return (tt); } @@ -718,6 +733,8 @@ rn_addroute(v_arg, n_arg, head, treenodes) * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * + * Insert treenodes before tt. + * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. 
* This may require the unfortunate nuisance of relocating @@ -758,22 +775,54 @@ rn_addroute(v_arg, n_arg, head, treenodes) tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } + /* BEGIN CSTYLED */ + /* + * at this point the parent-child relationship for p, t, x, tt is + * one of the following: + * p p + * : (left/right child) : + * : : + * t t + * / \ / \ + * x tt tt x + * + * tt == saved_tt returned by rn_insert(). + */ + /* END CSTYLED */ t = saved_tt->rn_parent; if (keyduplicated) goto key_exists; b_leaf = -1 - t->rn_bit; + /* + * b_leaf is now normalized to be in the leaf rn_bit format + * (it is the rn_bit value of a leaf corresponding to netmask + * of t->rn_bit). + */ if (t->rn_right == saved_tt) x = t->rn_left; else x = t->rn_right; - /* Promote general routes from below */ + /* + * Promote general routes from below. + * Identify the less specific netmasks and add them to t->rm_mklist + */ if (x->rn_bit < 0) { - for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) - if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { - *mp = m = rn_new_radix_mask(x, 0); - if (m) - mp = &m->rm_mklist; - } + /* x is the sibling node. it is a leaf node. */ + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && + x->rn_mklist == 0) { + /* + * x is the first node in the dupedkey chain + * without a mklist, and with a shorter mask + * than b_leaf. Create a radix_mask + * corresponding to x's mask and add it to + * t's rn_mklist. The mask list gets created + * in decreasing order of mask length. 
+ */ + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } } else if (x->rn_mklist) { /* * Skip over masks whose index is > that of new node @@ -788,6 +837,7 @@ key_exists: if ((netmask == 0) || (b > t->rn_bit)) return (tt); /* can't lift at all */ b_leaf = tt->rn_bit; + /* b is the index of the netmask */ do { x = t; t = t->rn_parent; diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml index e988200bde..0225a51dc7 100644 --- a/usr/src/lib/brand/native/zone/platform.xml +++ b/usr/src/lib/brand/native/zone/platform.xml @@ -106,7 +106,6 @@ <device match="ipsecesp" ip-type="exclusive" /> <device match="ipstate" ip-type="exclusive" /> <device match="ipsync" ip-type="exclusive" /> - <device match="iptunq" ip-type="exclusive" /> <device match="keysock" ip-type="exclusive" /> <device match="rawip" ip-type="exclusive" /> <device match="rawip6" ip-type="exclusive" /> @@ -117,6 +116,7 @@ <device match="spdsock" ip-type="exclusive" /> <device match="sppp" ip-type="exclusive" /> <device match="sppptun" ip-type="exclusive" /> + <device match="vni" ip-type="exclusive" /> <!-- Renamed devices to create under /dev --> <device match="zcons/%z/zoneconsole" name="zconsole" /> diff --git a/usr/src/lib/brand/solaris10/zone/platform.xml b/usr/src/lib/brand/solaris10/zone/platform.xml index fa396ec222..89f7035615 100644 --- a/usr/src/lib/brand/solaris10/zone/platform.xml +++ b/usr/src/lib/brand/solaris10/zone/platform.xml @@ -123,7 +123,6 @@ <device match="ipsecesp" ip-type="exclusive" /> <device match="ipstate" ip-type="exclusive" /> <device match="ipsync" ip-type="exclusive" /> - <device match="iptunq" ip-type="exclusive" /> <device match="keysock" ip-type="exclusive" /> <device match="rawip" ip-type="exclusive" /> <device match="rawip6" ip-type="exclusive" /> @@ -134,6 +133,7 @@ <device match="spdsock" ip-type="exclusive" /> <device match="sppp" ip-type="exclusive" /> <device match="sppptun" ip-type="exclusive" /> + 
<device match="vni" ip-type="exclusive" /> <!-- Renamed devices to create under /dev --> <device match="zcons/%z/zoneconsole" name="zconsole" /> diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com index 30679b7037..86489c1422 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_com +++ b/usr/src/pkgdefs/SUNWckr/prototype_com @@ -92,7 +92,6 @@ f none kernel/drv/ippctl.conf 644 root sys f none kernel/drv/ipsecah.conf 644 root sys f none kernel/drv/ipsecesp.conf 644 root sys f none kernel/drv/iptun.conf 644 root sys -f none kernel/drv/iptunq.conf 644 root sys f none kernel/drv/iwscn.conf 644 root sys f none kernel/drv/keysock.conf 644 root sys f none kernel/drv/kmdb.conf 644 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index 2a6676197e..5f886a8d60 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -103,7 +103,6 @@ f none kernel/drv/ippctl 755 root sys f none kernel/drv/ipsecah 755 root sys f none kernel/drv/ipsecesp 755 root sys f none kernel/drv/iptun 755 root sys -f none kernel/drv/iptunq 755 root sys f none kernel/drv/iwscn 755 root sys f none kernel/drv/kb8042 755 root sys f none kernel/drv/keysock 755 root sys @@ -326,7 +325,6 @@ f none kernel/drv/amd64/ippctl 755 root sys f none kernel/drv/amd64/ipsecah 755 root sys f none kernel/drv/amd64/ipsecesp 755 root sys f none kernel/drv/amd64/iptun 755 root sys -f none kernel/drv/amd64/iptunq 755 root sys f none kernel/drv/amd64/iwscn 755 root sys f none kernel/drv/amd64/kb8042 755 root sys f none kernel/drv/amd64/keysock 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index e086c94862..c2824f989c 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -94,7 +94,6 @@ f none kernel/drv/sparcv9/ippctl 755 root sys f none kernel/drv/sparcv9/ipsecah 755 root sys f none 
kernel/drv/sparcv9/ipsecesp 755 root sys f none kernel/drv/sparcv9/iptun 755 root sys -f none kernel/drv/sparcv9/iptunq 755 root sys f none kernel/drv/sparcv9/isp 755 root sys f none kernel/drv/sparcv9/iwscn 755 root sys f none kernel/drv/sparcv9/kb8042 755 root sys diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 3129ef6be5..e3bfe3f348 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -242,6 +242,7 @@ d none usr/include/inet 755 root bin f none usr/include/inet/arp.h 644 root bin f none usr/include/inet/common.h 644 root bin f none usr/include/inet/ip.h 644 root bin +f none usr/include/inet/ip_arp.h 644 root bin f none usr/include/inet/ip_if.h 644 root bin f none usr/include/inet/ip_ire.h 644 root bin f none usr/include/inet/ip_ftable.h 644 root bin diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index 09514a0ecc..ee760eba55 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -365,7 +365,6 @@ usr/lib/amd64/llib-like.ln i386 usr/lib/amd64/libipsecutil.so i386 usr/lib/amd64/llib-lipsecutil.ln i386 # -usr/include/inet/arp_impl.h i386 usr/include/inet/rawip_impl.h i386 usr/include/inet/udp_impl.h i386 usr/include/inet/tcp_impl.h i386 diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc index 5a32c55a05..533552b058 100644 --- a/usr/src/pkgdefs/etc/exception_list_sparc +++ b/usr/src/pkgdefs/etc/exception_list_sparc @@ -354,7 +354,6 @@ usr/share/lib/locale/com/sun/dhcpmgr/cli/dhcpconfig/ResourceBundle.properties sp usr/share/lib/locale/com/sun/dhcpmgr/cli/dhtadm/ResourceBundle.properties sparc usr/share/lib/locale/com/sun/dhcpmgr/cli/pntadm/ResourceBundle.properties sparc # -usr/include/inet/arp_impl.h sparc usr/include/inet/rawip_impl.h sparc usr/include/inet/udp_impl.h sparc usr/include/inet/tcp_impl.h sparc diff --git 
a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index be820004e4..e4e9a36ab2 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -8010,6 +8010,12 @@ mondo_loop() { rm -f $root/kernel/strmod/sparcv9/tun rm -f $root/kernel/strmod/amd64/tun + # Remove obsolete iptunq + rm -f $root/kernel/drv/iptunq + rm -f $root/kernel/drv/iptunq.conf + rm -f $root/kernel/drv/amd64/iptunq + rm -f $root/kernel/drv/sparcv9/iptunq + # # Remove libtopo platform XML files that have been replaced by propmap # files. diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 042685bc5a..550606f39c 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -514,7 +514,7 @@ TOKENMT_OBJS += tokenmt.o tokenmtddi.o TSWTCL_OBJS += tswtcl.o tswtclddi.o -ARP_OBJS += arpddi.o arp.o arp_netinfo.o +ARP_OBJS += arpddi.o ICMP_OBJS += icmpddi.o @@ -535,13 +535,15 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o -IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ - ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ - ip_multi.o ip2mac.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ +IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ + ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \ - ip_helper_stream.o iptunq.o \ + ip_helper_stream.o \ + ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ + conn_opt.o ip_attr.o ip_dce.o \ $(IP_ICMP_OBJS) \ $(IP_RTS_OBJS) \ $(IP_TCP_OBJS) \ @@ -644,8 +646,6 @@ MAC_IB_OBJS += mac_ib.o IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o -IPTUNQ_OBJS += iptunq_ddi.o - AGGR_OBJS += 
aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ aggr_send.o aggr_recv.o aggr_lacp.o diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h index f3ffe456f1..fac10a8935 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.h +++ b/usr/src/uts/common/fs/sockfs/sockcommon.h @@ -184,8 +184,7 @@ extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *, extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t); extern void so_process_new_message(struct sonode *, mblk_t *, mblk_t *); -extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *, - cred_t *); +extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *); extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *); extern boolean_t somsghasdata(mblk_t *); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 48a3e37921..4521fdd352 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -470,8 +470,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, so->so_proto_props.sopp_maxpsz, so->so_proto_props.sopp_wroff, so->so_proto_props.sopp_maxblk, - so->so_proto_props.sopp_tail, &error, - cr)) == NULL) { + so->so_proto_props.sopp_tail, &error)) == NULL) { break; } ASSERT(uiop->uio_resid >= 0); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index a244c65bc6..9b806d0a4a 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -471,7 +471,7 @@ socket_sendsig(struct sonode *so, int event) /* Copy userdata into a new mblk_t */ mblk_t * socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk, - size_t tail_len, int *errorp, cred_t *cr) + size_t tail_len, int *errorp) { mblk_t *head = NULL, **tail = &head; @@ -499,11 +499,7 @@ socopyinuio(uio_t *uiop, 
ssize_t iosize, size_t wroff, ssize_t maxblk, blocksize = MIN(iosize, maxblk); ASSERT(blocksize >= 0); - if (is_system_labeled()) - mp = allocb_cred(wroff + blocksize + tail_len, - cr, curproc->p_pid); - else - mp = allocb(wroff + blocksize + tail_len, BPRI_MED); + mp = allocb(wroff + blocksize + tail_len, BPRI_MED); if (mp == NULL) { *errorp = ENOMEM; return (head); diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index b2a178fbcb..bfbd67ad81 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -5506,205 +5506,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, so_lock_single(so); /* Set SOLOCKED */ mutex_exit(&so->so_lock); - /* - * For SOCKET or TCP level options, try to set it here itself - * provided socket has not been popped and we know the tcp - * structure (stored in so_priv). - */ - if ((level == SOL_SOCKET || level == IPPROTO_TCP) && - (so->so_family == AF_INET || so->so_family == AF_INET6) && - (so->so_version == SOV_SOCKSTREAM) && - (so->so_proto_handle != NULL)) { - tcp_t *tcp = (tcp_t *)so->so_proto_handle; - boolean_t onoff; - -#define intvalue (*(int32_t *)optval) - - switch (level) { - case SOL_SOCKET: - switch (option_name) { /* Check length param */ - case SO_DEBUG: - case SO_REUSEADDR: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_DGRAM_ERRIND: - if (optlen != (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - onoff = intvalue != 0; - handled = B_TRUE; - break; - case SO_SNDTIMEO: - case SO_RCVTIMEO: - if (get_udatamodel() == DATAMODEL_NONE || - get_udatamodel() == DATAMODEL_NATIVE) { - if (optlen != - sizeof (struct timeval)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - } else { - if (optlen != - sizeof (struct timeval32)) { - error = EINVAL; - 
eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - } - ASSERT(optval); - handled = B_TRUE; - break; - case SO_LINGER: - if (optlen != - (t_uscalar_t)sizeof (struct linger)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - handled = B_TRUE; - break; - } - - switch (option_name) { /* Do actions */ - case SO_LINGER: { - struct linger *lgr = (struct linger *)optval; - - if (lgr->l_onoff) { - tcp->tcp_linger = 1; - tcp->tcp_lingertime = lgr->l_linger; - so->so_linger.l_onoff = SO_LINGER; - so->so_options |= SO_LINGER; - } else { - tcp->tcp_linger = 0; - tcp->tcp_lingertime = 0; - so->so_linger.l_onoff = 0; - so->so_options &= ~SO_LINGER; - } - so->so_linger.l_linger = lgr->l_linger; - handled = B_TRUE; - break; - } - case SO_SNDTIMEO: - case SO_RCVTIMEO: { - struct timeval tl; - clock_t val; - - if (get_udatamodel() == DATAMODEL_NONE || - get_udatamodel() == DATAMODEL_NATIVE) - bcopy(&tl, (struct timeval *)optval, - sizeof (struct timeval)); - else - TIMEVAL32_TO_TIMEVAL(&tl, - (struct timeval32 *)optval); - val = tl.tv_sec * 1000 * 1000 + tl.tv_usec; - if (option_name == SO_RCVTIMEO) - so->so_rcvtimeo = drv_usectohz(val); - else - so->so_sndtimeo = drv_usectohz(val); - break; - } - - case SO_DEBUG: - tcp->tcp_debug = onoff; -#ifdef SOCK_TEST - if (intvalue & 2) - sock_test_timelimit = 10 * hz; - else - sock_test_timelimit = 0; - - if (intvalue & 4) - do_useracc = 0; - else - do_useracc = 1; -#endif /* SOCK_TEST */ - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK and - * SO_BROADCAST are only of interest to IP. - * We track them here only so - * that we can report their current value. 
- */ - tcp->tcp_dontroute = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_USELOOPBACK: - tcp->tcp_useloopback = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_BROADCAST: - tcp->tcp_broadcast = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_REUSEADDR: - tcp->tcp_reuseaddr = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_OOBINLINE: - tcp->tcp_oobinline = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_DGRAM_ERRIND: - tcp->tcp_dgram_errind = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - } - break; - case IPPROTO_TCP: - switch (option_name) { - case TCP_NODELAY: - if (optlen != (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - tcp->tcp_naglim = intvalue ? 
1 : tcp->tcp_mss; - handled = B_TRUE; - break; - } - break; - default: - handled = B_FALSE; - break; - } - } - - if (handled) { - mutex_enter(&so->so_lock); - goto done2; - } - optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ; optmgmt_req.MGMT_flags = T_NEGOTIATE; optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen; diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile index 052c010aea..3d45e4861c 100644 --- a/usr/src/uts/common/inet/Makefile +++ b/usr/src/uts/common/inet/Makefile @@ -28,12 +28,12 @@ # include global definitions include ../../../Makefile.master -HDRS= arp.h arp_impl.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ +HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ ipsecah.h ipsecesp.h ipsec_info.h iptun.h ip6_asp.h ip_if.h ip_ire.h \ ip_multi.h ip_netinfo.h ip_ndp.h ip_rts.h ipsec_impl.h keysock.h \ led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \ tcp_sack.h tcp_stack.h udp_impl.h rawip_impl.h ipp_common.h \ - ip_ftable.h ip_impl.h ip_stack.h tcp_impl.h wifi_ioctl.h \ + ip_ftable.h ip_impl.h ip_stack.h ip_arp.h tcp_impl.h wifi_ioctl.h \ ip2mac.h ip2mac_impl.h ROOTDIRS= $(ROOT)/usr/include/inet diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index 4351c91666..de0602e1f7 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -28,7 +28,6 @@ #define _INET_ARP_H #include <sys/types.h> -#include <net/if.h> #ifdef __cplusplus extern "C" { @@ -45,30 +44,7 @@ extern "C" { #define RARP_REQUEST 3 #define RARP_RESPONSE 4 -#define AR_IOCTL (((unsigned)'A' & 0xFF)<<8) -#define CMD_IN_PROGRESS 0x10000 - -#define AR_ENTRY_ADD (AR_IOCTL + 1) -#define AR_ENTRY_DELETE (AR_IOCTL + 2) -#define AR_ENTRY_QUERY (AR_IOCTL + 3) -#define AR_ENTRY_SQUERY (AR_IOCTL + 6) -#define AR_MAPPING_ADD (AR_IOCTL + 7) -#define AR_CLIENT_NOTIFY (AR_IOCTL + 8) -#define AR_INTERFACE_UP (AR_IOCTL + 9) -#define AR_INTERFACE_DOWN (AR_IOCTL + 10) -#define AR_INTERFACE_ON 
(AR_IOCTL + 12) -#define AR_INTERFACE_OFF (AR_IOCTL + 13) -#define AR_DLPIOP_DONE (AR_IOCTL + 14) -/* - * This is not an ARP command per se, it is used to interface between - * ARP and IP during close. - */ -#define AR_ARP_CLOSING (AR_IOCTL + 16) -#define AR_ARP_EXTEND (AR_IOCTL + 17) -#define AR_IPMP_ACTIVATE (AR_IOCTL + 18) -#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19) - -/* Both ace_flags and area_flags; must also modify arp.c in mdb */ +/* Both ace_flags; must also modify arp.c in mdb */ #define ACE_F_PERMANENT 0x0001 #define ACE_F_PUBLISH 0x0002 #define ACE_F_DYING 0x0004 @@ -84,123 +60,6 @@ extern "C" { #define ACE_F_DELAYED 0x0800 /* rescheduled on arp_defend_rate */ #define ACE_F_DAD_ABORTED 0x1000 /* DAD was aborted on link down */ -/* ared_flags */ -#define ARED_F_PRESERVE_PERM 0x0001 /* preserve permanent ace */ - -/* ARP Command Structures */ - -/* arc_t - Common command overlay */ -typedef struct ar_cmd_s { - uint32_t arc_cmd; - uint32_t arc_name_offset; - uint32_t arc_name_length; -} arc_t; - -/* - * NOTE: when using area_t for an AR_ENTRY_SQUERY, the area_hw_addr_offset - * field isn't what you might think. See comments in ip_multi.c where - * the routine ill_create_squery() is called, and also in the routine - * itself, to see how this field is used *only* when the area_t holds - * an AR_ENTRY_SQUERY. 
- */ -typedef struct ar_entry_add_s { - uint32_t area_cmd; - uint32_t area_name_offset; - uint32_t area_name_length; - uint32_t area_proto; - uint32_t area_proto_addr_offset; - uint32_t area_proto_addr_length; - uint32_t area_proto_mask_offset; - uint32_t area_flags; /* Same values as ace_flags */ - uint32_t area_hw_addr_offset; - uint32_t area_hw_addr_length; -} area_t; - -typedef struct ar_entry_delete_s { - uint32_t ared_cmd; - uint32_t ared_name_offset; - uint32_t ared_name_length; - uint32_t ared_proto; - uint32_t ared_proto_addr_offset; - uint32_t ared_proto_addr_length; - uint32_t ared_flags; -} ared_t; - -typedef struct ar_entry_query_s { - uint32_t areq_cmd; - uint32_t areq_name_offset; - uint32_t areq_name_length; - uint32_t areq_proto; - uint32_t areq_target_addr_offset; - uint32_t areq_target_addr_length; - uint32_t areq_flags; - uint32_t areq_sender_addr_offset; - uint32_t areq_sender_addr_length; - uint32_t areq_xmit_count; /* 0 ==> cache lookup only */ - uint32_t areq_xmit_interval; /* # of milliseconds; 0: default */ - /* # ofquests to buffer; 0: default */ - uint32_t areq_max_buffered; - uchar_t areq_sap[8]; /* to insert in returned template */ -} areq_t; - -#define AR_EQ_DEFAULT_XMIT_COUNT 6 -#define AR_EQ_DEFAULT_XMIT_INTERVAL 1000 -#define AR_EQ_DEFAULT_MAX_BUFFERED 4 - -/* - * Structure used with AR_ENTRY_LLAQUERY to map from the link_addr - * (in Neighbor Discovery option format excluding the option type and - * length) to a hardware address. - * The response has the same format as for an AR_ENTRY_SQUERY - an M_CTL with - * arel_hw_addr updated. - * An IPv6 address will be passed in AR_ENTRY_LLAQUERY so that atmip - * can send it in AR_CLIENT_NOTIFY messages. 
- */ -typedef struct ar_entry_llaquery_s { - uint32_t arel_cmd; - uint32_t arel_name_offset; - uint32_t arel_name_length; - uint32_t arel_link_addr_offset; - uint32_t arel_link_addr_length; - uint32_t arel_hw_addr_offset; - uint32_t arel_hw_addr_length; - uint32_t arel_ip_addr_offset; - uint32_t arel_ip_addr_length; -} arel_t; - -typedef struct ar_mapping_add_s { - uint32_t arma_cmd; - uint32_t arma_name_offset; - uint32_t arma_name_length; - uint32_t arma_proto; - uint32_t arma_proto_addr_offset; - uint32_t arma_proto_addr_length; - uint32_t arma_proto_mask_offset; - uint32_t arma_proto_extract_mask_offset; - uint32_t arma_flags; - uint32_t arma_hw_addr_offset; - uint32_t arma_hw_addr_length; - /* Offset were we start placing */ - uint32_t arma_hw_mapping_start; - /* the mask&proto_addr */ -} arma_t; - -/* Structure used to notify ARP of changes to IPMP group topology */ -typedef struct ar_ipmp_event_s { - uint32_t arie_cmd; - uint32_t arie_name_offset; - uint32_t arie_name_length; - char arie_grifname[LIFNAMSIZ]; -} arie_t; - -/* Structure used to notify clients of interesting conditions. */ -typedef struct ar_client_notify_s { - uint32_t arcn_cmd; - uint32_t arcn_name_offset; - uint32_t arcn_name_length; - uint32_t arcn_code; /* Notification code. */ -} arcn_t; - /* Client Notification Codes */ #define AR_CN_BOGON 1 #define AR_CN_ANNOUNCE 2 diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c deleted file mode 100644 index abdbc39a47..0000000000 --- a/usr/src/uts/common/inet/arp/arp.c +++ /dev/null @@ -1,4883 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright (c) 1990 Mentat Inc. */ - -/* AR - Address Resolution Protocol */ - -#include <sys/types.h> -#include <sys/stream.h> -#include <sys/stropts.h> -#include <sys/strsubr.h> -#include <sys/errno.h> -#include <sys/strlog.h> -#include <sys/dlpi.h> -#include <sys/sockio.h> -#define _SUN_TPI_VERSION 2 -#include <sys/tihdr.h> -#include <sys/socket.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cmn_err.h> -#include <sys/sdt.h> -#include <sys/vtrace.h> -#include <sys/strsun.h> -#include <sys/policy.h> -#include <sys/zone.h> -#include <sys/ethernet.h> -#include <sys/zone.h> -#include <sys/random.h> -#include <sys/sdt.h> -#include <sys/hook_event.h> - -#include <inet/common.h> -#include <inet/optcom.h> -#include <inet/mi.h> -#include <inet/nd.h> -#include <inet/snmpcom.h> -#include <net/if.h> -#include <inet/arp.h> -#include <netinet/ip6.h> -#include <netinet/arp.h> -#include <inet/ip.h> -#include <inet/ip_ire.h> -#include <inet/ip_ndp.h> -#include <inet/mib2.h> -#include <inet/arp_impl.h> - -/* - * ARP entry life time and design notes - * ------------------------------------ - * - * ARP entries (ACEs) must last at least as long as IP knows about a given - * MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok - * if the ARP entry lasts longer, but not ok if it is removed before the IP - * entry. 
The reason for this is that if ARP doesn't have an entry, we will be - * unable to detect the difference between an ARP broadcast that represents no - * change (same, known address of sender) and one that represents a change (new - * address for existing entry). In the former case, we must not notify IP, or - * we can suffer hurricane attack. In the latter case, we must notify IP, or - * IP will drift out of sync with the network. - * - * Note that IP controls the lifetime of entries, not ARP. - * - * We don't attempt to reconfirm aging entries. If the system is no longer - * talking to a given peer, then it doesn't matter if we have the right mapping - * for that peer. It would be possible to send queries on aging entries that - * are active, but this isn't done. - * - * IPMP Notes - * ---------- - * - * ARP is aware of IPMP. In particular, IP notifies ARP about all "active" - * (able to transmit data packets) interfaces in a given group via - * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined - * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver, - * enable ARP to track all the arl_t's that are in the same group and thus - * ensure that ACEs are shared across each group and the arl_t that ARP - * chooses to transmit on for a given ACE is optimal. - * - * ARP relies on IP for hardware address updates. In particular, if the - * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will - * bring the interface down and back up -- and as part of bringing it back - * up, will send messages to ARP that allow it to update the affected arl's - * with new hardware addresses. - * - * N.B.: One side-effect of this approach is that when an interface fails and - * then starts to repair, it will temporarily populate the ARP cache with - * addresses that are owned by it rather than the group's arl_t. 
To address - * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE), - * but as the issue appears to be only cosmetic (redundant entries in the ARP - * cache during interace repair), we've kept things simple for now. - */ - -/* - * This is used when scanning for "old" (least recently broadcast) ACEs. We - * don't want to have to walk the list for every single one, so we gather up - * batches at a time. - */ -#define ACE_RESCHED_LIST_LEN 8 - -typedef struct { - arl_t *art_arl; - uint_t art_naces; - ace_t *art_aces[ACE_RESCHED_LIST_LEN]; -} ace_resched_t; - -#define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED) -#define ACE_NONPERM(ace) \ - (((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \ - ACE_F_RESOLVED) - -#define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */ -#define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */ - -#define AR_SNMP_MSG T_OPTMGMT_ACK -#define AR_DRAINING (void *)0x11 - -/* - * The IPv4 Link Local address space is special; we do extra duplicate checking - * there, as the entire assignment mechanism rests on random numbers. - */ -#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ - ((uchar_t *)ptr)[1] == 254) - -/* - * Check if the command needs to be enqueued by seeing if there are other - * commands ahead of us or if some DLPI response is being awaited. Usually - * there would be an enqueued command in the latter case, however if the - * stream that originated the command has closed, the close would have - * cleaned up the enqueued command. AR_DRAINING signifies that the command - * at the head of the arl_queue has been internally dequeued on completion - * of the previous command and is being called from ar_dlpi_done - */ -#define CMD_NEEDS_QUEUEING(mp, arl) \ - (mp->b_prev != AR_DRAINING && (arl->arl_queue != NULL || \ - arl->arl_dlpi_pending != DL_PRIM_INVAL)) - -#define ARH_FIXED_LEN 8 - -/* - * Macro used when creating ACEs to determine the arl that should own it. 
- */ -#define OWNING_ARL(arl) \ - ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl) - -/* - * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK - * doesn't quite do it for us. - */ -typedef struct ar_m_s { - t_uscalar_t ar_mac_type; - uint32_t ar_mac_arp_hw_type; - t_scalar_t ar_mac_sap_length; - uint32_t ar_mac_hw_addr_length; -} ar_m_t; - -typedef struct msg2_args { - mblk_t *m2a_mpdata; - mblk_t *m2a_mptail; -} msg2_args_t; - -static mblk_t *ar_alloc(uint32_t cmd, int); -static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, - uint32_t hw_addr_len, uchar_t *proto_addr, - uint32_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint32_t hw_extract_start, - uchar_t *sender_addr, uint32_t flags); -static void ar_ce_delete(ace_t *ace); -static void ar_ce_delete_per_arl(ace_t *ace, void *arg); -static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, - ace_t *matchfn()); -static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, - uchar_t *proto_addr, uint32_t proto_addr_length); -static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, - uint32_t hw_addr_length); -static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), - void *arg1); - -static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code); -static int ar_close(queue_t *q); -static int ar_cmd_dispatch(queue_t *q, mblk_t *mp, boolean_t from_wput); -static void ar_cmd_drain(arl_t *arl); -static void ar_cmd_done(arl_t *arl); -static 
mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size); -static void ar_dlpi_send(arl_t *, mblk_t *); -static void ar_dlpi_done(arl_t *, t_uscalar_t); -static int ar_entry_add(queue_t *q, mblk_t *mp); -static int ar_entry_delete(queue_t *q, mblk_t *mp); -static int ar_entry_query(queue_t *q, mblk_t *mp); -static int ar_entry_squery(queue_t *q, mblk_t *mp); -static int ar_interface_up(queue_t *q, mblk_t *mp); -static int ar_interface_down(queue_t *q, mblk_t *mp); -static int ar_interface_on(queue_t *q, mblk_t *mp); -static int ar_interface_off(queue_t *q, mblk_t *mp); -static int ar_ipmp_activate(queue_t *q, mblk_t *mp); -static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp); -static void ar_ll_cleanup_arl_queue(queue_t *q); -static void ar_ll_down(arl_t *arl); -static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name); -static arl_t *ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp); -static void ar_ll_init(arp_stack_t *, ar_t *, mblk_t *mp); -static void ar_ll_set_defaults(arl_t *, mblk_t *mp); -static void ar_ll_clear_defaults(arl_t *); -static int ar_ll_up(arl_t *arl); -static int ar_mapping_add(queue_t *q, mblk_t *mp); -static boolean_t ar_mask_all_ones(uchar_t *mask, uint32_t mask_len); -static ar_m_t *ar_m_lookup(t_uscalar_t mac_type); -static int ar_nd_ioctl(queue_t *q, mblk_t *mp); -static int ar_open(queue_t *q, dev_t *devp, int flag, int sflag, - cred_t *credp); -static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static boolean_t ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt); -static int ar_param_set(queue_t *q, mblk_t *mp, char *value, - caddr_t cp, cred_t *cr); -static void ar_query_delete(ace_t *ace, void *ar); -static void ar_query_reply(ace_t *ace, int ret_val, - uchar_t *proto_addr, uint32_t proto_addr_len); -static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace); -static void ar_rput(queue_t *q, mblk_t *mp_orig); -static void ar_rput_dlpi(queue_t *q, mblk_t *mp); -static void ar_set_address(ace_t *ace, 
uchar_t *addrpos, - uchar_t *proto_addr, uint32_t proto_addr_len); -static int ar_slifname(queue_t *q, mblk_t *mp); -static int ar_set_ppa(queue_t *q, mblk_t *mp); -static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig); -static void ar_snmp_msg2(ace_t *, void *); -static void ar_wput(queue_t *q, mblk_t *mp); -static void ar_wsrv(queue_t *q); -static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, - uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1, - const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr, - arp_stack_t *as); -static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, - ushort_t cmd, boolean_t); -static mblk_t *ar_cmd_dequeue(arl_t *arl); - -static void *arp_stack_init(netstackid_t stackid, netstack_t *ns); -static void arp_stack_fini(netstackid_t stackid, void *arg); -static void arp_stack_shutdown(netstackid_t stackid, void *arg); - -boolean_t arp_no_defense = B_FALSE; - -/* - * All of these are alterable, within the min/max values given, - * at run time. arp_publish_interval and arp_publish_count are - * set by default to 2 seconds and 5 respectively. This is - * useful during FAILOVER/FAILBACK to make sure that the ARP - * packets are not lost. Assumed that it does not affect the - * normal operations. 
- */ -static arpparam_t arp_param_arr[] = { - /* min max value name */ - { 30000, 3600000, 300000, "arp_cleanup_interval"}, - { 1000, 20000, 2000, "arp_publish_interval"}, - { 1, 20, 5, "arp_publish_count"}, - { 0, 20000, 1000, "arp_probe_delay"}, - { 10, 20000, 1500, "arp_probe_interval"}, - { 0, 20, 3, "arp_probe_count"}, - { 0, 20000, 100, "arp_fastprobe_delay"}, - { 10, 20000, 150, "arp_fastprobe_interval"}, - { 0, 20, 3, "arp_fastprobe_count"}, - { 0, 3600000, 300000, "arp_defend_interval"}, - { 0, 20000, 100, "arp_defend_rate"}, - { 0, 3600000, 15000, "arp_broadcast_interval"}, - { 5, 86400, 3600, "arp_defend_period"} -}; -#define as_cleanup_interval as_param_arr[0].arp_param_value -#define as_publish_interval as_param_arr[1].arp_param_value -#define as_publish_count as_param_arr[2].arp_param_value -#define as_probe_delay as_param_arr[3].arp_param_value -#define as_probe_interval as_param_arr[4].arp_param_value -#define as_probe_count as_param_arr[5].arp_param_value -#define as_fastprobe_delay as_param_arr[6].arp_param_value -#define as_fastprobe_interval as_param_arr[7].arp_param_value -#define as_fastprobe_count as_param_arr[8].arp_param_value -#define as_defend_interval as_param_arr[9].arp_param_value -#define as_defend_rate as_param_arr[10].arp_param_value -#define as_broadcast_interval as_param_arr[11].arp_param_value -#define as_defend_period as_param_arr[12].arp_param_value - -static struct module_info arp_mod_info = { - 0, "arp", 0, INFPSZ, 512, 128 -}; - -static struct qinit arprinit = { - (pfi_t)ar_rput, NULL, ar_open, ar_close, NULL, &arp_mod_info -}; - -static struct qinit arpwinit = { - (pfi_t)ar_wput, (pfi_t)ar_wsrv, ar_open, ar_close, NULL, &arp_mod_info -}; - -struct streamtab arpinfo = { - &arprinit, &arpwinit -}; - -/* - * TODO: we need a better mechanism to set the ARP hardware type since - * the DLPI mac type does not include enough predefined values. 
- */ -static ar_m_t ar_m_tbl[] = { - { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ - { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ - { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ - { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ - { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */ - { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ - { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ - { DL_OTHER, ARPHRD_ETHER, -2, 6}, /* unknown */ -}; - -/* - * Note that all routines which need to queue the message for later - * processing have to be ioctl_aware to be able to queue the complete message. - * Following are command entry flags in arct_flags - */ -#define ARF_IOCTL_AWARE 0x1 /* Arp command can come down as M_IOCTL */ -#define ARF_ONLY_CMD 0x2 /* Command is exclusive to ARP */ -#define ARF_WPUT_OK 0x4 /* Command is allowed from ar_wput */ - -/* ARP Cmd Table entry */ -typedef struct arct_s { - int (*arct_pfi)(queue_t *, mblk_t *); - uint32_t arct_cmd; - int arct_min_len; - uint32_t arct_flags; - int arct_priv_req; /* Privilege required for this cmd */ - const char *arct_txt; -} arct_t; - -/* - * AR_ENTRY_ADD, QUERY and SQUERY are used by sdp, hence they need to - * have ARF_WPUT_OK set. 
- */ -static arct_t ar_cmd_tbl[] = { - { ar_entry_add, AR_ENTRY_ADD, sizeof (area_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_CONFIG, - "AR_ENTRY_ADD" }, - { ar_entry_delete, AR_ENTRY_DELETE, sizeof (ared_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_DELETE" }, - { ar_entry_query, AR_ENTRY_QUERY, sizeof (areq_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP, - "AR_ENTRY_QUERY" }, - { ar_entry_squery, AR_ENTRY_SQUERY, sizeof (area_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP, - "AR_ENTRY_SQUERY" }, - { ar_mapping_add, AR_MAPPING_ADD, sizeof (arma_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" }, - { ar_interface_up, AR_INTERFACE_UP, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_UP" }, - { ar_interface_down, AR_INTERFACE_DOWN, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_DOWN" }, - { ar_interface_on, AR_INTERFACE_ON, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" }, - { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" }, - { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" }, - { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" }, - { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int), - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" }, - { ar_nd_ioctl, ND_GET, 1, - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_NP, "ND_GET" }, - { ar_nd_ioctl, ND_SET, 1, - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "ND_SET" }, - { ar_snmp_msg, AR_SNMP_MSG, sizeof (struct T_optmgmt_ack), - ARF_IOCTL_AWARE | ARF_WPUT_OK | ARF_ONLY_CMD, OP_NP, - "AR_SNMP_MSG" }, - { ar_slifname, (uint32_t)SIOCSLIFNAME, sizeof (struct lifreq), - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "SIOCSLIFNAME" } -}; - -/* - * Lookup and return an arl appropriate for sending packets with either source - * hardware address `hw_addr' or source protocol address `ip_addr', 
in that - * order. If neither was specified or neither match, return any arl in the - * same group as `arl'. - */ -static arl_t * -ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen, - uchar_t *ip_addr) -{ - arlphy_t *ap; - ace_t *src_ace; - arl_t *xmit_arl = NULL; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - ASSERT(arl->arl_flags & ARL_F_IPMP); - - if (hw_addr != NULL && hw_addrlen != 0) { - xmit_arl = as->as_arl_head; - for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) { - /* - * There may be arls with the same HW address that are - * not in our IPMP group; we don't want those. - */ - if (xmit_arl->arl_ipmp_arl != arl) - continue; - - ap = xmit_arl->arl_phy; - if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen && - bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0) - break; - } - - DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *, - xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen); - } - - if (xmit_arl == NULL && ip_addr != NULL) { - src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr, - IP_ADDR_LEN); - if (src_ace != NULL) - xmit_arl = src_ace->ace_xmit_arl; - - DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *, - xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN); - } - - if (xmit_arl == NULL) { - xmit_arl = as->as_arl_head; - for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) - if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl) - break; - - DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl); - } - - return (xmit_arl); -} - -/* - * ARP Cache Entry creation routine. - * Cache entries are allocated within timer messages and inserted into - * the global hash list based on protocol and protocol address. 
- */ -static int -ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, - uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr, - uint_t flags) -{ - static ace_t ace_null; - ace_t *ace; - ace_t **acep; - uchar_t *dst; - mblk_t *mp; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - arl_t *xmit_arl; - arlphy_t *ap; - - if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) - return (EINVAL); - - if (proto_addr == NULL || proto_addr_len == 0 || - (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) - return (EINVAL); - - if (flags & ACE_F_MYADDR) - flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; - - /* - * Latch a transmit arl for this ace. - */ - if (arl->arl_flags & ARL_F_IPMP) { - ASSERT(proto == IP_ARP_PROTO_TYPE); - xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len, - sender_addr); - } else { - xmit_arl = arl; - } - - if (xmit_arl == NULL || xmit_arl->arl_phy == NULL) - return (EINVAL); - - ap = xmit_arl->arl_phy; - - if (!hw_addr && hw_addr_len == 0) { - if (flags == ACE_F_PERMANENT) { /* Not publish */ - /* 224.0.0.0 to zero length address */ - flags |= ACE_F_RESOLVED; - } else { /* local address and unresolved case */ - hw_addr = ap->ap_hw_addr; - hw_addr_len = ap->ap_hw_addrlen; - if (flags & ACE_F_PUBLISH) - flags |= ACE_F_RESOLVED; - } - } else { - flags |= ACE_F_RESOLVED; - } - - /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */ - if (hw_addr_len != 0 && hw_addr == NULL) - return (EINVAL); - if (hw_addr_len < ap->ap_hw_addrlen && hw_addr_len != 0) - return (EINVAL); - if (!proto_extract_mask && (flags & ACE_F_MAPPING)) - return (EINVAL); - - /* - * If the underlying link doesn't have reliable up/down notification or - * if we're working with the IPv4 169.254.0.0/16 Link Local Address - * space, then don't use the fast timers. Otherwise, use them. 
- */ - if (ap->ap_notifies && - !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) { - flags |= ACE_F_FAST; - } - - /* - * Allocate the timer block to hold the ace. - * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr) - */ - mp = mi_timer_alloc(sizeof (ace_t) + proto_addr_len + proto_addr_len + - proto_addr_len + hw_addr_len); - if (!mp) - return (ENOMEM); - ace = (ace_t *)mp->b_rptr; - *ace = ace_null; - ace->ace_proto = proto; - ace->ace_mp = mp; - ace->ace_arl = arl; - ace->ace_xmit_arl = xmit_arl; - - dst = (uchar_t *)&ace[1]; - - ace->ace_proto_addr = dst; - ace->ace_proto_addr_length = proto_addr_len; - bcopy(proto_addr, dst, proto_addr_len); - dst += proto_addr_len; - /* - * The proto_mask allows us to add entries which will let us respond - * to requests for a group of addresses. This makes it easy to provide - * proxy ARP service for machines that don't understand about the local - * subnet structure, if, for example, there are BSD4.2 systems lurking. 
- */ - ace->ace_proto_mask = dst; - if (proto_mask != NULL) { - bcopy(proto_mask, dst, proto_addr_len); - dst += proto_addr_len; - } else { - while (proto_addr_len-- > 0) - *dst++ = (uchar_t)~0; - } - - if (proto_extract_mask != NULL) { - ace->ace_proto_extract_mask = dst; - bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length); - dst += ace->ace_proto_addr_length; - } else { - ace->ace_proto_extract_mask = NULL; - } - ace->ace_hw_extract_start = hw_extract_start; - ace->ace_hw_addr_length = hw_addr_len; - ace->ace_hw_addr = dst; - if (hw_addr != NULL) { - bcopy(hw_addr, dst, hw_addr_len); - dst += hw_addr_len; - } - - ace->ace_flags = flags; - if (ar_mask_all_ones(ace->ace_proto_mask, - ace->ace_proto_addr_length)) { - acep = ar_ce_hash(as, ace->ace_proto, ace->ace_proto_addr, - ace->ace_proto_addr_length); - } else { - acep = &as->as_ce_mask_entries; - } - if ((ace->ace_next = *acep) != NULL) - ace->ace_next->ace_ptpn = &ace->ace_next; - *acep = ace; - ace->ace_ptpn = acep; - return (0); -} - -/* Delete a cache entry. */ -static void -ar_ce_delete(ace_t *ace) -{ - ace_t **acep; - - /* Get out of the hash list. */ - acep = ace->ace_ptpn; - if (ace->ace_next) - ace->ace_next->ace_ptpn = acep; - acep[0] = ace->ace_next; - /* Mark it dying in case we have a timer about to fire. */ - ace->ace_flags |= ACE_F_DYING; - /* Complete any outstanding queries immediately. */ - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - /* Free the timer, immediately, or when it fires. */ - mi_timer_free(ace->ace_mp); -} - -/* - * ar_ce_walk routine. Delete the ace if it is associated with the arl - * that is going away. - */ -static void -ar_ce_delete_per_arl(ace_t *ace, void *arl) -{ - if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) { - ace->ace_flags &= ~ACE_F_PERMANENT; - ar_ce_delete(ace); - } -} - -/* - * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes - * `ace' if it was using `arl_arg' as its output interface. 
- */ -static void -ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg) -{ - arl_t *arl = arl_arg; - - ASSERT(!(arl->arl_flags & ARL_F_IPMP)); - - if (ace->ace_arl == arl) { - ASSERT(ace->ace_xmit_arl == arl); - /* - * This ACE is tied to the arl leaving the group (e.g., an - * ACE_F_PERMANENT for a test address) and is not used by the - * group, so we can leave it be. - */ - return; - } - - if (ace->ace_xmit_arl != arl) - return; - - ASSERT(ace->ace_arl == arl->arl_ipmp_arl); - - /* - * IP should've already sent us messages asking us to move any - * ACE_F_MYADDR entries to another arl, but there are two exceptions: - * - * 1. The group was misconfigured with interfaces that have duplicate - * hardware addresses, but in.mpathd was unable to offline those - * duplicate interfaces. - * - * 2. The messages from IP were lost or never created (e.g. due to - * memory pressure). - * - * We handle the first case by just quietly deleting the ACE. Since - * the second case cannot be distinguished from a more serious bug in - * the IPMP framework, we ASSERT() that this can't happen on DEBUG - * systems, but quietly delete the ACE on production systems (the - * deleted ACE will render the IP address unreachable). - */ - if (ace->ace_flags & ACE_F_MYADDR) { - arlphy_t *ap = arl->arl_phy; - uint_t hw_addrlen = ap->ap_hw_addrlen; - - ASSERT(hw_addrlen == ace->ace_hw_addr_length && - bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0); - } - - /* - * NOTE: it's possible this arl got selected as the ace_xmit_arl when - * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for - * an IPMP IP interface. But it's still OK for us to delete such an - * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it - * and we'll pick another arl then. - */ - ar_ce_delete(ace); -} - -/* Cache entry hash routine, based on protocol and protocol address. 
*/ -static ace_t ** -ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - const uchar_t *up = proto_addr; - unsigned int hval = proto; - int len = proto_addr_length; - - while (--len >= 0) - hval ^= *up++; - return (&as->as_ce_hash_tbl[hval % ARP_HASH_SIZE]); -} - -/* Cache entry lookup. Try to find an ace matching the parameters passed. */ -ace_t * -ar_ce_lookup(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - - ace = ar_ce_lookup_entry(arl, proto, proto_addr, proto_addr_length); - if (!ace) - ace = ar_ce_lookup_mapping(arl, proto, proto_addr, - proto_addr_length); - return (ace); -} - -/* - * Cache entry lookup. Try to find an ace matching the parameters passed. - * Look only for exact entries (no mappings) - */ -static ace_t * -ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if (!proto_addr) - return (NULL); - ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); - for (; ace; ace = ace->ace_next) { - if ((ace->ace_arl == arl || - ace->ace_arl == arl->arl_ipmp_arl) && - ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. - */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * Extract cache entry lookup parameters from an external command message, then - * call the supplied match function. 
- */ -static ace_t * -ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn()) -{ - uchar_t *proto_addr; - area_t *area = (area_t *)mp->b_rptr; - - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - area->area_proto_addr_length); - if (!proto_addr) - return (NULL); - return ((*matchfn)(ar_ll_lookup_from_mp(as, mp), area->area_proto, - proto_addr, area->area_proto_addr_length)); -} - -/* - * Cache entry lookup. Try to find an ace matching the parameters passed. - * Look only for mappings. - */ -static ace_t * -ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if (!proto_addr) - return (NULL); - ace = as->as_ce_mask_entries; - for (; ace; ace = ace->ace_next) { - if (ace->ace_arl == arl && - ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. - */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * Look for a permanent entry for proto_addr across all interfaces. - */ -static ace_t * -ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - - ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); - for (; ace != NULL; ace = ace->ace_next) { - if (!(ace->ace_flags & ACE_F_PERMANENT)) - continue; - if (ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. 
- */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * ar_ce_resolve is called when a response comes in to an outstanding request. - * Returns 'true' if the address has changed and we need to tell the client. - * (We don't need to tell the client if there's still an outstanding query.) - */ -static boolean_t -ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) -{ - boolean_t hwchanged; - - if (hw_addr_length == ace->ace_hw_addr_length) { - ASSERT(ace->ace_hw_addr != NULL); - hwchanged = bcmp(hw_addr, ace->ace_hw_addr, - hw_addr_length) != 0; - if (hwchanged) - bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length); - /* - * No need to bother with ar_query_reply if no queries are - * waiting. - */ - ace->ace_flags |= ACE_F_RESOLVED; - if (ace->ace_query_mp != NULL) - ar_query_reply(ace, 0, NULL, (uint32_t)0); - if (hwchanged) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * There are 2 functions performed by this function. - * 1. Resolution of unresolved entries and update of resolved entries. - * 2. Detection of nodes with our own IP address (duplicates). - * - * If the resolving ARL is in the same group as a matching ACE's ARL, then - * update the ACE. Otherwise, make no updates. - * - * For all entries, we first check to see if this is a duplicate (probable - * loopback) message. If so, then just ignore it. - * - * Next, check to see if the entry has completed DAD. If not, then we've - * failed, because someone is already using the address. Notify IP of the DAD - * failure and remove the broken ace. - * - * Next, we check if we're the authority for this address. If so, then it's - * time to defend it, because the other node is a duplicate. Report it as a - * 'bogon' and let IP decide how to defend. - * - * Finally, if it's unresolved or if the arls match, we just update the MAC - * address. 
This allows a published 'static' entry to be updated by an ARP - * request from the node for which we're a proxy ARP server. - * - * Note that this logic does not update published ARP entries for mismatched - * arls, as for example when we proxy arp across 2 subnets with differing - * subnet masks. - * - * Return Values below - */ - -#define AR_NOTFOUND 1 /* No matching ace found in cache */ -#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ -#define AR_LOOPBACK 3 /* Our own arp packet was received */ -#define AR_BOGON 4 /* Another host has our IP addr. */ -#define AR_FAILED 5 /* Duplicate Address Detection has failed */ -#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ - -static int -ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, - uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp) -{ - ace_t *ace; - ace_t *ace_next; - int i1; - const uchar_t *paddr; - uchar_t *ace_addr; - uchar_t *mask; - int retv = AR_NOTFOUND; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - ace = *ar_ce_hash(as, proto, src_paddr, plen); - for (; ace != NULL; ace = ace_next) { - - /* ar_ce_resolve may delete the ace; fetch next pointer now */ - ace_next = ace->ace_next; - - if (ace->ace_proto_addr_length != plen || - ace->ace_proto != proto) { - continue; - } - - /* - * Note that the ace_proto_mask is applied to the proto_addr - * before comparing to the ace_addr. - */ - paddr = src_paddr; - i1 = plen; - ace_addr = ace->ace_proto_addr; - mask = ace->ace_proto_mask; - while (--i1 >= 0) { - if ((*paddr++ & *mask++) != *ace_addr++) - break; - } - if (i1 >= 0) - continue; - - *ace_arlp = ace->ace_arl; - - /* - * If the IP address is ours, and the hardware address matches - * one of our own arls, then this is a broadcast packet - * emitted by one of our interfaces, reflected by the switch - * and received on another interface. We return AR_LOOPBACK. 
- */ - if (ace->ace_flags & ACE_F_MYADDR) { - arl_t *hw_arl = as->as_arl_head; - arlphy_t *ap; - - for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) { - ap = hw_arl->arl_phy; - if (ap != NULL && ap->ap_hw_addrlen == hlen && - bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0) - return (AR_LOOPBACK); - } - } - - /* - * If the entry is unverified, then we've just verified that - * someone else already owns this address, because this is a - * message with the same protocol address but different - * hardware address. NOTE: the ace_xmit_arl check ensures we - * don't send duplicate AR_FAILEDs if arl is in an IPMP group. - */ - if ((ace->ace_flags & ACE_F_UNVERIFIED) && - arl == ace->ace_xmit_arl) { - ar_ce_delete(ace); - return (AR_FAILED); - } - - /* - * If the IP address matches ours and we're authoritative for - * this entry, then some other node is using our IP addr, so - * return AR_BOGON. Also reset the transmit count to zero so - * that, if we're currently in initial announcement mode, we - * switch back to the lazier defense mode. Knowing that - * there's at least one duplicate out there, we ought not - * blindly announce. NOTE: the ace_xmit_arl check ensures we - * don't send duplicate AR_BOGONs if arl is in an IPMP group. - */ - if ((ace->ace_flags & ACE_F_AUTHORITY) && - arl == ace->ace_xmit_arl) { - ace->ace_xmit_count = 0; - return (AR_BOGON); - } - - /* - * Only update this ACE if it's on the same network -- i.e., - * it's for our ARL or another ARL in the same IPMP group. - */ - if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) { - if (ar_ce_resolve(ace, src_haddr, hlen)) - retv = AR_CHANGED; - else if (retv == AR_NOTFOUND) - retv = AR_MERGED; - } - } - - if (retv == AR_NOTFOUND) - *ace_arlp = NULL; - return (retv); -} - -/* Pass arg1 to the pfi supplied, along with each ace in existence. 
*/ -static void -ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), void *arg1) -{ - ace_t *ace; - ace_t *ace1; - int i; - - for (i = 0; i < ARP_HASH_SIZE; i++) { - /* - * We walk the hash chain in a way that allows the current - * ace to get blown off by the called routine. - */ - for (ace = as->as_ce_hash_tbl[i]; ace; ace = ace1) { - ace1 = ace->ace_next; - (*pfi)(ace, arg1); - } - } - for (ace = as->as_ce_mask_entries; ace; ace = ace1) { - ace1 = ace->ace_next; - (*pfi)(ace, arg1); - } -} - -/* - * Send a copy of interesting packets to the corresponding IP instance. - * The corresponding IP instance is the ARP-IP-DEV instance for this - * DEV (i.e. ARL). - */ -static void -ar_client_notify(const arl_t *arl, mblk_t *mp, int code) -{ - ar_t *ar = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc; - arcn_t *arcn; - mblk_t *mp1; - int arl_namelen = strlen(arl->arl_name) + 1; - - /* Looks like the association disappeared */ - if (ar == NULL) { - freemsg(mp); - return; - } - - /* ar is the corresponding ARP-IP instance for this ARL */ - ASSERT(ar->ar_arl == NULL && ar->ar_wq->q_next != NULL); - - mp1 = allocb(sizeof (arcn_t) + arl_namelen, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - DB_TYPE(mp1) = M_CTL; - mp1->b_cont = mp; - arcn = (arcn_t *)mp1->b_rptr; - mp1->b_wptr = (uchar_t *)&arcn[1] + arl_namelen; - arcn->arcn_cmd = AR_CLIENT_NOTIFY; - arcn->arcn_name_offset = sizeof (arcn_t); - arcn->arcn_name_length = arl_namelen; - arcn->arcn_code = code; - bcopy(arl->arl_name, &arcn[1], arl_namelen); - - putnext(ar->ar_wq, mp1); -} - -/* - * Send a delete-notify message down to IP. We've determined that IP doesn't - * have a cache entry for the IP address itself, but it may have other cache - * entries with the same hardware address, and we don't want to see those grow - * stale. (The alternative is sending down updates for every ARP message we - * get that doesn't match an existing ace. 
That's much more expensive than an - * occasional delete and reload.) - */ -static void -ar_delete_notify(const ace_t *ace) -{ - const arl_t *arl = ace->ace_arl; - const arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - mblk_t *mp; - size_t len; - arh_t *arh; - - len = sizeof (*arh) + 2 * ace->ace_proto_addr_length; - mp = allocb(len, BPRI_MED); - if (mp == NULL) - return; - arh = (arh_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)arh + len; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(ace->ace_proto, arh->arh_proto); - arh->arh_hlen = 0; - arh->arh_plen = ace->ace_proto_addr_length; - U16_TO_BE16(ARP_RESPONSE, arh->arh_operation); - bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length); - bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) + - ace->ace_proto_addr_length, ace->ace_proto_addr_length); - ar_client_notify(arl, mp, AR_CN_ANNOUNCE); -} - -/* ARP module close routine. */ -static int -ar_close(queue_t *q) -{ - ar_t *ar = (ar_t *)q->q_ptr; - char name[LIFNAMSIZ]; - arl_t *arl, *xarl; - arl_t **arlp; - cred_t *cr; - arc_t *arc; - mblk_t *mp1; - int index; - arp_stack_t *as = ar->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_CLOSE, - "arp_close: q %p", q); - - arl = ar->ar_arl; - if (arl == NULL) { - index = 0; - /* - * If this is the <ARP-IP-Driver> stream send down - * a closing message to IP and wait for IP to send - * an ack. This helps to make sure that messages - * that are currently being sent up by IP are not lost. 
- */ - if (ar->ar_on_ill_stream) { - mp1 = allocb(sizeof (arc_t), BPRI_MED); - if (mp1 != NULL) { - DB_TYPE(mp1) = M_CTL; - arc = (arc_t *)mp1->b_rptr; - mp1->b_wptr = mp1->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_CLOSING; - putnext(WR(q), mp1); - while (!ar->ar_ip_acked_close) - /* If we are interrupted break out */ - if (qwait_sig(q) == 0) - break; - } - } - /* Delete all our pending queries, 'arl' is not dereferenced */ - ar_ce_walk(as, ar_query_delete, ar); - /* - * The request could be pending on some arl_queue also. This - * happens if the arl is not yet bound, and bind is pending. - */ - ar_ll_cleanup_arl_queue(q); - } else { - index = arl->arl_index; - (void) strcpy(name, arl->arl_name); - arl->arl_closing = 1; - while (arl->arl_queue != NULL) - qwait(arl->arl_rq); - - if (arl->arl_state == ARL_S_UP) - ar_ll_down(arl); - - while (arl->arl_state != ARL_S_DOWN) - qwait(arl->arl_rq); - - if (arl->arl_flags & ARL_F_IPMP) { - /* - * Though rude, someone could force the IPMP arl - * closed without removing the underlying interfaces. - * In that case, force the ARLs out of the group. - */ - xarl = as->as_arl_head; - for (; xarl != NULL; xarl = xarl->arl_next) { - if (xarl->arl_ipmp_arl != arl || xarl == arl) - continue; - ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl); - xarl->arl_ipmp_arl = NULL; - } - } - - ar_ll_clear_defaults(arl); - /* - * If this is the control stream for an arl, delete anything - * hanging off our arl. - */ - ar_ce_walk(as, ar_ce_delete_per_arl, arl); - /* Free any messages waiting for a bind_ack */ - /* Get the arl out of the chain. 
*/ - rw_enter(&as->as_arl_lock, RW_WRITER); - for (arlp = &as->as_arl_head; *arlp; - arlp = &(*arlp)->arl_next) { - if (*arlp == arl) { - *arlp = arl->arl_next; - break; - } - } - - ASSERT(arl->arl_dlpi_deferred == NULL); - ar->ar_arl = NULL; - rw_exit(&as->as_arl_lock); - - mi_free((char *)arl); - } - /* Let's break the association between an ARL and IP instance */ - if (ar->ar_arl_ip_assoc != NULL) { - ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL && - ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar); - ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL; - ar->ar_arl_ip_assoc = NULL; - } - cr = ar->ar_credp; - /* mi_close_comm frees the instance data. */ - (void) mi_close_comm(&as->as_head, q); - qprocsoff(q); - crfree(cr); - - if (index != 0) { - hook_nic_event_t info; - - info.hne_nic = index; - info.hne_lif = 0; - info.hne_event = NE_UNPLUMB; - info.hne_data = name; - info.hne_datalen = strlen(name); - (void) hook_run(as->as_net_data->netd_hooks, - as->as_arpnicevents, (hook_data_t)&info); - } - netstack_rele(as->as_netstack); - return (0); -} - -/* - * Dispatch routine for ARP commands. This routine can be called out of - * either ar_wput or ar_rput, in response to IOCTLs or M_PROTO messages. - */ -/* TODO: error reporting for M_PROTO case */ -static int -ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig, boolean_t from_wput) -{ - arct_t *arct; - uint32_t cmd; - ssize_t len; - mblk_t *mp = mp_orig; - cred_t *cr = NULL; - - if (!mp) - return (ENOENT); - - /* We get both M_PROTO and M_IOCTL messages, so watch out! 
*/ - if (DB_TYPE(mp) == M_IOCTL) { - struct iocblk *ioc; - ioc = (struct iocblk *)mp->b_rptr; - cmd = ioc->ioc_cmd; - cr = ioc->ioc_cr; - mp = mp->b_cont; - if (!mp) - return (ENOENT); - } else { - cr = msg_getcred(mp, NULL); - /* For initial messages beteen IP and ARP, cr can be NULL */ - if (cr == NULL) - cr = ((ar_t *)q->q_ptr)->ar_credp; - } - len = MBLKL(mp); - if (len < sizeof (uint32_t) || !OK_32PTR(mp->b_rptr)) - return (ENOENT); - if (mp_orig == mp) - cmd = *(uint32_t *)mp->b_rptr; - for (arct = ar_cmd_tbl; ; arct++) { - if (arct >= A_END(ar_cmd_tbl)) - return (ENOENT); - if (arct->arct_cmd == cmd) - break; - } - if (len < arct->arct_min_len) { - /* - * If the command is exclusive to ARP, we return EINVAL, - * else we need to pass the command downstream, so return - * ENOENT - */ - return ((arct->arct_flags & ARF_ONLY_CMD) ? EINVAL : ENOENT); - } - if (arct->arct_priv_req != OP_NP) { - int error; - - if ((error = secpolicy_ip(cr, arct->arct_priv_req, - B_FALSE)) != 0) - return (error); - } - /* Disallow many commands except if from rput i.e. from IP */ - if (from_wput && !(arct->arct_flags & ARF_WPUT_OK)) { - return (EINVAL); - } - - if (arct->arct_flags & ARF_IOCTL_AWARE) - mp = mp_orig; - - DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp, - arct_t *, arct); - return (*arct->arct_pfi)(q, mp); -} - -/* Allocate and do common initializations for DLPI messages. */ -static mblk_t * -ar_dlpi_comm(t_uscalar_t prim, size_t size) -{ - mblk_t *mp; - - if ((mp = allocb(size, BPRI_HI)) == NULL) - return (NULL); - - /* - * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter - * of which we don't seem to use) are sent with M_PCPROTO, and - * that other DLPI are M_PROTO. - */ - DB_TYPE(mp) = (prim == DL_INFO_REQ) ? 
M_PCPROTO : M_PROTO; - - mp->b_wptr = mp->b_rptr + size; - bzero(mp->b_rptr, size); - ((union DL_primitives *)mp->b_rptr)->dl_primitive = prim; - - return (mp); -} - -static void -ar_dlpi_dispatch(arl_t *arl) -{ - mblk_t *mp; - t_uscalar_t primitive = DL_PRIM_INVAL; - - while (((mp = arl->arl_dlpi_deferred) != NULL) && - (arl->arl_dlpi_pending == DL_PRIM_INVAL)) { - union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; - - DTRACE_PROBE2(dlpi_dispatch, arl_t *, arl, mblk_t *, mp); - - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - arl->arl_dlpi_deferred = mp->b_next; - mp->b_next = NULL; - - /* - * If this is a DL_NOTIFY_CONF, no ack is expected. - */ - if ((primitive = dlp->dl_primitive) != DL_NOTIFY_CONF) - arl->arl_dlpi_pending = dlp->dl_primitive; - putnext(arl->arl_wq, mp); - } - - if (arl->arl_dlpi_pending == DL_PRIM_INVAL) { - /* - * No pending DLPI operation. - */ - ASSERT(mp == NULL); - DTRACE_PROBE1(dlpi_idle, arl_t *, arl); - - /* - * If the last DLPI message dispatched is DL_NOTIFY_CONF, - * it is not assoicated with any pending cmd request, drain - * the rest of pending cmd requests, otherwise call - * ar_cmd_done() to finish up the current pending cmd - * operation. - */ - if (primitive == DL_NOTIFY_CONF) - ar_cmd_drain(arl); - else - ar_cmd_done(arl); - } else if (mp != NULL) { - DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp); - } -} - -/* - * The following two functions serialize DLPI messages to the driver, much - * along the lines of ill_dlpi_send and ill_dlpi_done in IP. Basically, - * we wait for a DLPI message, sent downstream, to be acked before sending - * the next. If there are DLPI messages that have not yet been sent, queue - * this message (mp), else send it downstream. - */ -static void -ar_dlpi_send(arl_t *arl, mblk_t *mp) -{ - mblk_t **mpp; - - ASSERT(arl != NULL); - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - - /* Always queue the message. 
Tail insertion */ - mpp = &arl->arl_dlpi_deferred; - while (*mpp != NULL) - mpp = &((*mpp)->b_next); - *mpp = mp; - - ar_dlpi_dispatch(arl); -} - -/* - * Called when an DLPI control message has been acked; send down the next - * queued message (if any). - * The DLPI messages of interest being bind, attach, unbind and detach since - * these are the only ones sent by ARP via ar_dlpi_send. - */ -static void -ar_dlpi_done(arl_t *arl, t_uscalar_t prim) -{ - if (arl->arl_dlpi_pending != prim) { - DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl, - t_uscalar_t, prim); - return; - } - - DTRACE_PROBE2(dlpi_done, arl_t *, arl, t_uscalar_t, prim); - arl->arl_dlpi_pending = DL_PRIM_INVAL; - ar_dlpi_dispatch(arl); -} - -/* - * Send a DL_NOTE_REPLUMB_DONE message down to the driver to indicate - * the replumb process has already been done. Note that mp is either a - * DL_NOTIFY_IND message or an AR_INTERFACE_DOWN message (comes from IP). - */ -static void -arp_replumb_done(arl_t *arl, mblk_t *mp) -{ - ASSERT(arl->arl_state == ARL_S_DOWN && arl->arl_replumbing); - - mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, - DL_NOTIFY_CONF); - ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification = - DL_NOTE_REPLUMB_DONE; - arl->arl_replumbing = B_FALSE; - ar_dlpi_send(arl, mp); -} - -static void -ar_cmd_drain(arl_t *arl) -{ - mblk_t *mp; - queue_t *q; - - /* - * Run the commands that have been enqueued while we were waiting - * for the last command (AR_INTERFACE_UP or AR_INTERFACE_DOWN) - * to complete. - */ - while ((mp = arl->arl_queue) != NULL) { - if (((uintptr_t)mp->b_prev & CMD_IN_PROGRESS) != 0) { - /* - * The current command is an AR_INTERFACE_UP or - * AR_INTERFACE_DOWN and is waiting for a DLPI ack - * from the driver. Return. We can't make progress now. 
- */ - break; - } - - mp = ar_cmd_dequeue(arl); - mp->b_prev = AR_DRAINING; - q = mp->b_queue; - mp->b_queue = NULL; - - /* - * Don't call put(q, mp) since it can lead to reorder of - * messages by sending the current messages to the end of - * arp's syncq - */ - if (q->q_flag & QREADR) - ar_rput(q, mp); - else - ar_wput(q, mp); - } -} - -static void -ar_cmd_done(arl_t *arl) -{ - mblk_t *mp; - int cmd; - int err; - mblk_t *mp1; - mblk_t *dlpi_op_done_mp = NULL; - queue_t *dlpi_op_done_q; - ar_t *ar_arl; - ar_t *ar_ip; - - ASSERT(arl->arl_state == ARL_S_UP || arl->arl_state == ARL_S_DOWN); - - /* - * If the current operation was initiated by IP there must be - * an op enqueued in arl_queue. But if ar_close has sent down - * a detach/unbind, there is no command enqueued. Also if the IP-ARP - * stream has closed the cleanup would be done and there won't be any mp - */ - if ((mp = arl->arl_queue) == NULL) - return; - - if ((cmd = (uintptr_t)mp->b_prev) & CMD_IN_PROGRESS) { - mp1 = ar_cmd_dequeue(arl); - ASSERT(mp == mp1); - - cmd &= ~CMD_IN_PROGRESS; - if (cmd == AR_INTERFACE_UP) { - /* - * There is an ioctl waiting for us... - */ - if (arl->arl_state == ARL_S_UP) - err = 0; - else - err = EINVAL; - - dlpi_op_done_mp = ar_alloc(AR_DLPIOP_DONE, err); - if (dlpi_op_done_mp != NULL) { - /* - * Better performance if we send the response - * after the potential MAPPING_ADDs command - * that are likely to follow. (Do it below the - * while loop, instead of putnext right now) - */ - dlpi_op_done_q = WR(mp->b_queue); - } - - if (err == 0) { - /* - * Now that we have the ARL instance - * corresponding to the IP instance let's make - * the association here. 
- */ - ar_ip = (ar_t *)mp->b_queue->q_ptr; - ar_arl = (ar_t *)arl->arl_rq->q_ptr; - ar_arl->ar_arl_ip_assoc = ar_ip; - ar_ip->ar_arl_ip_assoc = ar_arl; - } - - inet_freemsg(mp); - } else if (cmd == AR_INTERFACE_DOWN && arl->arl_replumbing) { - /* - * The arl is successfully brought down and this is - * a result of the DL_NOTE_REPLUMB process. Reset - * mp->b_prev first (it keeps the 'cmd' information - * at this point). - */ - mp->b_prev = NULL; - arp_replumb_done(arl, mp); - } else { - inet_freemsg(mp); - } - } - - ar_cmd_drain(arl); - - if (dlpi_op_done_mp != NULL) { - DTRACE_PROBE3(cmd_done_next, arl_t *, arl, - queue_t *, dlpi_op_done_q, mblk_t *, dlpi_op_done_mp); - putnext(dlpi_op_done_q, dlpi_op_done_mp); - } -} - -/* - * Queue all arp commands coming from clients. Typically these commands - * come from IP, but could also come from other clients. The commands - * are serviced in FIFO order. Some commands need to wait and restart - * after the DLPI response from the driver is received. Typically - * AR_INTERFACE_UP and AR_INTERFACE_DOWN. ar_dlpi_done restarts - * the command and then dequeues the queue at arl_queue and calls ar_rput - * or ar_wput for each enqueued command. AR_DRAINING is used to signify - * that the command is being executed thru a drain from ar_dlpi_done. - * Functions handling the individual commands such as ar_entry_add - * check for this flag in b_prev to determine whether the command has - * to be enqueued for later processing or must be processed now. - * - * b_next used to thread the enqueued command mblks - * b_queue used to identify the queue of the originating request(client) - * b_prev used to store the command itself for easy parsing. 
- */ -static void -ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd, - boolean_t tail_insert) -{ - mp->b_queue = q; - if (arl->arl_queue == NULL) { - ASSERT(arl->arl_queue_tail == NULL); - mp->b_prev = (void *)((uintptr_t)(cmd | CMD_IN_PROGRESS)); - mp->b_next = NULL; - arl->arl_queue = mp; - arl->arl_queue_tail = mp; - } else if (tail_insert) { - mp->b_prev = (void *)((uintptr_t)cmd); - mp->b_next = NULL; - arl->arl_queue_tail->b_next = mp; - arl->arl_queue_tail = mp; - } else { - /* head insert */ - mp->b_prev = (void *)((uintptr_t)cmd | CMD_IN_PROGRESS); - mp->b_next = arl->arl_queue; - arl->arl_queue = mp; - } -} - -static mblk_t * -ar_cmd_dequeue(arl_t *arl) -{ - mblk_t *mp; - - if (arl->arl_queue == NULL) { - ASSERT(arl->arl_queue_tail == NULL); - return (NULL); - } - mp = arl->arl_queue; - arl->arl_queue = mp->b_next; - if (arl->arl_queue == NULL) - arl->arl_queue_tail = NULL; - mp->b_next = NULL; - return (mp); -} - -/* - * Standard ACE timer handling: compute 'fuzz' around a central value or from 0 - * up to a value, and then set the timer. The randomization is necessary to - * prevent groups of systems from falling into synchronization on the network - * and producing ARP packet storms. - */ -static void -ace_set_timer(ace_t *ace, boolean_t initial_time) -{ - clock_t intv, rnd, frac; - - (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); - /* Note that clock_t is signed; must chop off bits */ - rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; - intv = ace->ace_xmit_interval; - if (initial_time) { - /* Set intv to be anywhere in the [1 .. intv] range */ - if (intv <= 0) - intv = 1; - else - intv = (rnd % intv) + 1; - } else { - /* Compute 'frac' as 20% of the configured interval */ - if ((frac = intv / 5) <= 1) - frac = 2; - /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ - if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) - intv = 1; - } - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, intv); -} - -/* - * Process entry add requests from external messages. - * It is also called by ip_rput_dlpi_writer() through - * ipif_resolver_up() to change hardware address when - * an asynchronous hardware address change notification - * arrives from the driver. - */ -static int -ar_entry_add(queue_t *q, mblk_t *mp_orig) -{ - area_t *area; - ace_t *ace; - uchar_t *hw_addr; - uint32_t hw_addr_len; - uchar_t *proto_addr; - uint32_t proto_addr_len; - uchar_t *proto_mask; - arl_t *arl; - mblk_t *mp = mp_orig; - int err; - uint_t aflags; - boolean_t unverified; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - area = (area_t *)mp->b_rptr; - aflags = area->area_flags; - - /* - * If the previous entry wasn't published and we are now going - * to publish, then we need to do address verification. The previous - * entry may have been a local unpublished address or even an external - * address. If the entry we find was in an unverified state we retain - * this. - * If it's a new published entry, then we're obligated to do - * duplicate address detection now. 
- */ - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_entry); - if (ace != NULL) { - unverified = !(ace->ace_flags & ACE_F_PUBLISH) && - (aflags & ACE_F_PUBLISH); - if (ace->ace_flags & ACE_F_UNVERIFIED) - unverified = B_TRUE; - ar_ce_delete(ace); - } else { - unverified = (aflags & ACE_F_PUBLISH) != 0; - } - - /* Allow client to request DAD restart */ - if (aflags & ACE_F_UNVERIFIED) - unverified = B_TRUE; - - /* Extract parameters from the message. */ - hw_addr_len = area->area_hw_addr_length; - hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); - proto_addr_len = area->area_proto_addr_length; - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - proto_addr_len); - proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset, - proto_addr_len); - if (proto_mask == NULL) { - DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area); - return (EINVAL); - } - err = ar_ce_create( - arl, - area->area_proto, - hw_addr, - hw_addr_len, - proto_addr, - proto_addr_len, - proto_mask, - NULL, - (uint32_t)0, - NULL, - aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); - if (err != 0) { - DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, - int, err); - return (err); - } - - if (aflags & ACE_F_PUBLISH) { - arlphy_t *ap; - - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, - proto_addr_len); - ASSERT(ace != NULL); - - ap = ace->ace_xmit_arl->arl_phy; - - if (hw_addr == NULL || hw_addr_len == 0) { - hw_addr = ap->ap_hw_addr; - } else if (aflags & ACE_F_MYADDR) { - /* - * If hardware address changes, then make sure - * that the hardware address and hardware - * address length fields in arlphy_t get updated - * too. Otherwise, they will continue carrying - * the old hardware address information. 
- */ - ASSERT((hw_addr != NULL) && (hw_addr_len != 0)); - bcopy(hw_addr, ap->ap_hw_addr, hw_addr_len); - ap->ap_hw_addrlen = hw_addr_len; - } - - if (ace->ace_flags & ACE_F_FAST) { - ace->ace_xmit_count = as->as_fastprobe_count; - ace->ace_xmit_interval = as->as_fastprobe_delay; - } else { - ace->ace_xmit_count = as->as_probe_count; - ace->ace_xmit_interval = as->as_probe_delay; - } - - /* - * If the user has disabled duplicate address detection for - * this kind of interface (fast or slow) by setting the probe - * count to zero, then pretend as if we've verified the - * address, and go right to address defense mode. - */ - if (ace->ace_xmit_count == 0) - unverified = B_FALSE; - - /* - * If we need to do duplicate address detection, then kick that - * off. Otherwise, send out a gratuitous ARP message in order - * to update everyone's caches with the new hardware address. - */ - if (unverified) { - ace->ace_flags |= ACE_F_UNVERIFIED; - if (ace->ace_xmit_interval == 0) { - /* - * User has configured us to send the first - * probe right away. Do so, and set up for - * the subsequent probes. - */ - DTRACE_PROBE2(eadd_probe, ace_t *, ace, - area_t *, area); - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - area->area_proto, proto_addr_len, - hw_addr, NULL, NULL, proto_addr, NULL, as); - ace->ace_xmit_count--; - ace->ace_xmit_interval = - (ace->ace_flags & ACE_F_FAST) ? - as->as_fastprobe_interval : - as->as_probe_interval; - ace_set_timer(ace, B_FALSE); - } else { - DTRACE_PROBE2(eadd_delay, ace_t *, ace, - area_t *, area); - /* Regular delay before initial probe */ - ace_set_timer(ace, B_TRUE); - } - } else { - DTRACE_PROBE2(eadd_announce, ace_t *, ace, - area_t *, area); - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - area->area_proto, proto_addr_len, hw_addr, - proto_addr, ap->ap_arp_addr, proto_addr, NULL, as); - ace->ace_last_bcast = ddi_get_lbolt(); - - /* - * If AUTHORITY is set, it is not just a proxy arp - * entry; we believe we're the authority for this - * entry. 
In that case, and if we're not just doing - * one-off defense of the address, we send more than - * one copy, so we'll still have a good chance of - * updating everyone even when there's a packet loss - * or two. - */ - if ((aflags & ACE_F_AUTHORITY) && - !(aflags & ACE_F_DEFEND) && - as->as_publish_count > 0) { - /* Account for the xmit we just did */ - ace->ace_xmit_count = as->as_publish_count - 1; - ace->ace_xmit_interval = - as->as_publish_interval; - if (ace->ace_xmit_count > 0) - ace_set_timer(ace, B_FALSE); - } - } - } - return (0); -} - -/* Process entry delete requests from external messages. */ -static int -ar_entry_delete(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - arl_t *arl; - mblk_t *mp = mp_orig; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - /* - * Need to know if it is a mapping or an exact match. Check exact - * match first. - */ - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup); - if (ace != NULL) { - ared_t *ared = (ared_t *)mp->b_rptr; - - /* - * If it's a permanent entry, then the client is the one who - * told us to delete it, so there's no reason to notify. - */ - if (ACE_NONPERM(ace)) - ar_delete_notify(ace); - /* - * Only delete the ARP entry if it is non-permanent, or - * ARED_F_PRESERVE_PERM flags is not set. - */ - if (ACE_NONPERM(ace) || - !(ared->ared_flags & ARED_F_PRESERVE_PERM)) { - ar_ce_delete(ace); - } - return (0); - } - return (ENXIO); -} - -/* - * Process entry query requests from external messages. 
- * Bump up the ire_stats_freed for all errors except - * EINPROGRESS - which means the packet has been queued. - * For all other errors the packet is going to be freed - * and hence we account for ire being freed if it - * is a M_PROTO message. - */ -static int -ar_entry_query(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - areq_t *areq; - arl_t *arl; - int err; - mblk_t *mp = mp_orig; - uchar_t *proto_addr; - uchar_t *sender_addr; - uint32_t proto_addr_len; - clock_t ms; - boolean_t is_mproto = B_TRUE; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) { - is_mproto = B_FALSE; - mp = mp->b_cont; - } - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp); - err = EINVAL; - goto err_ret; - } - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - areq = (areq_t *)mp->b_rptr; - proto_addr_len = areq->areq_target_addr_length; - proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset, - proto_addr_len); - if (proto_addr == NULL) { - DTRACE_PROBE1(query_illegal_address, areq_t *, areq); - err = EINVAL; - goto err_ret; - } - /* Stash the reply queue pointer for later use. */ - mp->b_prev = (mblk_t *)OTHERQ(q); - mp->b_next = NULL; - if (areq->areq_xmit_interval == 0) - areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL; - ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len); - if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) { - /* - * This is a potentially stale entry that IP's asking about. - * Since IP is asking, it must not have an answer anymore, - * either due to periodic ARP flush or due to SO_DONTROUTE. 
- * Rather than go forward with what we've got, restart - * resolution. - */ - DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq); - ar_ce_delete(ace); - ace = NULL; - } - if (ace != NULL) { - mblk_t **mpp; - uint32_t count = 0; - - /* - * There is already a cache entry. This means there is either - * a permanent entry, or address resolution is in progress. - * If the latter, there should be one or more queries queued - * up. We link the current one in at the end, if there aren't - * too many outstanding. - */ - for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) { - if (++count > areq->areq_max_buffered) { - DTRACE_PROBE2(query_overflow, ace_t *, ace, - areq_t *, areq); - mp->b_prev = NULL; - err = EALREADY; - goto err_ret; - } - } - /* Put us on the list. */ - mpp[0] = mp; - if (count != 0) { - /* - * If a query was already queued up, then we must not - * have an answer yet. - */ - DTRACE_PROBE2(query_in_progress, ace_t *, ace, - areq_t *, areq); - return (EINPROGRESS); - } - if (ACE_RESOLVED(ace)) { - /* - * We have an answer already. - * Keep a dup of mp since proto_addr points to it - * and mp has been placed on the ace_query_mp list. - */ - mblk_t *mp1; - - DTRACE_PROBE2(query_resolved, ace_t *, ace, - areq_t *, areq); - mp1 = dupmsg(mp); - ar_query_reply(ace, 0, proto_addr, proto_addr_len); - freemsg(mp1); - return (EINPROGRESS); - } - if (ace->ace_flags & ACE_F_MAPPING) { - /* Should never happen */ - DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace, - areq_t *, areq); - mpp[0] = mp->b_next; - err = ENXIO; - goto err_ret; - } - DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); - } else { - /* No ace yet. Make one now. (This is the common case.) */ - if (areq->areq_xmit_count == 0) { - DTRACE_PROBE2(query_template, arl_t *, arl, - areq_t *, areq); - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } - /* - * Check for sender addr being NULL or not before - * we create the ace. It is easy to cleanup later. 
- */ - sender_addr = mi_offset_paramc(mp, - areq->areq_sender_addr_offset, - areq->areq_sender_addr_length); - if (sender_addr == NULL) { - DTRACE_PROBE2(query_no_sender, arl_t *, arl, - areq_t *, areq); - mp->b_prev = NULL; - err = EINVAL; - goto err_ret; - } - err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0, - proto_addr, proto_addr_len, NULL, - NULL, (uint32_t)0, sender_addr, - areq->areq_flags); - if (err != 0) { - DTRACE_PROBE3(query_create_failed, arl_t *, arl, - areq_t *, areq, int, err); - mp->b_prev = NULL; - goto err_ret; - } - ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, - proto_addr_len); - if (ace == NULL || ace->ace_query_mp != NULL) { - /* Shouldn't happen! */ - DTRACE_PROBE3(query_lookup_failed, arl_t *, arl, - areq_t *, areq, ace_t *, ace); - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } - ace->ace_query_mp = mp; - } - ms = ar_query_xmit(as, ace); - if (ms == 0) { - /* Immediate reply requested. */ - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - } else { - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms); - } - return (EINPROGRESS); -err_ret: - if (is_mproto) { - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); - } - return (err); -} - -/* Handle simple query requests. */ -static int -ar_entry_squery(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - area_t *area; - arl_t *arl; - uchar_t *hw_addr; - uint32_t hw_addr_len; - mblk_t *mp = mp_orig; - uchar_t *proto_addr; - int proto_addr_len; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. 
- */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - /* Extract parameters from the request message. */ - area = (area_t *)mp->b_rptr; - proto_addr_len = area->area_proto_addr_length; - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - proto_addr_len); - hw_addr_len = area->area_hw_addr_length; - hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); - if (proto_addr == NULL || hw_addr == NULL) { - DTRACE_PROBE1(squery_illegal_address, area_t *, area); - return (EINVAL); - } - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len); - if (ace == NULL) { - return (ENXIO); - } - if (hw_addr_len < ace->ace_hw_addr_length) { - return (EINVAL); - } - if (ACE_RESOLVED(ace)) { - /* Got it, prepare the response. */ - ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length); - ar_set_address(ace, hw_addr, proto_addr, proto_addr_len); - } else { - /* - * We have an incomplete entry. Set the length to zero and - * just return out the flags. - */ - area->area_hw_addr_length = 0; - } - area->area_flags = ace->ace_flags; - if (mp == mp_orig) { - /* Non-ioctl case */ - /* TODO: change message type? */ - DB_TYPE(mp) = M_CTL; /* Caught by ip_wput */ - DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - qreply(q, mp); - return (EINPROGRESS); - } - return (0); -} - -/* Process an interface down causing us to detach and unbind. */ -/* ARGSUSED */ -static int -ar_interface_down(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL || arl->arl_closing) { - DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - /* - * Newly received commands from clients go to the tail of the queue. 
- */ - if (CMD_NEEDS_QUEUEING(mp, arl)) { - DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE); - return (EINPROGRESS); - } - mp->b_prev = NULL; - /* - * The arl is already down, no work to do. - */ - if (arl->arl_state == ARL_S_DOWN) { - if (arl->arl_replumbing) { - /* - * The arl is already down and this is a result of - * the DL_NOTE_REPLUMB process. Return EINPROGRESS - * so this mp won't be freed by ar_rput(). - */ - arp_replumb_done(arl, mp); - return (EINPROGRESS); - } else { - /* ar_rput frees the mp */ - return (0); - } - } - - /* - * This command cannot complete in a single shot now itself. - * It has to be restarted after the receipt of the ack from - * the driver. So we need to enqueue the command (at the head). - */ - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_FALSE); - - ASSERT(arl->arl_state == ARL_S_UP); - - /* Free all arp entries for this interface */ - ar_ce_walk(as, ar_ce_delete_per_arl, arl); - - ar_ll_down(arl); - /* Return EINPROGRESS so that ar_rput does not free the 'mp' */ - return (EINPROGRESS); -} - - -/* Process an interface up causing the info req sequence to start. */ -/* ARGSUSED */ -static int -ar_interface_up(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - int err; - mblk_t *mp1; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL || arl->arl_closing) { - DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp); - err = EINVAL; - goto done; - } - - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp, arl)) { - DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE); - return (EINPROGRESS); - } - mp->b_prev = NULL; - - /* - * The arl is already up. No work to do. 
- */ - if (arl->arl_state == ARL_S_UP) { - err = 0; - goto done; - } - - /* - * This command cannot complete in a single shot now itself. - * It has to be restarted after the receipt of the ack from - * the driver. So we need to enqueue the command (at the head). - */ - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_FALSE); - - err = ar_ll_up(arl); - - /* Return EINPROGRESS so that ar_rput does not free the 'mp' */ - return (EINPROGRESS); - -done: - /* caller frees 'mp' */ - - mp1 = ar_alloc(AR_DLPIOP_DONE, err); - if (mp1 != NULL) { - q = WR(q); - DTRACE_PROBE3(up_send_err, queue_t *, q, mblk_t *, mp1, - int, err); - putnext(q, mp1); - } - return (err); -} - -/* - * Given an arie_t `mp', find the arl_t's that it names and return them - * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE. - */ -static boolean_t -ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp) -{ - arie_t *arie = (arie_t *)mp->b_rptr; - - *arlp = ar_ll_lookup_from_mp(as, mp); - if (*arlp == NULL) { - DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp); - return (B_FALSE); - } - - arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; - *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname); - if (*ipmp_arlp == NULL) { - DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp); - return (B_FALSE); - } - - DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp); - return (B_TRUE); -} - -/* - * Bind an arl_t to an IPMP group arl_t. 
- */ -static int -ar_ipmp_activate(queue_t *q, mblk_t *mp) -{ - arl_t *arl, *ipmp_arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) - return (EINVAL); - - if (arl->arl_ipmp_arl != NULL) { - DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl); - return (EALREADY); - } - - DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl); - arl->arl_ipmp_arl = ipmp_arl; - return (0); -} - -/* - * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so - * that it is no longer part of the group. - */ -static int -ar_ipmp_deactivate(queue_t *q, mblk_t *mp) -{ - arl_t *arl, *ipmp_arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) - return (EINVAL); - - if (ipmp_arl != arl->arl_ipmp_arl) { - DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *, - ipmp_arl); - return (EINVAL); - } - - DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *, - arl->arl_ipmp_arl); - ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); - arl->arl_ipmp_arl = NULL; - return (0); -} - -/* - * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages. 
- */ -/* ARGSUSED */ -static int -ar_interface_on(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); - arl->arl_flags &= ~ARL_F_NOARP; - return (0); -} - -/* - * Disable an interface from processing - * ARP_REQUEST and ARP_RESPONSE messages - */ -/* ARGSUSED */ -static int -ar_interface_off(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); - arl->arl_flags |= ARL_F_NOARP; - return (0); -} - -/* - * The queue 'q' is closing. Walk all the arl's and free any message - * pending in the arl_queue if it originated from the closing q. - * Also cleanup the ip_pending_queue, if the arp-IP stream is closing. - */ -static void -ar_ll_cleanup_arl_queue(queue_t *q) -{ - arl_t *arl; - mblk_t *mp; - mblk_t *mpnext; - mblk_t *prev; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - for (prev = NULL, mp = arl->arl_queue; mp != NULL; - mp = mpnext) { - mpnext = mp->b_next; - if ((void *)mp->b_queue == (void *)q || - (void *)mp->b_queue == (void *)OTHERQ(q)) { - if (prev == NULL) - arl->arl_queue = mp->b_next; - else - prev->b_next = mp->b_next; - if (arl->arl_queue_tail == mp) - arl->arl_queue_tail = prev; - if (DB_TYPE(mp) == M_PROTO && - *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - } - inet_freemsg(mp); - } else { - prev = mp; - } - } - } -} - -/* - * Look up a lower level tap by name. 
- */ -static arl_t * -ar_ll_lookup_by_name(arp_stack_t *as, const char *name) -{ - arl_t *arl; - - for (arl = as->as_arl_head; arl; arl = arl->arl_next) { - if (strcmp(arl->arl_name, name) == 0) { - return (arl); - } - } - return (NULL); -} - -/* - * Look up a lower level tap using parameters extracted from the common - * portion of the ARP command. - */ -static arl_t * -ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp) -{ - arc_t *arc = (arc_t *)mp->b_rptr; - uint8_t *name; - size_t namelen = arc->arc_name_length; - - name = mi_offset_param(mp, arc->arc_name_offset, namelen); - if (name == NULL || name[namelen - 1] != '\0') - return (NULL); - return (ar_ll_lookup_by_name(as, (char *)name)); -} - -static void -ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp) -{ - arl_t *arl; - dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; - - ASSERT(ar->ar_arl == NULL); - - if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL) - return; - - if (dlia->dl_mac_type == SUNW_DL_IPMP) { - arl->arl_flags |= ARL_F_IPMP; - arl->arl_ipmp_arl = arl; - } - - arl->arl_provider_style = dlia->dl_provider_style; - arl->arl_rq = ar->ar_rq; - arl->arl_wq = ar->ar_wq; - - arl->arl_dlpi_pending = DL_PRIM_INVAL; - - ar->ar_arl = arl; - - /* - * If/when ARP gets pushed into the IP module then this code to make - * a number uniquely identify an ARP instance can be removed and the - * ifindex from IP used. Rather than try and reinvent or copy the - * code used by IP for the purpose of allocating an index number - * (and trying to keep the number small), just allocate it in an - * ever increasing manner. This index number isn't ever exposed to - * users directly, its only use is for providing the pfhooks interface - * with a number it can use to uniquely identify an interface in time. - * - * Using a 32bit counter, over 136 plumbs would need to be done every - * second of every day (non-leap year) for it to wrap around and the - * for() loop below to kick in as a performance concern. 
- */ - if (as->as_arp_counter_wrapped) { - arl_t *arl1; - - do { - for (arl1 = as->as_arl_head; arl1 != NULL; - arl1 = arl1->arl_next) - if (arl1->arl_index == - as->as_arp_index_counter) { - as->as_arp_index_counter++; - if (as->as_arp_index_counter == 0) { - as->as_arp_counter_wrapped++; - as->as_arp_index_counter = 1; - } - break; - } - } while (arl1 != NULL); - } else { - arl->arl_index = as->as_arp_index_counter; - } - as->as_arp_index_counter++; - if (as->as_arp_index_counter == 0) { - as->as_arp_counter_wrapped++; - as->as_arp_index_counter = 1; - } -} - -/* - * This routine is called during module initialization when the DL_INFO_ACK - * comes back from the device. We set up defaults for all the device dependent - * doo-dads we are going to need. This will leave us ready to roll if we are - * attempting auto-configuration. Alternatively, these defaults can be - * overridden by initialization procedures possessing higher intelligence. - */ -static void -ar_ll_set_defaults(arl_t *arl, mblk_t *mp) -{ - ar_m_t *arm; - dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; - dl_unitdata_req_t *dlur; - uchar_t *up; - arlphy_t *ap; - - ASSERT(arl != NULL); - - /* - * Clear any stale defaults that might exist. - */ - ar_ll_clear_defaults(arl); - - if (arl->arl_flags & ARL_F_IPMP) { - /* - * If this is an IPMP arl_t, we have nothing to do, - * since we will never transmit or receive. - */ - return; - } - - ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP); - if (ap == NULL) - goto bad; - arl->arl_phy = ap; - - if ((arm = ar_m_lookup(dlia->dl_mac_type)) == NULL) - arm = ar_m_lookup(DL_OTHER); - ASSERT(arm != NULL); - - /* - * We initialize based on parameters in the (currently) not too - * exhaustive ar_m_tbl. - */ - if (dlia->dl_version == DL_VERSION_2) { - /* XXX DLPI spec allows dl_sap_length of 0 before binding. 
*/ - ap->ap_saplen = dlia->dl_sap_length; - ap->ap_hw_addrlen = dlia->dl_brdcst_addr_length; - } else { - ap->ap_saplen = arm->ar_mac_sap_length; - ap->ap_hw_addrlen = arm->ar_mac_hw_addr_length; - } - ap->ap_arp_hw_type = arm->ar_mac_arp_hw_type; - - /* - * Allocate the hardware and ARP addresses; note that the hardware - * address cannot be filled in until we see the DL_BIND_ACK. - */ - ap->ap_hw_addr = kmem_zalloc(ap->ap_hw_addrlen, KM_NOSLEEP); - ap->ap_arp_addr = kmem_alloc(ap->ap_hw_addrlen, KM_NOSLEEP); - if (ap->ap_hw_addr == NULL || ap->ap_arp_addr == NULL) - goto bad; - - if (dlia->dl_version == DL_VERSION_2) { - if ((up = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, - ap->ap_hw_addrlen)) == NULL) - goto bad; - bcopy(up, ap->ap_arp_addr, ap->ap_hw_addrlen); - } else { - /* - * No choice but to assume a broadcast address of all ones, - * known to work on some popular networks. - */ - (void) memset(ap->ap_arp_addr, ~0, ap->ap_hw_addrlen); - } - - /* - * Make us a template DL_UNITDATA_REQ message which we will use for - * broadcasting resolution requests, and which we will clone to hand - * back as responses to the protocols. 
- */ - ap->ap_xmit_mp = ar_dlpi_comm(DL_UNITDATA_REQ, ap->ap_hw_addrlen + - ABS(ap->ap_saplen) + sizeof (dl_unitdata_req_t)); - if (ap->ap_xmit_mp == NULL) - goto bad; - - dlur = (dl_unitdata_req_t *)ap->ap_xmit_mp->b_rptr; - dlur->dl_priority.dl_min = 0; - dlur->dl_priority.dl_max = 0; - dlur->dl_dest_addr_length = ap->ap_hw_addrlen + ABS(ap->ap_saplen); - dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); - - /* NOTE: the destination address and sap offsets are permanently set */ - ap->ap_xmit_sapoff = dlur->dl_dest_addr_offset; - ap->ap_xmit_addroff = dlur->dl_dest_addr_offset; - if (ap->ap_saplen < 0) - ap->ap_xmit_sapoff += ap->ap_hw_addrlen; /* sap last */ - else - ap->ap_xmit_addroff += ap->ap_saplen; /* addr last */ - - *(uint16_t *)((caddr_t)dlur + ap->ap_xmit_sapoff) = ETHERTYPE_ARP; - return; -bad: - ar_ll_clear_defaults(arl); -} - -static void -ar_ll_clear_defaults(arl_t *arl) -{ - arlphy_t *ap = arl->arl_phy; - - if (ap != NULL) { - arl->arl_phy = NULL; - if (ap->ap_hw_addr != NULL) - kmem_free(ap->ap_hw_addr, ap->ap_hw_addrlen); - if (ap->ap_arp_addr != NULL) - kmem_free(ap->ap_arp_addr, ap->ap_hw_addrlen); - freemsg(ap->ap_xmit_mp); - kmem_free(ap, sizeof (arlphy_t)); - } -} - -static void -ar_ll_down(arl_t *arl) -{ - mblk_t *mp; - ar_t *ar; - - ASSERT(arl->arl_state == ARL_S_UP); - - /* Let's break the association between an ARL and IP instance */ - ar = (ar_t *)arl->arl_rq->q_ptr; - if (ar->ar_arl_ip_assoc != NULL) { - ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL && - ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar); - ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL; - ar->ar_arl_ip_assoc = NULL; - } - - arl->arl_state = ARL_S_PENDING; - - mp = arl->arl_unbind_mp; - ASSERT(mp != NULL); - ar_dlpi_send(arl, mp); - arl->arl_unbind_mp = NULL; - - if (arl->arl_provider_style == DL_STYLE2) { - mp = arl->arl_detach_mp; - ASSERT(mp != NULL); - ar_dlpi_send(arl, mp); - arl->arl_detach_mp = NULL; - } -} - -static int -ar_ll_up(arl_t *arl) -{ - mblk_t 
*attach_mp = NULL; - mblk_t *bind_mp = NULL; - mblk_t *detach_mp = NULL; - mblk_t *unbind_mp = NULL; - mblk_t *info_mp = NULL; - mblk_t *notify_mp = NULL; - - ASSERT(arl->arl_state == ARL_S_DOWN); - - if (arl->arl_provider_style == DL_STYLE2) { - attach_mp = - ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t)); - if (attach_mp == NULL) - goto bad; - ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = - arl->arl_ppa; - - detach_mp = - ar_dlpi_comm(DL_DETACH_REQ, sizeof (dl_detach_req_t)); - if (detach_mp == NULL) - goto bad; - } - - info_mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); - if (info_mp == NULL) - goto bad; - - /* Allocate and initialize a bind message. */ - bind_mp = ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t)); - if (bind_mp == NULL) - goto bad; - ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP; - ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; - - unbind_mp = ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t)); - if (unbind_mp == NULL) - goto bad; - - notify_mp = ar_dlpi_comm(DL_NOTIFY_REQ, sizeof (dl_notify_req_t)); - if (notify_mp == NULL) - goto bad; - ((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications = - DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN | DL_NOTE_REPLUMB; - - arl->arl_state = ARL_S_PENDING; - if (arl->arl_provider_style == DL_STYLE2) { - ar_dlpi_send(arl, attach_mp); - ASSERT(detach_mp != NULL); - arl->arl_detach_mp = detach_mp; - } - ar_dlpi_send(arl, info_mp); - ar_dlpi_send(arl, bind_mp); - arl->arl_unbind_mp = unbind_mp; - ar_dlpi_send(arl, notify_mp); - return (0); - -bad: - freemsg(attach_mp); - freemsg(bind_mp); - freemsg(detach_mp); - freemsg(unbind_mp); - freemsg(info_mp); - freemsg(notify_mp); - return (ENOMEM); -} - -/* Process mapping add requests from external messages. 
*/ -static int -ar_mapping_add(queue_t *q, mblk_t *mp_orig) -{ - arma_t *arma; - mblk_t *mp = mp_orig; - ace_t *ace; - uchar_t *hw_addr; - uint32_t hw_addr_len; - uchar_t *proto_addr; - uint32_t proto_addr_len; - uchar_t *proto_mask; - uchar_t *proto_extract_mask; - uint32_t hw_extract_start; - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(madd_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_MAPPING_ADD, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - arma = (arma_t *)mp->b_rptr; - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_mapping); - if (ace != NULL) - ar_ce_delete(ace); - hw_addr_len = arma->arma_hw_addr_length; - hw_addr = mi_offset_paramc(mp, arma->arma_hw_addr_offset, hw_addr_len); - proto_addr_len = arma->arma_proto_addr_length; - proto_addr = mi_offset_paramc(mp, arma->arma_proto_addr_offset, - proto_addr_len); - proto_mask = mi_offset_paramc(mp, arma->arma_proto_mask_offset, - proto_addr_len); - proto_extract_mask = mi_offset_paramc(mp, - arma->arma_proto_extract_mask_offset, proto_addr_len); - hw_extract_start = arma->arma_hw_mapping_start; - if (proto_mask == NULL || proto_extract_mask == NULL) { - DTRACE_PROBE2(madd_illegal_mask, arl_t *, arl, arpa_t *, arma); - return (EINVAL); - } - return (ar_ce_create( - arl, - arma->arma_proto, - hw_addr, - hw_addr_len, - proto_addr, - proto_addr_len, - proto_mask, - proto_extract_mask, - hw_extract_start, - NULL, - arma->arma_flags | ACE_F_MAPPING)); -} - -static boolean_t -ar_mask_all_ones(uchar_t *mask, uint32_t mask_len) -{ - if (mask == NULL) - return (B_TRUE); - - while (mask_len-- > 0) { - if (*mask++ != 0xFF) 
{ - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* Find an entry for a particular MAC type in the ar_m_tbl. */ -static ar_m_t * -ar_m_lookup(t_uscalar_t mac_type) -{ - ar_m_t *arm; - - for (arm = ar_m_tbl; arm < A_END(ar_m_tbl); arm++) { - if (arm->ar_mac_type == mac_type) - return (arm); - } - return (NULL); -} - -/* Respond to Named Dispatch requests. */ -static int -ar_nd_ioctl(queue_t *q, mblk_t *mp) -{ - ar_t *ar = (ar_t *)q->q_ptr; - arp_stack_t *as = ar->ar_as; - - if (DB_TYPE(mp) == M_IOCTL && nd_getset(q, as->as_nd, mp)) - return (0); - return (ENOENT); -} - -/* ARP module open routine. */ -static int -ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) -{ - ar_t *ar; - int err; - queue_t *tmp_q; - mblk_t *mp; - netstack_t *ns; - arp_stack_t *as; - - TRACE_1(TR_FAC_ARP, TR_ARP_OPEN, - "arp_open: q %p", q); - /* Allow a reopen. */ - if (q->q_ptr != NULL) { - return (0); - } - - ns = netstack_find_by_cred(credp); - ASSERT(ns != NULL); - as = ns->netstack_arp; - ASSERT(as != NULL); - - /* mi_open_comm allocates the instance data structure, etc. */ - err = mi_open_comm(&as->as_head, sizeof (ar_t), q, devp, flag, sflag, - credp); - if (err) { - netstack_rele(as->as_netstack); - return (err); - } - - /* - * We are D_MTPERMOD so it is safe to do qprocson before - * the instance data has been initialized. - */ - qprocson(q); - - ar = (ar_t *)q->q_ptr; - ar->ar_rq = q; - q = WR(q); - ar->ar_wq = q; - crhold(credp); - ar->ar_credp = credp; - ar->ar_as = as; - - /* - * Probe for the DLPI info if we are not pushed on IP or UDP. Wait for - * the reply. In case of error call ar_close() which will take - * care of doing everything required to close this instance, such - * as freeing the arl, restarting the timer on a different queue etc. - */ - if (strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0 || - strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "udp") == 0) { - arc_t *arc; - - /* - * We are pushed directly on top of IP or UDP. 
There is no need - * to send down a DL_INFO_REQ. Return success. This could - * either be an ill stream (i.e. <arp-IP-Driver> stream) - * or a stream corresponding to an open of /dev/arp - * (i.e. <arp-IP> stream). Note that we don't support - * pushing some module in between arp and IP. - * - * Tell IP, though, that we're an extended implementation, so - * it knows to expect a DAD response after bringing an - * interface up. Old ATM drivers won't do this, and IP will - * just bring the interface up immediately. - */ - ar->ar_on_ill_stream = (q->q_next->q_next != NULL); - if (!ar->ar_on_ill_stream || arp_no_defense) - return (0); - mp = allocb(sizeof (arc_t), BPRI_MED); - if (mp == NULL) { - (void) ar_close(RD(q)); - return (ENOMEM); - } - DB_TYPE(mp) = M_CTL; - arc = (arc_t *)mp->b_rptr; - mp->b_wptr = mp->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_EXTEND; - putnext(q, mp); - return (0); - } - tmp_q = q; - /* Get the driver's queue */ - while (tmp_q->q_next != NULL) - tmp_q = tmp_q->q_next; - - ASSERT(tmp_q->q_qinfo->qi_minfo != NULL); - - if (strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "ip") == 0 || - strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "udp") == 0) { - /* - * We don't support pushing ARP arbitrarily on an IP or UDP - * driver stream. ARP has to be pushed directly above IP or - * UDP. - */ - (void) ar_close(RD(q)); - return (ENOTSUP); - } else { - /* - * Send down a DL_INFO_REQ so we can find out what we are - * talking to. - */ - mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); - if (mp == NULL) { - (void) ar_close(RD(q)); - return (ENOMEM); - } - putnext(ar->ar_wq, mp); - while (ar->ar_arl == NULL) { - if (!qwait_sig(ar->ar_rq)) { - (void) ar_close(RD(q)); - return (EINTR); - } - } - } - return (0); -} - -/* Get current value of Named Dispatch item. 
*/ -/* ARGSUSED */ -static int -ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) -{ - arpparam_t *arppa = (arpparam_t *)cp; - - (void) mi_mpprintf(mp, "%d", arppa->arp_param_value); - return (0); -} - -/* - * Walk through the param array specified registering each element with the - * named dispatch handler. - */ -static boolean_t -ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt) -{ - for (; cnt-- > 0; arppa++) { - if (arppa->arp_param_name && arppa->arp_param_name[0]) { - if (!nd_load(ndp, arppa->arp_param_name, - ar_param_get, ar_param_set, - (caddr_t)arppa)) { - nd_free(ndp); - return (B_FALSE); - } - } - } - return (B_TRUE); -} - -/* Set new value of Named Dispatch item. */ -/* ARGSUSED */ -static int -ar_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) -{ - long new_value; - arpparam_t *arppa = (arpparam_t *)cp; - - if (ddi_strtol(value, NULL, 10, &new_value) != 0 || - new_value < arppa->arp_param_min || - new_value > arppa->arp_param_max) { - return (EINVAL); - } - arppa->arp_param_value = new_value; - return (0); -} - -/* - * Process an I_PLINK ioctl. If the lower stream is an arp device stream, - * append another mblk to the chain, that will carry the device name, - * and the muxid. IP uses this info to lookup the corresponding ill, and - * set the ill_arp_muxid atomically, as part of the I_PLINK, instead of - * waiting for the SIOCSLIFMUXID. (which may never happen if ifconfig is - * killed, and this has the bad effect of not being able to unplumb - * subsequently) - */ -static int -ar_plink_send(queue_t *q, mblk_t *mp) -{ - char *name; - mblk_t *muxmp; - mblk_t *mp1; - ar_t *ar = (ar_t *)q->q_ptr; - arp_stack_t *as = ar->ar_as; - struct linkblk *li; - struct ipmx_s *ipmxp; - queue_t *arpwq; - - mp1 = mp->b_cont; - ASSERT((mp1 != NULL) && (mp1->b_cont == NULL)); - li = (struct linkblk *)mp1->b_rptr; - arpwq = li->l_qbot; - - /* - * Allocate a new mblk which will hold an ipmx_s and chain it to - * the M_IOCTL chain. 
The final chain will consist of 3 mblks, - * namely the M_IOCTL, followed by the linkblk, followed by the ipmx_s - */ - muxmp = allocb(sizeof (struct ipmx_s), BPRI_MED); - if (muxmp == NULL) - return (ENOMEM); - ipmxp = (struct ipmx_s *)muxmp->b_wptr; - ipmxp->ipmx_arpdev_stream = 0; - muxmp->b_wptr += sizeof (struct ipmx_s); - mp1->b_cont = muxmp; - - /* - * The l_qbot represents the uppermost write queue of the - * lower stream. Walk down this stream till we hit ARP. - * We can safely walk, since STREAMS has made sure the stream - * cannot close till the IOCACK goes up, and is not interruptible. - */ - while (arpwq != NULL) { - /* - * Beware of broken modules like logsubr.c that - * may not have a q_qinfo or qi_minfo. - */ - if ((q->q_qinfo != NULL) && (q->q_qinfo->qi_minfo != NULL)) { - name = arpwq->q_qinfo->qi_minfo->mi_idname; - if (name != NULL && name[0] != NULL && - (strcmp(name, arp_mod_info.mi_idname) == 0)) - break; - } - arpwq = arpwq->q_next; - } - - /* - * Check if arpwq corresponds to an arp device stream, by walking - * the mi list. If it does, then add the muxid and device name info - * for use by IP. IP will send the M_IOCACK. - */ - if (arpwq != NULL) { - for (ar = (ar_t *)mi_first_ptr(&as->as_head); ar != NULL; - ar = (ar_t *)mi_next_ptr(&as->as_head, (void *)ar)) { - if ((ar->ar_wq == arpwq) && (ar->ar_arl != NULL)) { - ipmxp->ipmx_arpdev_stream = 1; - (void) strcpy((char *)ipmxp->ipmx_name, - ar->ar_arl->arl_name); - break; - } - } - } - - putnext(q, mp); - return (0); -} - -/* - * ar_ce_walk routine to delete any outstanding queries for an ar that is - * going away. - */ -static void -ar_query_delete(ace_t *ace, void *arg) -{ - ar_t *ar = arg; - mblk_t **mpp = &ace->ace_query_mp; - mblk_t *mp; - arp_stack_t *as = ar->ar_as; - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - while ((mp = *mpp) != NULL) { - /* The response queue was stored in the query b_prev. 
*/ - if ((queue_t *)mp->b_prev == ar->ar_wq || - (queue_t *)mp->b_prev == ar->ar_rq) { - *mpp = mp->b_next; - if (DB_TYPE(mp) == M_PROTO && - *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - } - inet_freemsg(mp); - } else { - mpp = &mp->b_next; - } - } -} - -/* - * This routine is called either when an address resolution has just been - * found, or when it is time to give, or in some other error situation. - * If a non-zero ret_val is provided, any outstanding queries for the - * specified ace will be completed using that error value. Otherwise, - * the completion status will depend on whether the address has been - * resolved. - */ -static void -ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, - uint32_t proto_addr_len) -{ - mblk_t *areq_mp; - mblk_t *mp; - mblk_t *xmit_mp; - queue_t *arl_wq = ace->ace_arl->arl_wq; - arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl); - ip_stack_t *ipst = as->as_netstack->netstack_ip; - arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - - /* - * On error or completion for a query, we need to shut down the timer. - * However, the timer must not be stopped for an interface doing - * Duplicate Address Detection, or it will never finish that phase. - */ - if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY))) - mi_timer(arl_wq, ace->ace_mp, -1L); - - /* Establish the return value appropriate. */ - if (ret_val == 0) { - if (!ACE_RESOLVED(ace) || ap == NULL) - ret_val = ENXIO; - } - /* Terminate all outstanding queries. */ - while ((mp = ace->ace_query_mp) != 0) { - /* The response queue was saved in b_prev. */ - queue_t *q = (queue_t *)mp->b_prev; - mp->b_prev = NULL; - ace->ace_query_mp = mp->b_next; - mp->b_next = NULL; - /* - * If we have the answer, attempt to get a copy of the xmit - * template to prepare for the client. - */ - if (ret_val == 0 && - (xmit_mp = copyb(ap->ap_xmit_mp)) == NULL) { - /* Too bad, buy more memory. 
*/ - ret_val = ENOMEM; - } - /* Complete the response based on how the request arrived. */ - if (DB_TYPE(mp) == M_IOCTL) { - struct iocblk *ioc = (struct iocblk *)mp->b_rptr; - - ioc->ioc_error = ret_val; - if (ret_val != 0) { - DB_TYPE(mp) = M_IOCNAK; - ioc->ioc_count = 0; - putnext(q, mp); - continue; - } - /* - * Return the xmit mp out with the successful IOCTL. - */ - DB_TYPE(mp) = M_IOCACK; - ioc->ioc_count = MBLKL(xmit_mp); - /* Remove the areq mblk from the IOCTL. */ - areq_mp = mp->b_cont; - mp->b_cont = areq_mp->b_cont; - } else { - if (ret_val != 0) { - /* TODO: find some way to let the guy know? */ - inet_freemsg(mp); - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - continue; - } - /* - * In the M_PROTO case, the areq message is followed by - * a message chain to be returned to the protocol. ARP - * doesn't know (or care) what is in this chain, but in - * the event that the reader is pondering the - * relationship between ARP and IP (for example), the - * areq is followed by an incipient IRE, and then the - * original outbound packet. Here we detach the areq. - */ - areq_mp = mp; - mp = mp->b_cont; - } - ASSERT(ret_val == 0 && ap != NULL); - if (ap->ap_saplen != 0) { - /* - * Copy the SAP type specified in the request into - * the xmit mp. - */ - areq_t *areq = (areq_t *)areq_mp->b_rptr; - bcopy(areq->areq_sap, xmit_mp->b_rptr + - ap->ap_xmit_sapoff, ABS(ap->ap_saplen)); - } - /* Done with the areq message. */ - freeb(areq_mp); - /* - * Copy the resolved hardware address into the xmit mp - * or perform the mapping operation. - */ - ar_set_address(ace, xmit_mp->b_rptr + ap->ap_xmit_addroff, - proto_addr, proto_addr_len); - /* - * Now insert the xmit mp after the response message. In - * the M_IOCTL case, it will be the returned data block. In - * the M_PROTO case, (again using IP as an example) it will - * appear after the IRE and before the outbound packet. 
- */ - xmit_mp->b_cont = mp->b_cont; - mp->b_cont = xmit_mp; - putnext(q, mp); - } - - /* - * Unless we are responding from a permanent cache entry, start the - * cleanup timer or (on error) delete the entry. - */ - if (!(ace->ace_flags & (ACE_F_PERMANENT | ACE_F_DYING))) { - if (!ACE_RESOLVED(ace) || ap == NULL) { - /* - * No need to notify IP here, because the entry was - * never resolved, so IP can't have any cached copies - * of the address. - */ - ar_ce_delete(ace); - } else { - mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval); - } - } -} - -/* - * Returns number of milliseconds after which we should either rexmit or abort. - * Return of zero means we should abort. - */ -static clock_t -ar_query_xmit(arp_stack_t *as, ace_t *ace) -{ - areq_t *areq; - mblk_t *mp; - uchar_t *proto_addr; - uchar_t *sender_addr; - ace_t *src_ace; - arl_t *xmit_arl = ace->ace_xmit_arl; - - mp = ace->ace_query_mp; - /* - * ar_query_delete may have just blown off the outstanding - * ace_query_mp entries because the client who sent the query - * went away. If this happens just before the ace_mp timer - * goes off, we'd find a null ace_query_mp which is not an error. - * The unresolved ace itself, and the timer, will be removed - * when the arl stream goes away. - */ - if (!mp) - return (0); - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - areq = (areq_t *)mp->b_rptr; - if (areq->areq_xmit_count == 0) - return (0); - areq->areq_xmit_count--; - proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset, - areq->areq_target_addr_length); - sender_addr = mi_offset_paramc(mp, areq->areq_sender_addr_offset, - areq->areq_sender_addr_length); - - /* - * Get the ace for the sender address, so that we can verify that - * we have one and that DAD has completed. 
- */ - src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr, - areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq, - uchar_t *, sender_addr); - return (0); - } - - /* - * If we haven't yet finished duplicate address checking on this source - * address, then do *not* use it on the wire. Doing so will corrupt - * the world's caches. Just allow the timer to restart. Note that - * duplicate address checking will eventually complete one way or the - * other, so this cannot go on "forever." - */ - if (src_ace->ace_flags & ACE_F_UNVERIFIED) { - DTRACE_PROBE2(xmit_source_unverified, ace_t *, ace, - ace_t *, src_ace); - areq->areq_xmit_count++; - return (areq->areq_xmit_interval); - } - - DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, - areq_t *, areq); - - ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto, - areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr, - sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); - src_ace->ace_last_bcast = ddi_get_lbolt(); - return (areq->areq_xmit_interval); -} - -/* Our read side put procedure. */ -static void -ar_rput(queue_t *q, mblk_t *mp) -{ - arh_t *arh; - arl_t *arl; - arl_t *client_arl; - ace_t *dst_ace; - uchar_t *dst_paddr; - int err; - uint32_t hlen; - struct iocblk *ioc; - mblk_t *mp1; - int op; - uint32_t plen; - uint32_t proto; - uchar_t *src_haddr; - uchar_t *src_paddr; - uchar_t *dst_haddr; - boolean_t is_probe; - boolean_t is_unicast = B_FALSE; - dl_unitdata_ind_t *dlindp; - int i; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_RPUT_START, - "arp_rput_start: q %p", q); - - /* - * We handle ARP commands from below both in M_IOCTL and M_PROTO - * messages. Actual ARP requests and responses will show up as - * M_PROTO messages containing DL_UNITDATA_IND blocks. 
- */ - switch (DB_TYPE(mp)) { - case M_IOCTL: - err = ar_cmd_dispatch(q, mp, B_FALSE); - switch (err) { - case ENOENT: - DB_TYPE(mp) = M_IOCNAK; - if ((mp1 = mp->b_cont) != 0) { - /* - * Collapse the data as a note to the - * originator. - */ - mp1->b_wptr = mp1->b_rptr; - } - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "ioctl/inprogress"); - return; - default: - DB_TYPE(mp) = M_IOCACK; - break; - } - ioc = (struct iocblk *)mp->b_rptr; - ioc->ioc_error = err; - if ((mp1 = mp->b_cont) != 0) - ioc->ioc_count = MBLKL(mp1); - else - ioc->ioc_count = 0; - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "ioctl"); - return; - case M_CTL: - /* - * IP is acking the AR_ARP_CLOSING message that we sent - * in ar_close. - */ - if (MBLKL(mp) == sizeof (arc_t)) { - if (((arc_t *)mp->b_rptr)->arc_cmd == AR_ARP_CLOSING) - ((ar_t *)q->q_ptr)->ar_ip_acked_close = 1; - } - freemsg(mp); - return; - case M_PCPROTO: - case M_PROTO: - dlindp = (dl_unitdata_ind_t *)mp->b_rptr; - if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && - dlindp->dl_primitive == DL_UNITDATA_IND) { - is_unicast = (dlindp->dl_group_address == 0); - arl = ((ar_t *)q->q_ptr)->ar_arl; - if (arl != NULL && arl->arl_phy != NULL) { - /* Real messages from the wire! */ - break; - } - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "default"); - return; - } - err = ar_cmd_dispatch(q, mp, B_FALSE); - switch (err) { - case ENOENT: - /* Miscellaneous DLPI messages get shuffled off. 
*/ - ar_rput_dlpi(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "proto/dlpi"); - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "proto"); - break; - default: - inet_freemsg(mp); - break; - } - return; - default: - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "default"); - return; - } - /* - * If the IFF_NOARP flag is on, then do not process any - * incoming ARP_REQUEST or incoming ARP_RESPONSE. - */ - if (arl->arl_flags & ARL_F_NOARP) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "interface has IFF_NOARP set"); - return; - } - - /* - * What we should have at this point is a DL_UNITDATA_IND message - * followed by an ARP packet. We do some initial checks and then - * get to work. - */ - mp1 = mp->b_cont; - if (mp1 == NULL) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "baddlpi"); - return; - } - if (mp1->b_cont != NULL) { - /* No fooling around with funny messages. */ - if (!pullupmsg(mp1, -1)) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "pullupmsgfail"); - return; - } - } - arh = (arh_t *)mp1->b_rptr; - hlen = arh->arh_hlen; - plen = arh->arh_plen; - if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "short"); - return; - } - /* - * hlen 0 is used for RFC 1868 UnARP. - * - * Note that the rest of the code checks that hlen is what we expect - * for this hardware address type, so might as well discard packets - * here that don't match. 
- */ - if ((hlen > 0 && hlen != arl->arl_phy->ap_hw_addrlen) || plen == 0) { - DTRACE_PROBE2(rput_bogus, arl_t *, arl, mblk_t *, mp1); - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "hlenzero/plenzero"); - return; - } - /* - * Historically, Solaris has been lenient about hardware type numbers. - * We should check here, but don't. - */ - DTRACE_PROBE2(rput_normal, arl_t *, arl, arh_t *, arh); - - DTRACE_PROBE3(arp__physical__in__start, - arl_t *, arl, arh_t *, arh, mblk_t *, mp); - - ARP_HOOK_IN(as->as_arp_physical_in_event, as->as_arp_physical_in, - arl->arl_index, arh, mp, mp1, as); - - DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp); - - if (mp == NULL) - return; - - proto = (uint32_t)BE16_TO_U16(arh->arh_proto); - src_haddr = (uchar_t *)arh; - src_haddr = &src_haddr[ARH_FIXED_LEN]; - src_paddr = &src_haddr[hlen]; - dst_haddr = &src_haddr[hlen + plen]; - dst_paddr = &src_haddr[hlen + plen + hlen]; - op = BE16_TO_U16(arh->arh_operation); - - /* Determine if this is just a probe */ - for (i = 0; i < plen; i++) - if (src_paddr[i] != 0) - break; - is_probe = i >= plen; - - /* - * RFC 826: first check if the <protocol, sender protocol address> is - * in the cache, if there is a sender protocol address. Note that this - * step also handles resolutions based on source. - * - * Note that IP expects that each notification it receives will be - * tied to the ill it received it on. Thus, we must talk to it over - * the arl tied to the resolved IP address (if any), hence client_arl. 
- */ - if (is_probe) - err = AR_NOTFOUND; - else - err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, - plen, &client_arl); - - switch (err) { - case AR_BOGON: - ar_client_notify(client_arl, mp1, AR_CN_BOGON); - mp1 = NULL; - break; - case AR_FAILED: - ar_client_notify(client_arl, mp1, AR_CN_FAILED); - mp1 = NULL; - break; - case AR_LOOPBACK: - DTRACE_PROBE2(rput_loopback, arl_t *, arl, arh_t *, arh); - freemsg(mp1); - mp1 = NULL; - break; - } - if (mp1 == NULL) { - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "unneeded"); - return; - } - - /* - * Now look up the destination address. By RFC 826, we ignore the - * packet at this step if the target isn't one of our addresses. This - * is true even if the target is something we're trying to resolve and - * the packet is a response. To avoid duplicate responses, we also - * ignore the packet if it was multicast/broadcast to an arl that's in - * an IPMP group but was not the designated xmit_arl for the ACE. - * - * Note that in order to do this correctly, we need to know when to - * notify IP of a change implied by the source address of the ARP - * message. That implies that the local ARP table has entries for all - * of the resolved entries cached in the client. This is why we must - * notify IP when we delete a resolved entry and we know that IP may - * have cached answers. - */ - dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); - if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || - (dst_ace->ace_xmit_arl != arl && !is_unicast) || - !(dst_ace->ace_flags & ACE_F_PUBLISH)) { - /* - * Let the client know if the source mapping has changed, even - * if the destination provides no useful information for the - * client. 
- */ - if (err == AR_CHANGED) - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - else - freemsg(mp1); - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "nottarget"); - return; - } - - /* - * If the target is unverified by DAD, then one of two things is true: - * either it's someone else claiming this address (on a probe or an - * announcement) or it's just a regular request. The former is - * failure, but a regular request is not. - */ - if (dst_ace->ace_flags & ACE_F_UNVERIFIED) { - /* - * Check for a reflection. Some misbehaving bridges will - * reflect our own transmitted packets back to us. - */ - if (hlen == dst_ace->ace_hw_addr_length && - bcmp(src_haddr, dst_ace->ace_hw_addr, hlen) == 0) { - DTRACE_PROBE3(rput_probe_reflected, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "reflection"); - return; - } - - /* - * Conflicts seen via the wrong interface may be bogus. - * Multiple interfaces on the same segment imply any conflict - * will also be seen via the correct interface, so we can ignore - * anything not matching the arl from the ace. - */ - if (arl != dst_ace->ace_arl) { - DTRACE_PROBE3(rput_probe_misdirect, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - return; - } - /* - * Responses targeting our HW address that are not responses to - * our DAD probe must be ignored as they are related to requests - * sent before DAD was restarted. Note: response to our DAD - * probe will have been handled by ar_ce_resolve_all() above. - */ - if (op == ARP_RESPONSE && - (bcmp(dst_haddr, dst_ace->ace_hw_addr, hlen) == 0)) { - DTRACE_PROBE3(rput_probe_stale, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - return; - } - /* - * Responses targeted to HW addresses which are not ours but - * sent to our unverified proto address are also conflicts. 
- * These may be reported by a proxy rather than the interface - * with the conflicting address, dst_paddr is in conflict - * rather than src_paddr. To ensure IP can locate the correct - * ipif to take down, it is necessary to copy dst_paddr to - * the src_paddr field before sending it to IP. The same is - * required for probes, where src_paddr will be INADDR_ANY. - */ - if (is_probe) { - /* - * In this case, client_arl will be invalid (e.g., - * since probes don't have a valid sender address). - * But dst_ace has the appropriate arl. - */ - bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED); - ar_ce_delete(dst_ace); - } else if (op == ARP_RESPONSE) { - bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(client_arl, mp1, AR_CN_FAILED); - ar_ce_delete(dst_ace); - } else if (err == AR_CHANGED) { - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - } else { - DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freemsg(mp1); - } - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "unverified"); - return; - } - - /* - * If it's a request, then we reply to this, and if we think the - * sender's unknown, then we create an entry to avoid unnecessary ARPs. - * The design assumption is that someone ARPing us is likely to send us - * a packet soon, and that we'll want to reply to it. - */ - if (op == ARP_REQUEST) { - const uchar_t *dstaddr = src_haddr; - clock_t now; - - /* - * This implements periodic address defense based on a modified - * version of the RFC 3927 requirements. Instead of sending a - * broadcasted reply every time, as demanded by the RFC, we - * send at most one broadcast reply per arp_broadcast_interval. 
- */ - now = ddi_get_lbolt(); - if ((now - dst_ace->ace_last_bcast) > - MSEC_TO_TICK(as->as_broadcast_interval)) { - DTRACE_PROBE3(rput_bcast_reply, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - dst_ace->ace_last_bcast = now; - dstaddr = arl->arl_phy->ap_arp_addr; - /* - * If this is one of the long-suffering entries, then - * pull it out now. It no longer needs separate - * defense, because we're doing now that with this - * broadcasted reply. - */ - dst_ace->ace_flags &= ~ACE_F_DELAYED; - } - - ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen, - dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, - src_haddr, src_paddr, dstaddr, as); - if (!is_probe && err == AR_NOTFOUND && - ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen, - src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) { - ace_t *ace; - - ace = ar_ce_lookup(arl, proto, src_paddr, plen); - ASSERT(ace != NULL); - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, - as->as_cleanup_interval); - } - } - if (err == AR_CHANGED) { - freeb(mp); - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "reqchange"); - } else { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "end"); - } -} - -static void -ar_ce_restart_dad(ace_t *ace, void *arl_arg) -{ - arl_t *arl = arl_arg; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if ((ace->ace_xmit_arl == arl) && - (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == - (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { - /* - * Slight cheat here: we don't use the initial probe delay - * in this obscure case. 
- */ - if (ace->ace_flags & ACE_F_FAST) { - ace->ace_xmit_count = as->as_fastprobe_count; - ace->ace_xmit_interval = as->as_fastprobe_interval; - } else { - ace->ace_xmit_count = as->as_probe_count; - ace->ace_xmit_interval = as->as_probe_interval; - } - ace->ace_flags &= ~ACE_F_DAD_ABORTED; - ace_set_timer(ace, B_FALSE); - } -} - -/* DLPI messages, other than DL_UNITDATA_IND are handled here. */ -static void -ar_rput_dlpi(queue_t *q, mblk_t *mp) -{ - ar_t *ar = q->q_ptr; - arl_t *arl = ar->ar_arl; - arlphy_t *ap = NULL; - union DL_primitives *dlp; - const char *err_str; - arp_stack_t *as = ar->ar_as; - - if (arl != NULL) - ap = arl->arl_phy; - - if (MBLKL(mp) < sizeof (dlp->dl_primitive)) { - putnext(q, mp); - return; - } - dlp = (union DL_primitives *)mp->b_rptr; - switch (dlp->dl_primitive) { - case DL_ERROR_ACK: - /* - * ce is confused about how DLPI works, so we have to interpret - * an "error" on DL_NOTIFY_ACK (which we never could have sent) - * as really meaning an error on DL_NOTIFY_REQ. - * - * Note that supporting DL_NOTIFY_REQ is optional, so printing - * out an error message on the console isn't warranted except - * for debug. - */ - if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || - dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { - ar_dlpi_done(arl, DL_NOTIFY_REQ); - freemsg(mp); - return; - } - err_str = dl_primstr(dlp->error_ack.dl_error_primitive); - DTRACE_PROBE2(rput_dl_error, arl_t *, arl, - dl_error_ack_t *, &dlp->error_ack); - switch (dlp->error_ack.dl_error_primitive) { - case DL_UNBIND_REQ: - if (arl->arl_provider_style == DL_STYLE1) - arl->arl_state = ARL_S_DOWN; - break; - case DL_DETACH_REQ: - case DL_BIND_REQ: - arl->arl_state = ARL_S_DOWN; - break; - case DL_ATTACH_REQ: - break; - default: - /* If it's anything else, we didn't send it. 
*/ - putnext(q, mp); - return; - } - ar_dlpi_done(arl, dlp->error_ack.dl_error_primitive); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ar_rput_dlpi: %s failed, dl_errno %d, dl_unix_errno %d", - err_str, dlp->error_ack.dl_errno, - dlp->error_ack.dl_unix_errno); - break; - case DL_INFO_ACK: - DTRACE_PROBE2(rput_dl_info, arl_t *, arl, - dl_info_ack_t *, &dlp->info_ack); - if (arl != NULL && arl->arl_dlpi_pending == DL_INFO_REQ) { - /* - * We have a response back from the driver. Go set up - * transmit defaults. - */ - ar_ll_set_defaults(arl, mp); - ar_dlpi_done(arl, DL_INFO_REQ); - } else if (arl == NULL) { - ar_ll_init(as, ar, mp); - } - /* Kick off any awaiting messages */ - qenable(WR(q)); - break; - case DL_OK_ACK: - DTRACE_PROBE2(rput_dl_ok, arl_t *, arl, - dl_ok_ack_t *, &dlp->ok_ack); - switch (dlp->ok_ack.dl_correct_primitive) { - case DL_UNBIND_REQ: - if (arl->arl_provider_style == DL_STYLE1) - arl->arl_state = ARL_S_DOWN; - break; - case DL_DETACH_REQ: - arl->arl_state = ARL_S_DOWN; - break; - case DL_ATTACH_REQ: - break; - default: - putnext(q, mp); - return; - } - ar_dlpi_done(arl, dlp->ok_ack.dl_correct_primitive); - break; - case DL_NOTIFY_ACK: - DTRACE_PROBE2(rput_dl_notify, arl_t *, arl, - dl_notify_ack_t *, &dlp->notify_ack); - /* - * We mostly care about interface-up transitions, as this is - * when we need to redo duplicate address detection. 
- */ - if (ap != NULL) { - ap->ap_notifies = (dlp->notify_ack.dl_notifications & - DL_NOTE_LINK_UP) != 0; - } - ar_dlpi_done(arl, DL_NOTIFY_REQ); - break; - case DL_BIND_ACK: - DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, - dl_bind_ack_t *, &dlp->bind_ack); - if (ap != NULL) { - caddr_t hw_addr; - - hw_addr = (caddr_t)dlp + dlp->bind_ack.dl_addr_offset; - if (ap->ap_saplen > 0) - hw_addr += ap->ap_saplen; - bcopy(hw_addr, ap->ap_hw_addr, ap->ap_hw_addrlen); - } - arl->arl_state = ARL_S_UP; - ar_dlpi_done(arl, DL_BIND_REQ); - break; - case DL_NOTIFY_IND: - DTRACE_PROBE2(rput_dl_notify_ind, arl_t *, arl, - dl_notify_ind_t *, &dlp->notify_ind); - - if (dlp->notify_ind.dl_notification == DL_NOTE_REPLUMB) { - arl->arl_replumbing = B_TRUE; - if (arl->arl_state == ARL_S_DOWN) { - arp_replumb_done(arl, mp); - return; - } - break; - } - - if (ap != NULL) { - switch (dlp->notify_ind.dl_notification) { - case DL_NOTE_LINK_UP: - ap->ap_link_down = B_FALSE; - ar_ce_walk(as, ar_ce_restart_dad, arl); - break; - case DL_NOTE_LINK_DOWN: - ap->ap_link_down = B_TRUE; - break; - } - } - break; - case DL_UDERROR_IND: - DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, - dl_uderror_ind_t *, &dlp->uderror_ind); - (void) mi_strlog(q, 1, SL_ERROR | SL_TRACE, - "ar_rput_dlpi: " - "DL_UDERROR_IND, dl_dest_addr_length %d dl_errno %d", - dlp->uderror_ind.dl_dest_addr_length, - dlp->uderror_ind.dl_errno); - putnext(q, mp); - return; - default: - DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, - union DL_primitives *, dlp); - putnext(q, mp); - return; - } - freemsg(mp); -} - -static void -ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr, - uint32_t proto_addr_len) -{ - uchar_t *mask, *to; - int len; - - ASSERT(ace->ace_hw_addr != NULL); - - bcopy(ace->ace_hw_addr, addrpos, ace->ace_hw_addr_length); - if (ace->ace_flags & ACE_F_MAPPING && - proto_addr != NULL && - ace->ace_proto_extract_mask) { /* careful */ - len = MIN((int)ace->ace_hw_addr_length - - ace->ace_hw_extract_start, - 
proto_addr_len); - mask = ace->ace_proto_extract_mask; - to = addrpos + ace->ace_hw_extract_start; - while (len-- > 0) - *to++ |= *mask++ & *proto_addr++; - } -} - -static int -ar_slifname(queue_t *q, mblk_t *mp_orig) -{ - ar_t *ar = q->q_ptr; - arl_t *arl = ar->ar_arl; - struct lifreq *lifr; - mblk_t *mp = mp_orig; - arl_t *old_arl; - mblk_t *ioccpy; - struct iocblk *iocp; - hook_nic_event_t info; - arp_stack_t *as = ar->ar_as; - - if (ar->ar_on_ill_stream) { - /* - * This command is for IP, since it is coming down - * the <arp-IP-driver> stream. Return ENOENT so that - * it will be sent downstream by the caller - */ - return (ENOENT); - } - /* We handle both M_IOCTL and M_PROTO messages */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - if (q->q_next == NULL || arl == NULL) { - /* - * If the interface was just opened and - * the info ack has not yet come back from the driver - */ - DTRACE_PROBE2(slifname_no_arl, queue_t *, q, - mblk_t *, mp_orig); - (void) putq(q, mp_orig); - return (EINPROGRESS); - } - - if (MBLKL(mp) < sizeof (struct lifreq)) { - DTRACE_PROBE2(slifname_malformed, queue_t *, q, - mblk_t *, mp); - } - - if (arl->arl_name[0] != '\0') { - DTRACE_PROBE1(slifname_already, arl_t *, arl); - return (EALREADY); - } - - lifr = (struct lifreq *)mp->b_rptr; - - if (strlen(lifr->lifr_name) >= LIFNAMSIZ) { - DTRACE_PROBE2(slifname_bad_name, arl_t *, arl, - struct lifreq *, lifr); - return (ENXIO); - } - - /* Check whether the name is already in use. */ - - old_arl = ar_ll_lookup_by_name(as, lifr->lifr_name); - if (old_arl != NULL) { - DTRACE_PROBE2(slifname_exists, arl_t *, arl, arl_t *, old_arl); - return (EEXIST); - } - - /* Make a copy of the message so we can send it downstream. 
*/ - if ((ioccpy = allocb(sizeof (struct iocblk), BPRI_MED)) == NULL || - (ioccpy->b_cont = copymsg(mp)) == NULL) { - if (ioccpy != NULL) - freeb(ioccpy); - return (ENOMEM); - } - - (void) strlcpy(arl->arl_name, lifr->lifr_name, sizeof (arl->arl_name)); - - /* The ppa is sent down by ifconfig */ - arl->arl_ppa = lifr->lifr_ppa; - - /* - * A network device is not considered to be fully plumb'd until - * its name has been set using SIOCSLIFNAME. Once it has - * been set, it cannot be set again (see code above), so there - * is currently no danger in this function causing two NE_PLUMB - * events without an intervening NE_UNPLUMB. - */ - info.hne_nic = arl->arl_index; - info.hne_lif = 0; - info.hne_event = NE_PLUMB; - info.hne_data = arl->arl_name; - info.hne_datalen = strlen(arl->arl_name); - (void) hook_run(as->as_net_data->netd_hooks, as->as_arpnicevents, - (hook_data_t)&info); - - /* Chain in the new arl. */ - rw_enter(&as->as_arl_lock, RW_WRITER); - arl->arl_next = as->as_arl_head; - as->as_arl_head = arl; - rw_exit(&as->as_arl_lock); - DTRACE_PROBE1(slifname_set, arl_t *, arl); - - /* - * Send along a copy of the ioctl; this is just for hitbox. Use - * M_CTL to avoid confusing anyone else who might be listening. - */ - DB_TYPE(ioccpy) = M_CTL; - iocp = (struct iocblk *)ioccpy->b_rptr; - bzero(iocp, sizeof (*iocp)); - iocp->ioc_cmd = SIOCSLIFNAME; - iocp->ioc_count = msgsize(ioccpy->b_cont); - ioccpy->b_wptr = (uchar_t *)(iocp + 1); - putnext(arl->arl_wq, ioccpy); - - return (0); -} - -static int -ar_set_ppa(queue_t *q, mblk_t *mp_orig) -{ - ar_t *ar = (ar_t *)q->q_ptr; - arl_t *arl = ar->ar_arl; - int ppa; - char *cp; - mblk_t *mp = mp_orig; - arl_t *old_arl; - arp_stack_t *as = ar->ar_as; - - if (ar->ar_on_ill_stream) { - /* - * This command is for IP, since it is coming down - * the <arp-IP-driver> stream. Return ENOENT so that - * it will be sent downstream by the caller - */ - return (ENOENT); - } - - /* We handle both M_IOCTL and M_PROTO messages. 
*/ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - if (q->q_next == NULL || arl == NULL) { - /* - * If the interface was just opened and - * the info ack has not yet come back from the driver. - */ - DTRACE_PROBE2(setppa_no_arl, queue_t *, q, - mblk_t *, mp_orig); - (void) putq(q, mp_orig); - return (EINPROGRESS); - } - - if (arl->arl_name[0] != '\0') { - DTRACE_PROBE1(setppa_already, arl_t *, arl); - return (EALREADY); - } - - do { - q = q->q_next; - } while (q->q_next != NULL); - cp = q->q_qinfo->qi_minfo->mi_idname; - - ppa = *(int *)(mp->b_rptr); - (void) snprintf(arl->arl_name, sizeof (arl->arl_name), "%s%d", cp, ppa); - - old_arl = ar_ll_lookup_by_name(as, arl->arl_name); - if (old_arl != NULL) { - DTRACE_PROBE2(setppa_exists, arl_t *, arl, arl_t *, old_arl); - /* Make it a null string again */ - arl->arl_name[0] = '\0'; - return (EBUSY); - } - - arl->arl_ppa = ppa; - DTRACE_PROBE1(setppa_done, arl_t *, arl); - /* Chain in the new arl. */ - rw_enter(&as->as_arl_lock, RW_WRITER); - arl->arl_next = as->as_arl_head; - as->as_arl_head = arl; - rw_exit(&as->as_arl_lock); - - return (0); -} - -static int -ar_snmp_msg(queue_t *q, mblk_t *mp_orig) -{ - mblk_t *mpdata, *mp = mp_orig; - struct opthdr *optp; - msg2_args_t args; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (mp == NULL) - return (0); - /* - * ar_cmd_dispatch() already checked for us that "mp->b_cont" is valid - * in case of an M_IOCTL message. - */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - - optp = (struct opthdr *)(&mp->b_rptr[sizeof (struct T_optmgmt_ack)]); - if (optp->level == MIB2_IP && optp->name == MIB2_IP_MEDIA) { - /* - * Put our ARP cache entries in the ipNetToMediaTable mp from - * IP. Due to a historical side effect of IP's MIB code, it - * always passes us a b_cont, but the b_cont should be empty. 
- */ - if ((mpdata = mp->b_cont) == NULL || MBLKL(mpdata) != 0) - return (EINVAL); - - args.m2a_mpdata = mpdata; - args.m2a_mptail = NULL; - ar_ce_walk(as, ar_snmp_msg2, &args); - optp->len = msgdsize(mpdata); - } - putnext(q, mp_orig); - return (EINPROGRESS); /* so that rput() exits doing nothing... */ -} - -static void -ar_snmp_msg2(ace_t *ace, void *arg) -{ - const char *name = "unknown"; - mib2_ipNetToMediaEntry_t ntme; - msg2_args_t *m2ap = arg; - - ASSERT(ace != NULL && ace->ace_arl != NULL); - if (ace->ace_arl != NULL) - name = ace->ace_arl->arl_name; - - /* - * Fill in ntme using the information in the ACE. - */ - ntme.ipNetToMediaType = (ace->ace_flags & ACE_F_PERMANENT) ? 4 : 3; - ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name)); - bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes, - ntme.ipNetToMediaIfIndex.o_length); - - bcopy(ace->ace_proto_addr, &ntme.ipNetToMediaNetAddress, - MIN(sizeof (uint32_t), ace->ace_proto_addr_length)); - - ntme.ipNetToMediaInfo.ntm_mask.o_length = - MIN(OCTET_LENGTH, ace->ace_proto_addr_length); - bcopy(ace->ace_proto_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes, - ntme.ipNetToMediaInfo.ntm_mask.o_length); - ntme.ipNetToMediaInfo.ntm_flags = ace->ace_flags; - - ntme.ipNetToMediaPhysAddress.o_length = - MIN(OCTET_LENGTH, ace->ace_hw_addr_length); - if ((ace->ace_flags & ACE_F_RESOLVED) == 0) - ntme.ipNetToMediaPhysAddress.o_length = 0; - bcopy(ace->ace_hw_addr, ntme.ipNetToMediaPhysAddress.o_bytes, - ntme.ipNetToMediaPhysAddress.o_length); - - /* - * All entries within the ARP cache are unique, and there are no - * preexisting entries in the ipNetToMediaTable mp, so just add 'em. - */ - (void) snmp_append_data2(m2ap->m2a_mpdata, &m2ap->m2a_mptail, - (char *)&ntme, sizeof (ntme)); -} - -/* Write side put procedure. 
*/ -static void -ar_wput(queue_t *q, mblk_t *mp) -{ - int err; - struct iocblk *ioc; - mblk_t *mp1; - - TRACE_1(TR_FAC_ARP, TR_ARP_WPUT_START, - "arp_wput_start: q %p", q); - - /* - * Here we handle ARP commands coming from controlling processes - * either in the form of M_IOCTL messages, or M_PROTO messages. - */ - switch (DB_TYPE(mp)) { - case M_IOCTL: - switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) { - case ENOENT: - /* - * If it is an I_PLINK, process it. Otherwise - * we don't recognize it, so pass it down. - * Since ARP is a module there is always someone - * below. - */ - ASSERT(q->q_next != NULL); - ioc = (struct iocblk *)mp->b_rptr; - if ((ioc->ioc_cmd != I_PLINK) && - (ioc->ioc_cmd != I_PUNLINK)) { - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", - q, "ioctl/enoent"); - return; - } - err = ar_plink_send(q, mp); - if (err == 0) { - return; - } - if ((mp1 = mp->b_cont) != 0) - mp1->b_wptr = mp1->b_rptr; - break; - case EINPROGRESS: - /* - * If the request resulted in an attempt to resolve - * an address, we return out here. The IOCTL will - * be completed in ar_rput if something comes back, - * or as a result of the timer expiring. - */ - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "inprog"); - return; - default: - DB_TYPE(mp) = M_IOCACK; - break; - } - ioc = (struct iocblk *)mp->b_rptr; - if (err != 0) - ioc->ioc_error = err; - if (ioc->ioc_error != 0) { - /* - * Don't free b_cont as IP/IB needs - * it to identify the request. 
- */ - DB_TYPE(mp) = M_IOCNAK; - } - ioc->ioc_count = msgdsize(mp->b_cont); - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "ioctl"); - return; - case M_FLUSH: - if (*mp->b_rptr & FLUSHW) - flushq(q, FLUSHDATA); - if (*mp->b_rptr & FLUSHR) { - flushq(RD(q), FLUSHDATA); - *mp->b_rptr &= ~FLUSHW; - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "flush"); - return; - } - /* - * The normal behavior of a STREAMS module should be - * to pass down M_FLUSH messages. However there is a - * complex sequence of events during plumb/unplumb that - * can cause DLPI messages in the driver's queue to be - * flushed. So we don't send down M_FLUSH. This has been - * reported for some drivers (Eg. le) that send up an M_FLUSH - * in response to unbind request which will eventually be - * looped back at the mux head and sent down. Since IP - * does not queue messages in a module instance queue - * of IP, nothing is lost by not sending down the flush. - */ - freemsg(mp); - return; - case M_PROTO: - case M_PCPROTO: - /* - * Commands in the form of PROTO messages are handled very - * much the same as IOCTLs, but no response is returned. - */ - switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) { - case ENOENT: - if (q->q_next) { - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, - "proto/enoent"); - return; - } - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "proto/einprog"); - return; - default: - break; - } - break; - case M_IOCDATA: - /* - * We pass M_IOCDATA downstream because it could be as a - * result of a previous M_COPYIN/M_COPYOUT message sent - * upstream. - */ - /* FALLTHRU */ - case M_CTL: - /* - * We also send any M_CTL downstream as it could - * contain control information for a module downstream. 
- */ - putnext(q, mp); - return; - default: - break; - } - /* Free any message we don't understand */ - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "end"); -} - -static boolean_t -arp_say_ready(ace_t *ace) -{ - mblk_t *mp; - arl_t *arl = ace->ace_arl; - arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - arh_t *arh; - uchar_t *cp; - - mp = allocb(sizeof (*arh) + 2 * (ace->ace_hw_addr_length + - ace->ace_proto_addr_length), BPRI_MED); - if (mp == NULL) { - /* skip a beat on allocation trouble */ - ace->ace_xmit_count = 1; - ace_set_timer(ace, B_FALSE); - return (B_FALSE); - } - /* Tell IP address is now usable */ - arh = (arh_t *)mp->b_rptr; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(ace->ace_proto, arh->arh_proto); - arh->arh_hlen = ace->ace_hw_addr_length; - arh->arh_plen = ace->ace_proto_addr_length; - U16_TO_BE16(ARP_REQUEST, arh->arh_operation); - cp = (uchar_t *)(arh + 1); - bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length); - cp += ace->ace_hw_addr_length; - bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); - cp += ace->ace_proto_addr_length; - bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length); - cp += ace->ace_hw_addr_length; - bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); - cp += ace->ace_proto_addr_length; - mp->b_wptr = cp; - ar_client_notify(arl, mp, AR_CN_READY); - DTRACE_PROBE1(ready, ace_t *, ace); - return (B_TRUE); -} - -/* - * Pick the longest-waiting aces for defense. - */ -static void -ace_reschedule(ace_t *ace, void *arg) -{ - ace_resched_t *art = arg; - ace_t **aces; - ace_t **acemax; - ace_t *atemp; - - if (ace->ace_xmit_arl != art->art_arl) - return; - /* - * Only published entries that are ready for announcement are eligible. 
- */ - if ((ace->ace_flags & (ACE_F_PUBLISH | ACE_F_UNVERIFIED | ACE_F_DYING | - ACE_F_DELAYED)) != ACE_F_PUBLISH) { - return; - } - if (art->art_naces < ACE_RESCHED_LIST_LEN) { - art->art_aces[art->art_naces++] = ace; - } else { - aces = art->art_aces; - acemax = aces + ACE_RESCHED_LIST_LEN; - for (; aces < acemax; aces++) { - if ((*aces)->ace_last_bcast > ace->ace_last_bcast) { - atemp = *aces; - *aces = ace; - ace = atemp; - } - } - } -} - -/* - * Reschedule the ARP defense of any long-waiting ACEs. It's assumed that this - * doesn't happen very often (if at all), and thus it needn't be highly - * optimized. (Note, though, that it's actually O(N) complexity, because the - * outer loop is bounded by a constant rather than by the length of the list.) - */ -static void -arl_reschedule(arl_t *arl) -{ - arlphy_t *ap = arl->arl_phy; - ace_resched_t art; - int i; - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - i = ap->ap_defend_count; - ap->ap_defend_count = 0; - /* If none could be sitting around, then don't reschedule */ - if (i < as->as_defend_rate) { - DTRACE_PROBE1(reschedule_none, arl_t *, arl); - return; - } - art.art_arl = arl; - while (ap->ap_defend_count < as->as_defend_rate) { - art.art_naces = 0; - ar_ce_walk(as, ace_reschedule, &art); - for (i = 0; i < art.art_naces; i++) { - ace = art.art_aces[i]; - ace->ace_flags |= ACE_F_DELAYED; - ace_set_timer(ace, B_FALSE); - if (++ap->ap_defend_count >= as->as_defend_rate) - break; - } - if (art.art_naces < ACE_RESCHED_LIST_LEN) - break; - } - DTRACE_PROBE1(reschedule, arl_t *, arl); -} - -/* - * Write side service routine. The only action here is delivery of transmit - * timer events and delayed messages while waiting for the info_ack (ar_arl - * not yet set). 
- */ -static void -ar_wsrv(queue_t *q) -{ - ace_t *ace; - arlphy_t *ap; - mblk_t *mp; - clock_t ms; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_START, - "arp_wsrv_start: q %p", q); - - while ((mp = getq(q)) != NULL) { - switch (DB_TYPE(mp)) { - case M_PCSIG: - if (!mi_timer_valid(mp)) - continue; - ace = (ace_t *)mp->b_rptr; - if (ace->ace_flags & ACE_F_DYING) - continue; - ap = ace->ace_xmit_arl->arl_phy; - if (ace->ace_flags & ACE_F_UNVERIFIED) { - ASSERT(ace->ace_flags & ACE_F_PUBLISH); - ASSERT(ace->ace_query_mp == NULL); - /* - * If the link is down, give up for now. IP - * will give us the go-ahead to try again when - * the link restarts. - */ - if (ap->ap_link_down) { - DTRACE_PROBE1(timer_link_down, - ace_t *, ace); - ace->ace_flags |= ACE_F_DAD_ABORTED; - continue; - } - if (ace->ace_xmit_count > 0) { - DTRACE_PROBE1(timer_probe, - ace_t *, ace); - ace->ace_xmit_count--; - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - ace->ace_proto, - ace->ace_proto_addr_length, - ace->ace_hw_addr, NULL, NULL, - ace->ace_proto_addr, NULL, as); - ace_set_timer(ace, B_FALSE); - continue; - } - if (!arp_say_ready(ace)) - continue; - DTRACE_PROBE1(timer_ready, ace_t *, ace); - ace->ace_xmit_interval = - as->as_publish_interval; - ace->ace_xmit_count = as->as_publish_count; - if (ace->ace_xmit_count == 0) - ace->ace_xmit_count++; - ace->ace_flags &= ~ACE_F_UNVERIFIED; - } - if (ace->ace_flags & ACE_F_PUBLISH) { - clock_t now; - - /* - * If an hour has passed, then free up the - * entries that need defense by rescheduling - * them. - */ - now = ddi_get_lbolt(); - if (as->as_defend_rate > 0 && - now - ap->ap_defend_start > - SEC_TO_TICK(as->as_defend_period)) { - ap->ap_defend_start = now; - arl_reschedule(ace->ace_xmit_arl); - } - /* - * Finish the job that we started in - * ar_entry_add. When we get to zero - * announcement retransmits left, switch to - * address defense. 
- */ - ASSERT(ace->ace_query_mp == NULL); - if (ace->ace_xmit_count > 0) { - ace->ace_xmit_count--; - DTRACE_PROBE1(timer_announce, - ace_t *, ace); - } else if (ace->ace_flags & ACE_F_DELAYED) { - /* - * This guy was rescheduled as one of - * the really old entries needing - * on-going defense. Let him through - * now. - */ - DTRACE_PROBE1(timer_send_delayed, - ace_t *, ace); - ace->ace_flags &= ~ACE_F_DELAYED; - } else if (as->as_defend_rate > 0 && - (ap->ap_defend_count >= - as->as_defend_rate || - ++ap->ap_defend_count >= - as->as_defend_rate)) { - /* - * If we're no longer allowed to send - * unbidden defense messages, then just - * wait for rescheduling. - */ - DTRACE_PROBE1(timer_excess_defense, - ace_t *, ace); - ace_set_timer(ace, B_FALSE); - continue; - } else { - DTRACE_PROBE1(timer_defend, - ace_t *, ace); - } - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - ace->ace_proto, - ace->ace_proto_addr_length, - ace->ace_hw_addr, - ace->ace_proto_addr, - ace->ace_xmit_arl->arl_phy->ap_arp_addr, - ace->ace_proto_addr, NULL, as); - ace->ace_last_bcast = now; - if (ace->ace_xmit_count == 0) - ace->ace_xmit_interval = - as->as_defend_interval; - if (ace->ace_xmit_interval != 0) - ace_set_timer(ace, B_FALSE); - continue; - } - - /* - * If this is a non-permanent (regular) resolved ARP - * entry, then it's now time to check if it can be - * retired. As an optimization, we check with IP - * first, and just restart the timer if the address is - * still in use. - */ - if (ACE_NONPERM(ace)) { - if (ace->ace_proto == IP_ARP_PROTO_TYPE && - ndp_lookup_ipaddr(*(ipaddr_t *) - ace->ace_proto_addr, as->as_netstack)) { - ace->ace_flags |= ACE_F_OLD; - mi_timer(ace->ace_arl->arl_wq, - ace->ace_mp, - as->as_cleanup_interval); - } else { - ar_delete_notify(ace); - ar_ce_delete(ace); - } - continue; - } - - /* - * ar_query_xmit returns the number of milliseconds to - * wait following this transmit. 
If the number of - * allowed transmissions has been exhausted, it will - * return zero without transmitting. If that happens - * we complete the operation with a failure indication. - * Otherwise, we restart the timer. - */ - ms = ar_query_xmit(as, ace); - if (ms == 0) - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - else - mi_timer(q, mp, ms); - continue; - default: - put(q, mp); - continue; - } - } - TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_END, - "arp_wsrv_end: q %p", q); -} - -/* ar_xmit is called to transmit an ARP Request or Response. */ -static void -ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, - const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, - const uchar_t *paddr2, const uchar_t *dstaddr, arp_stack_t *as) -{ - arh_t *arh; - uint8_t *cp; - uint_t hlen; - mblk_t *mp; - arlphy_t *ap = arl->arl_phy; - - ASSERT(!(arl->arl_flags & ARL_F_IPMP)); - - if (ap == NULL) { - DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl); - return; - } - - /* IFF_NOARP flag is set or link down: do not send arp messages */ - if ((arl->arl_flags & ARL_F_NOARP) || ap->ap_link_down) - return; - - hlen = ap->ap_hw_addrlen; - if ((mp = copyb(ap->ap_xmit_mp)) == NULL) - return; - - mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + - plen + plen, BPRI_MED); - if (mp->b_cont == NULL) { - freeb(mp); - return; - } - - /* Get the L2 destination address for the message */ - if (haddr2 == NULL) - dstaddr = ap->ap_arp_addr; - else if (dstaddr == NULL) - dstaddr = haddr2; - - /* - * Figure out where the target hardware address goes in the - * DL_UNITDATA_REQ header, and copy it in. - */ - cp = mi_offset_param(mp, ap->ap_xmit_addroff, hlen); - ASSERT(cp != NULL); - if (cp == NULL) { - freemsg(mp); - return; - } - bcopy(dstaddr, cp, hlen); - - /* Fill in the ARP header. 
*/ - cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); - mp->b_cont->b_rptr = cp; - arh = (arh_t *)cp; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(proto, arh->arh_proto); - arh->arh_hlen = (uint8_t)hlen; - arh->arh_plen = (uint8_t)plen; - U16_TO_BE16(operation, arh->arh_operation); - cp += ARH_FIXED_LEN; - bcopy(haddr1, cp, hlen); - cp += hlen; - if (paddr1 == NULL) - bzero(cp, plen); - else - bcopy(paddr1, cp, plen); - cp += plen; - if (haddr2 == NULL) - bzero(cp, hlen); - else - bcopy(haddr2, cp, hlen); - cp += hlen; - bcopy(paddr2, cp, plen); - cp += plen; - mp->b_cont->b_wptr = cp; - - DTRACE_PROBE3(arp__physical__out__start, - arl_t *, arl, arh_t *, arh, mblk_t *, mp); - - ARP_HOOK_OUT(as->as_arp_physical_out_event, as->as_arp_physical_out, - arl->arl_index, arh, mp, mp->b_cont, as); - - DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp); - - if (mp == NULL) - return; - - /* Ship it out. */ - if (canputnext(arl->arl_wq)) - putnext(arl->arl_wq, mp); - else - freemsg(mp); -} - -static mblk_t * -ar_alloc(uint32_t cmd, int err) -{ - uint32_t len; - mblk_t *mp; - mblk_t *mp1; - char *cp; - arc_t *arc; - - /* For now only one type of command is accepted */ - if (cmd != AR_DLPIOP_DONE) - return (NULL); - len = sizeof (arc_t); - mp = allocb(len, BPRI_HI); - if (!mp) - return (NULL); - - DB_TYPE(mp) = M_CTL; - cp = (char *)mp->b_rptr; - arc = (arc_t *)(mp->b_rptr); - arc->arc_cmd = cmd; - mp->b_wptr = (uchar_t *)&cp[len]; - len = sizeof (int); - mp1 = allocb(len, BPRI_HI); - if (!mp1) { - freeb(mp); - return (NULL); - } - cp = (char *)mp->b_rptr; - /* Initialize the error code */ - *((int *)mp1->b_rptr) = err; - mp1->b_wptr = (uchar_t *)&cp[len]; - linkb(mp, mp1); - return (mp); -} - -void -arp_ddi_init(void) -{ - /* - * We want to be informed each time a stack is created or - * destroyed in the kernel, so we can maintain the - * set of arp_stack_t's. 
- */ - netstack_register(NS_ARP, arp_stack_init, arp_stack_shutdown, - arp_stack_fini); -} - -void -arp_ddi_destroy(void) -{ - netstack_unregister(NS_ARP); -} - -/* - * Initialize the ARP stack instance. - */ -/* ARGSUSED */ -static void * -arp_stack_init(netstackid_t stackid, netstack_t *ns) -{ - arp_stack_t *as; - arpparam_t *pa; - - as = (arp_stack_t *)kmem_zalloc(sizeof (*as), KM_SLEEP); - as->as_netstack = ns; - - pa = (arpparam_t *)kmem_alloc(sizeof (arp_param_arr), KM_SLEEP); - as->as_param_arr = pa; - bcopy(arp_param_arr, as->as_param_arr, sizeof (arp_param_arr)); - - (void) ar_param_register(&as->as_nd, - as->as_param_arr, A_CNT(arp_param_arr)); - - as->as_arp_index_counter = 1; - as->as_arp_counter_wrapped = 0; - - rw_init(&as->as_arl_lock, NULL, RW_DRIVER, NULL); - arp_net_init(as, stackid); - arp_hook_init(as); - - return (as); -} - -/* ARGSUSED */ -static void -arp_stack_shutdown(netstackid_t stackid, void *arg) -{ - arp_stack_t *as = (arp_stack_t *)arg; - - arp_net_shutdown(as); -} - -/* - * Free the ARP stack instance. - */ -/* ARGSUSED */ -static void -arp_stack_fini(netstackid_t stackid, void *arg) -{ - arp_stack_t *as = (arp_stack_t *)arg; - - arp_hook_destroy(as); - arp_net_destroy(as); - rw_destroy(&as->as_arl_lock); - nd_free(&as->as_nd); - kmem_free(as->as_param_arr, sizeof (arp_param_arr)); - as->as_param_arr = NULL; - kmem_free(as, sizeof (*as)); -} diff --git a/usr/src/uts/common/inet/arp/arp_netinfo.c b/usr/src/uts/common/inet/arp/arp_netinfo.c deleted file mode 100644 index 9d9c6a5bbe..0000000000 --- a/usr/src/uts/common/inet/arp/arp_netinfo.c +++ /dev/null @@ -1,376 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/param.h> -#include <sys/types.h> -#include <sys/systm.h> -#include <sys/cmn_err.h> -#include <sys/stream.h> -#include <sys/sunddi.h> -#include <sys/hook.h> -#include <sys/hook_impl.h> -#include <sys/netstack.h> -#include <net/if.h> - -#include <sys/neti.h> -#include <sys/hook_event.h> -#include <inet/arp_impl.h> - -/* - * ARP netinfo entry point declarations. - */ -static int arp_getifname(net_handle_t, phy_if_t, char *, const size_t); -static int arp_getmtu(net_handle_t, phy_if_t, lif_if_t); -static int arp_getpmtuenabled(net_handle_t); -static int arp_getlifaddr(net_handle_t, phy_if_t, lif_if_t, size_t, - net_ifaddr_t [], void *); -static int arp_getlifzone(net_handle_t, phy_if_t, lif_if_t, zoneid_t *); -static int arp_getlifflags(net_handle_t, phy_if_t, lif_if_t, uint64_t *); -static phy_if_t arp_phygetnext(net_handle_t, phy_if_t); -static phy_if_t arp_phylookup(net_handle_t, const char *); -static lif_if_t arp_lifgetnext(net_handle_t, phy_if_t, lif_if_t); -static int arp_inject(net_handle_t, inject_t, net_inject_t *); -static phy_if_t arp_routeto(net_handle_t, struct sockaddr *, struct sockaddr *); -static int arp_ispartialchecksum(net_handle_t, mblk_t *); -static int arp_isvalidchecksum(net_handle_t, mblk_t *); - -static net_protocol_t arp_netinfo = { - NETINFO_VERSION, - NHF_ARP, - arp_getifname, - arp_getmtu, - arp_getpmtuenabled, - arp_getlifaddr, - 
arp_getlifzone, - arp_getlifflags, - arp_phygetnext, - arp_phylookup, - arp_lifgetnext, - arp_inject, - arp_routeto, - arp_ispartialchecksum, - arp_isvalidchecksum -}; - -/* - * Register ARP netinfo functions. - */ -void -arp_net_init(arp_stack_t *as, netstackid_t stackid) -{ - netid_t id; - - id = net_getnetidbynetstackid(stackid); - ASSERT(id != -1); - - as->as_net_data = net_protocol_register(id, &arp_netinfo); - ASSERT(as->as_net_data != NULL); -} - -void -arp_net_shutdown(arp_stack_t *as) -{ - if (as->as_arpnicevents != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_nic_events); - } - - if (as->as_arp_physical_out != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_physical_out_event); - } - - if (as->as_arp_physical_in != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_physical_in_event); - } - - (void) net_family_shutdown(as->as_net_data, &as->as_arproot); -} - -/* - * Unregister ARP netinfo functions. - */ -void -arp_net_destroy(arp_stack_t *as) -{ - if (net_protocol_unregister(as->as_net_data) == 0) - as->as_net_data = NULL; -} - -/* - * Initialize ARP hook family and events - */ -void -arp_hook_init(arp_stack_t *as) -{ - HOOK_FAMILY_INIT(&as->as_arproot, Hn_ARP); - if (net_family_register(as->as_net_data, &as->as_arproot) != 0) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_family_register failed for arp"); - } - - HOOK_EVENT_INIT(&as->as_arp_physical_in_event, NH_PHYSICAL_IN); - as->as_arp_physical_in = net_event_register(as->as_net_data, - &as->as_arp_physical_in_event); - if (as->as_arp_physical_in == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/physical_in"); - } - - HOOK_EVENT_INIT(&as->as_arp_physical_out_event, NH_PHYSICAL_OUT); - as->as_arp_physical_out = net_event_register(as->as_net_data, - &as->as_arp_physical_out_event); - if (as->as_arp_physical_out == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/physical_out"); - 
} - - HOOK_EVENT_INIT(&as->as_arp_nic_events, NH_NIC_EVENTS); - as->as_arpnicevents = net_event_register(as->as_net_data, - &as->as_arp_nic_events); - if (as->as_arpnicevents == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/nic_events"); - } -} - -void -arp_hook_destroy(arp_stack_t *as) -{ - if (as->as_arpnicevents != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_nic_events) == 0) - as->as_arpnicevents = NULL; - } - - if (as->as_arp_physical_out != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_physical_out_event) == 0) - as->as_arp_physical_out = NULL; - } - - if (as->as_arp_physical_in != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_physical_in_event) == 0) - as->as_arp_physical_in = NULL; - } - - (void) net_family_unregister(as->as_net_data, &as->as_arproot); -} - -/* - * Determine the name of the lower level interface - */ -static int -arp_getifname(net_handle_t net, phy_if_t phy_ifdata, char *buffer, - const size_t buflen) -{ - arl_t *arl; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(buffer != NULL); - ASSERT(ns != NULL); - - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - if (arl->arl_index == phy_ifdata) { - (void) strlcpy(buffer, arl->arl_name, buflen); - rw_exit(&as->as_arl_lock); - return (0); - } - } - rw_exit(&as->as_arl_lock); - - return (1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getmtu(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getpmtuenabled(net_handle_t net) -{ - return (-1); -} - -/* - * Unsupported with ARP. 
- */ -/*ARGSUSED*/ -static int -arp_getlifaddr(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - size_t nelem, net_ifaddr_t type[], void *storage) -{ - return (-1); -} - -/* - * Determine the instance number of the next lower level interface - */ -static phy_if_t -arp_phygetnext(net_handle_t net, phy_if_t phy_ifdata) -{ - arl_t *arl; - int index; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(ns != NULL); - - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - if (phy_ifdata == 0) { - arl = as->as_arl_head; - } else { - for (arl = as->as_arl_head; arl != NULL; - arl = arl->arl_next) { - if (arl->arl_index == phy_ifdata) { - arl = arl->arl_next; - break; - } - } - } - - index = (arl != NULL) ? arl->arl_index : 0; - - rw_exit(&as->as_arl_lock); - - return (index); -} - -/* - * Given a network interface name, find its ARP layer instance number. - */ -static phy_if_t -arp_phylookup(net_handle_t net, const char *name) -{ - arl_t *arl; - int index; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(name != NULL); - ASSERT(ns != NULL); - - index = 0; - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - if (strcmp(name, arl->arl_name) == 0) { - index = arl->arl_index; - break; - } - } - rw_exit(&as->as_arl_lock); - - return (index); - -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static lif_if_t -arp_lifgetnext(net_handle_t net, phy_if_t ifp, lif_if_t lif) -{ - return ((lif_if_t)-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_inject(net_handle_t net, inject_t injection, net_inject_t *neti) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static phy_if_t -arp_routeto(net_handle_t net, struct sockaddr *addr, struct sockaddr *next) -{ - return ((phy_if_t)-1); -} - -/* - * Unsupported with ARP. 
- */ -/*ARGSUSED*/ -int -arp_ispartialchecksum(net_handle_t net, mblk_t *mb) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_isvalidchecksum(net_handle_t net, mblk_t *mb) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getlifzone(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - zoneid_t *zoneid) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getlifflags(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - uint64_t *flags) -{ - return (-1); -} diff --git a/usr/src/uts/common/inet/arp/arpddi.c b/usr/src/uts/common/inet/arp/arpddi.c index 2cc56b77fd..de8333295b 100644 --- a/usr/src/uts/common/inet/arp/arpddi.c +++ b/usr/src/uts/common/inet/arp/arpddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -27,10 +27,8 @@ #include <sys/types.h> #include <sys/conf.h> #include <sys/modctl.h> -#include <sys/ksynch.h> #include <inet/common.h> #include <inet/ip.h> -#include <inet/arp_impl.h> #define INET_NAME "arp" #define INET_MODDESC "ARP STREAMS module" @@ -39,28 +37,16 @@ #define INET_DEVSTRTAB ipinfov4 #define INET_MODSTRTAB arpinfo #define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since as a driver we're ip */ -#define INET_MODMTFLAGS (D_MP | D_MTPERMOD) +#define INET_MODMTFLAGS D_MP #include "../inetddi.c" -extern void arp_ddi_init(void); -extern void arp_ddi_destroy(void); - int _init(void) { int error; - /* - * Note: After mod_install succeeds, another thread can enter - * therefore all initialization is done before it and any - * de-initialization needed done if it fails. 
- */ - arp_ddi_init(); error = mod_install(&modlinkage); - if (error != 0) - arp_ddi_destroy(); - return (error); } @@ -70,8 +56,6 @@ _fini(void) int error; error = mod_remove(&modlinkage); - if (error == 0) - arp_ddi_destroy(); return (error); } diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h deleted file mode 100644 index 38d0d1ab65..0000000000 --- a/usr/src/uts/common/inet/arp_impl.h +++ /dev/null @@ -1,253 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _ARP_IMPL_H -#define _ARP_IMPL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL - -#include <sys/types.h> -#include <sys/stream.h> -#include <net/if.h> -#include <sys/netstack.h> - -/* ARP kernel hash size; used for mdb support */ -#define ARP_HASH_SIZE 256 - -/* Named Dispatch Parameter Management Structure */ -typedef struct arpparam_s { - uint32_t arp_param_min; - uint32_t arp_param_max; - uint32_t arp_param_value; - char *arp_param_name; -} arpparam_t; - -/* ARL Structure, one per link level device */ -typedef struct arl_s { - struct arl_s *arl_next; /* ARL chain at arl_g_head */ - queue_t *arl_rq; /* Read queue pointer */ - queue_t *arl_wq; /* Write queue pointer */ - t_uscalar_t arl_ppa; /* DL_ATTACH parameter */ - char arl_name[LIFNAMSIZ]; /* Lower level name */ - mblk_t *arl_unbind_mp; - mblk_t *arl_detach_mp; - t_uscalar_t arl_provider_style; /* From DL_INFO_ACK */ - mblk_t *arl_queue; /* Queued commands head */ - mblk_t *arl_queue_tail; /* Queued commands tail */ - uint32_t arl_flags; /* ARL_F_* values below */ - t_uscalar_t arl_dlpi_pending; /* pending DLPI request */ - mblk_t *arl_dlpi_deferred; /* Deferred DLPI messages */ - uint_t arl_state; /* lower interface state */ - uint_t arl_closing : 1, /* stream is closing */ - arl_replumbing : 1; /* Wait for IP to bring down */ - uint32_t arl_index; /* instance number */ - struct arlphy_s *arl_phy; /* physical info, if any */ - struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */ -} arl_t; - -/* - * There is no field to get from an arl_t to an arp_stack_t, but this - * macro does it. 
- */ -#define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as) - -/* ARL physical info structure, one per physical link level device */ -typedef struct arlphy_s { - uint32_t ap_arp_hw_type; /* hardware type */ - uchar_t *ap_arp_addr; /* multicast address to use */ - uchar_t *ap_hw_addr; /* hardware address */ - uint32_t ap_hw_addrlen; /* hardware address length */ - mblk_t *ap_xmit_mp; /* DL_UNITDATA_REQ template */ - t_uscalar_t ap_xmit_addroff; /* address offset in xmit_mp */ - t_uscalar_t ap_xmit_sapoff; /* sap offset in xmit_mp */ - t_scalar_t ap_saplen; /* sap length */ - clock_t ap_defend_start; /* start of 1-hour period */ - uint_t ap_defend_count; /* # of unbidden broadcasts */ - uint_t ap_notifies : 1, /* handles DL_NOTE_LINK */ - ap_link_down : 1; /* DL_NOTE status */ -} arlphy_t; - -/* ARP Cache Entry */ -typedef struct ace_s { - struct ace_s *ace_next; /* Hash chain next pointer */ - struct ace_s **ace_ptpn; /* Pointer to previous next */ - struct arl_s *ace_arl; /* Associated arl */ - uint32_t ace_proto; /* Protocol for this ace */ - uint32_t ace_flags; - uchar_t *ace_proto_addr; - uint32_t ace_proto_addr_length; - uchar_t *ace_proto_mask; /* Mask for matching addr */ - uchar_t *ace_proto_extract_mask; /* For mappings */ - uchar_t *ace_hw_addr; - uint32_t ace_hw_addr_length; - uint32_t ace_hw_extract_start; /* For mappings */ - mblk_t *ace_mp; /* mblk we are in */ - mblk_t *ace_query_mp; /* outstanding query chain */ - clock_t ace_last_bcast; /* last broadcast Response */ - clock_t ace_xmit_interval; - int ace_xmit_count; - arl_t *ace_xmit_arl; /* xmit on this arl */ -} ace_t; - -#define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \ - (as->as_arp_physical_in_event.he_interested) -#define ARPHOOK_INTERESTED_PHYSICAL_OUT(as) \ - (as->as_arp_physical_out_event.he_interested) - -#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, as) \ - \ - if ((_hook).he_interested) { \ - hook_pkt_event_t info; \ - \ - info.hpe_protocol = as->as_net_data; \ - 
info.hpe_ifp = _ilp; \ - info.hpe_ofp = 0; \ - info.hpe_hdr = _hdr; \ - info.hpe_mp = &(_fm); \ - info.hpe_mb = _m; \ - if (hook_run(as->as_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ - _hdr = NULL; \ - _m = NULL; \ - } else { \ - _hdr = info.hpe_hdr; \ - _m = info.hpe_mb; \ - } \ - } - -#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, as) \ - \ - if ((_hook).he_interested) { \ - hook_pkt_event_t info; \ - \ - info.hpe_protocol = as->as_net_data; \ - info.hpe_ifp = 0; \ - info.hpe_ofp = _olp; \ - info.hpe_hdr = _hdr; \ - info.hpe_mp = &(_fm); \ - info.hpe_mb = _m; \ - if (hook_run(as->as_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ - _hdr = NULL; \ - _m = NULL; \ - } else { \ - _hdr = info.hpe_hdr; \ - _m = info.hpe_mb; \ - } \ - } - -#define ACE_EXTERNAL_FLAGS_MASK \ - (ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MAPPING | ACE_F_MYADDR | \ - ACE_F_AUTHORITY) - -/* - * ARP stack instances - */ -struct arp_stack { - netstack_t *as_netstack; /* Common netstack */ - void *as_head; /* AR Instance Data List Head */ - caddr_t as_nd; /* AR Named Dispatch Head */ - struct arl_s *as_arl_head; /* ARL List Head */ - arpparam_t *as_param_arr; /* ndd variable table */ - - /* ARP Cache Entry Hash Table */ - ace_t *as_ce_hash_tbl[ARP_HASH_SIZE]; - ace_t *as_ce_mask_entries; - - /* - * With the introduction of netinfo (neti kernel module), - * it is now possible to access data structures in the ARP module - * without the code being executed in the context of the IP module, - * thus there is no locking being enforced through the use of STREAMS. - * as_arl_lock is used to protect as_arl_head list. 
- */ - krwlock_t as_arl_lock; - - uint32_t as_arp_index_counter; - uint32_t as_arp_counter_wrapped; - - /* arp_neti.c */ - hook_family_t as_arproot; - - /* - * Hooks for ARP - */ - hook_event_t as_arp_physical_in_event; - hook_event_t as_arp_physical_out_event; - hook_event_t as_arp_nic_events; - - hook_event_token_t as_arp_physical_in; - hook_event_token_t as_arp_physical_out; - hook_event_token_t as_arpnicevents; - - net_handle_t as_net_data; -}; -typedef struct arp_stack arp_stack_t; - -#define ARL_F_NOARP 0x01 -#define ARL_F_IPMP 0x02 - -#define ARL_S_DOWN 0x00 -#define ARL_S_PENDING 0x01 -#define ARL_S_UP 0x02 - -/* AR Structure, one per upper stream */ -typedef struct ar_s { - queue_t *ar_rq; /* Read queue pointer */ - queue_t *ar_wq; /* Write queue pointer */ - arl_t *ar_arl; /* Associated arl */ - cred_t *ar_credp; /* Credentials associated w/ open */ - struct ar_s *ar_arl_ip_assoc; /* ARL - IP association */ - uint32_t - ar_ip_acked_close : 1, /* IP has acked the close */ - ar_on_ill_stream : 1; /* Module below is IP */ - arp_stack_t *ar_as; -} ar_t; - -extern void arp_hook_init(arp_stack_t *); -extern void arp_hook_destroy(arp_stack_t *); -extern void arp_net_init(arp_stack_t *, netstackid_t); -extern void arp_net_shutdown(arp_stack_t *); -extern void arp_net_destroy(arp_stack_t *); - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _ARP_IMPL_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 5a7e05b210..88a14068bb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -55,8 +55,6 @@ extern "C" { #include <sys/squeue.h> #include <net/route.h> #include <sys/systm.h> -#include <sys/multidata.h> -#include <sys/list.h> #include <net/radix.h> #include <sys/modhash.h> @@ -94,6 +92,7 @@ typedef uint32_t ipaddr_t; /* Number of bits in an address */ #define IP_ABITS 32 +#define IPV4_ABITS IP_ABITS #define IPV6_ABITS 128 #define IP_HOST_MASK (ipaddr_t)0xffffffffU @@ -101,14 +100,6 @@ 
typedef uint32_t ipaddr_t; #define IP_CSUM(mp, off, sum) (~ip_cksum(mp, off, sum) & 0xFFFF) #define IP_CSUM_PARTIAL(mp, off, sum) ip_cksum(mp, off, sum) #define IP_BCSUM_PARTIAL(bp, len, sum) bcksum(bp, len, sum) -#define IP_MD_CSUM(pd, off, sum) (~ip_md_cksum(pd, off, sum) & 0xffff) -#define IP_MD_CSUM_PARTIAL(pd, off, sum) ip_md_cksum(pd, off, sum) - -/* - * Flag to IP write side to indicate that the appln has sent in a pre-built - * IP header. Stored in ipha_ident (which is otherwise zero). - */ -#define IP_HDR_INCLUDED 0xFFFF #define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64) #define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t)) @@ -137,17 +128,12 @@ typedef uint32_t ipaddr_t; #define UDPH_SIZE 8 -/* Leave room for ip_newroute to tack on the src and target addresses */ -#define OK_RESOLVER_MP(mp) \ - ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) - /* * Constants and type definitions to support IP IOCTL commands */ #define IP_IOCTL (('i'<<8)|'p') #define IP_IOC_IRE_DELETE 4 #define IP_IOC_IRE_DELETE_NO_REPLY 5 -#define IP_IOC_IRE_ADVISE_NO_REPLY 6 #define IP_IOC_RTS_REQUEST 7 /* Common definitions used by IP IOCTL data structures */ @@ -157,31 +143,6 @@ typedef struct ipllcmd_s { uint_t ipllc_name_length; } ipllc_t; -/* IP IRE Change Command Structure. 
*/ -typedef struct ipic_s { - ipllc_t ipic_ipllc; - uint_t ipic_ire_type; - uint_t ipic_max_frag; - uint_t ipic_addr_offset; - uint_t ipic_addr_length; - uint_t ipic_mask_offset; - uint_t ipic_mask_length; - uint_t ipic_src_addr_offset; - uint_t ipic_src_addr_length; - uint_t ipic_ll_hdr_offset; - uint_t ipic_ll_hdr_length; - uint_t ipic_gateway_addr_offset; - uint_t ipic_gateway_addr_length; - clock_t ipic_rtt; - uint32_t ipic_ssthresh; - clock_t ipic_rtt_sd; - uchar_t ipic_ire_marks; -} ipic_t; - -#define ipic_cmd ipic_ipllc.ipllc_cmd -#define ipic_ll_name_length ipic_ipllc.ipllc_name_length -#define ipic_ll_name_offset ipic_ipllc.ipllc_name_offset - /* IP IRE Delete Command Structure. */ typedef struct ipid_s { ipllc_t ipid_ipllc; @@ -257,16 +218,8 @@ typedef struct ipoptp_s #define Q_TO_ICMP(q) (Q_TO_CONN((q))->conn_icmp) #define Q_TO_RTS(q) (Q_TO_CONN((q))->conn_rts) -/* - * The following two macros are used by IP to get the appropriate - * wq and rq for a conn. If it is a TCP conn, then we need - * tcp_wq/tcp_rq else, conn_wq/conn_rq. IP can use conn_wq and conn_rq - * from a conn directly if it knows that the conn is not TCP. - */ -#define CONNP_TO_WQ(connp) \ - (IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq) - -#define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp)) +#define CONNP_TO_WQ(connp) ((connp)->conn_wq) +#define CONNP_TO_RQ(connp) ((connp)->conn_rq) #define GRAB_CONN_LOCK(q) { \ if (q != NULL && CONN_Q(q)) \ @@ -278,9 +231,6 @@ typedef struct ipoptp_s mutex_exit(&(Q_TO_CONN(q))->conn_lock); \ } -/* "Congestion controlled" protocol */ -#define IP_FLOW_CONTROLLED_ULP(p) ((p) == IPPROTO_TCP || (p) == IPPROTO_SCTP) - /* * Complete the pending operation. Usually an ioctl. Can also * be a bind or option management request that got enqueued @@ -295,63 +245,13 @@ typedef struct ipoptp_s } /* - * Flags for the various ip_fanout_* routines. 
- */ -#define IP_FF_SEND_ICMP 0x01 /* Send an ICMP error */ -#define IP_FF_HDR_COMPLETE 0x02 /* Call ip_hdr_complete if error */ -#define IP_FF_CKSUM 0x04 /* Recompute ipha_cksum if error */ -#define IP_FF_RAWIP 0x08 /* Use rawip mib variable */ -#define IP_FF_SRC_QUENCH 0x10 /* OK to send ICMP_SOURCE_QUENCH */ -#define IP_FF_SYN_ADDIRE 0x20 /* Add IRE if TCP syn packet */ -#define IP_FF_IPINFO 0x80 /* Used for both V4 and V6 */ -#define IP_FF_SEND_SLLA 0x100 /* Send source link layer info ? */ -#define IPV6_REACHABILITY_CONFIRMATION 0x200 /* Flags for ip_xmit_v6 */ -#define IP_FF_NO_MCAST_LOOP 0x400 /* No multicasts for sending zone */ - -/* - * Following flags are used by IPQoS to determine if policy processing is - * required. - */ -#define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */ -#define IP6_IN_LLMCAST 0x1000 /* Multicast */ - -#define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */ -#define IP_FF_SCTP_CSUM_ERR 0x4000 /* sctp pkt has failed chksum */ - -#ifndef IRE_DB_TYPE -#define IRE_DB_TYPE M_SIG -#endif - -#ifndef IRE_DB_REQ_TYPE -#define IRE_DB_REQ_TYPE M_PCSIG -#endif - -#ifndef IRE_ARPRESOLVE_TYPE -#define IRE_ARPRESOLVE_TYPE M_EVENT -#endif - -/* * Values for squeue switch: */ - #define IP_SQUEUE_ENTER_NODRAIN 1 #define IP_SQUEUE_ENTER 2 -/* - * This is part of the interface between Transport provider and - * IP which can be used to set policy information. This is usually - * accompanied with O_T_BIND_REQ/T_BIND_REQ.ip_bind assumes that - * only IPSEC_POLICY_SET is there when it is found in the chain. - * The information contained is an struct ipsec_req_t. On success - * or failure, either the T_BIND_ACK or the T_ERROR_ACK is returned. - * IPSEC_POLICY_SET is never returned. 
- */ -#define IPSEC_POLICY_SET M_SETOPTS +#define IP_SQUEUE_FILL 3 -#define IRE_IS_LOCAL(ire) ((ire != NULL) && \ - ((ire)->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) - -#define IRE_IS_TARGET(ire) ((ire != NULL) && \ - ((ire)->ire_type != IRE_BROADCAST)) +extern int ip_squeue_flag; /* IP Fragmentation Reassembly Header */ typedef struct ipf_s { @@ -387,71 +287,6 @@ typedef struct ipf_s { #define ipf_src V4_PART_OF_V6(ipf_v6src) #define ipf_dst V4_PART_OF_V6(ipf_v6dst) -typedef enum { - IB_PKT = 0x01, - OB_PKT = 0x02 -} ip_pkt_t; - -#define UPDATE_IB_PKT_COUNT(ire)\ - { \ - (ire)->ire_ib_pkt_count++; \ - if ((ire)->ire_ipif != NULL) { \ - /* \ - * forwarding packet \ - */ \ - if ((ire)->ire_type & (IRE_LOCAL|IRE_BROADCAST)) \ - atomic_add_32(&(ire)->ire_ipif->ipif_ib_pkt_count, 1);\ - else \ - atomic_add_32(&(ire)->ire_ipif->ipif_fo_pkt_count, 1);\ - } \ - } - -#define UPDATE_OB_PKT_COUNT(ire)\ - { \ - (ire)->ire_ob_pkt_count++;\ - if ((ire)->ire_ipif != NULL) { \ - atomic_add_32(&(ire)->ire_ipif->ipif_ob_pkt_count, 1); \ - } \ - } - -#define IP_RPUT_LOCAL(q, mp, ipha, ire, recv_ill) \ -{ \ - switch (ipha->ipha_protocol) { \ - case IPPROTO_UDP: \ - ip_udp_input(q, mp, ipha, ire, recv_ill); \ - break; \ - default: \ - ip_proto_input(q, mp, ipha, ire, recv_ill, 0); \ - break; \ - } \ -} - -/* - * NCE_EXPIRED is TRUE when we have a non-permanent nce that was - * found to be REACHABLE more than ip_ire_arp_interval ms ago. - * This macro is used to age existing nce_t entries. The - * nce's will get cleaned up in the following circumstances: - * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim - * when memory is low, - * - ip_arp_news, when updates are received. - * - if the nce is NCE_EXPIRED(), it will deleted, so that a new - * arp request will need to be triggered from an ND_INITIAL nce. - * - * Note that the nce state transition follows the pattern: - * ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE - * after which the nce is deleted when it has expired. 
- * - * nce_last is the timestamp that indicates when the nce_res_mp in the - * nce_t was last updated to a valid link-layer address. nce_last gets - * modified/updated : - * - when the nce is created - * - every time we get a sane arp response for the nce. - */ -#define NCE_EXPIRED(nce, ipst) (nce->nce_last > 0 && \ - ((nce->nce_flags & NCE_F_PERMANENT) == 0) && \ - ((TICK_TO_MSEC(lbolt64) - nce->nce_last) > \ - (ipst)->ips_ip_ire_arp_interval)) - #endif /* _KERNEL */ /* ICMP types */ @@ -560,7 +395,17 @@ typedef struct ipha_s { #define IPH_DF 0x4000 /* Don't fragment */ #define IPH_MF 0x2000 /* More fragments to come */ #define IPH_OFFSET 0x1FFF /* Where the offset lives */ -#define IPH_FRAG_HDR 0x8000 /* IPv6 don't fragment bit */ + +/* Byte-order specific values */ +#ifdef _BIG_ENDIAN +#define IPH_DF_HTONS 0x4000 /* Don't fragment */ +#define IPH_MF_HTONS 0x2000 /* More fragments to come */ +#define IPH_OFFSET_HTONS 0x1FFF /* Where the offset lives */ +#else +#define IPH_DF_HTONS 0x0040 /* Don't fragment */ +#define IPH_MF_HTONS 0x0020 /* More fragments to come */ +#define IPH_OFFSET_HTONS 0xFF1F /* Where the offset lives */ +#endif /* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. 
*/ #define IPH_ECN_NECT 0x0 /* Not ECN-Capable Transport */ @@ -571,10 +416,8 @@ typedef struct ipha_s { struct ill_s; typedef void ip_v6intfid_func_t(struct ill_s *, in6_addr_t *); -typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); +typedef void ip_v6mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *); +typedef void ip_v4mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *); /* IP Mac info structure */ typedef struct ip_m_s { @@ -582,8 +425,8 @@ typedef struct ip_m_s { int ip_m_type; /* From <net/if_types.h> */ t_uscalar_t ip_m_ipv4sap; t_uscalar_t ip_m_ipv6sap; - ip_v4mapinfo_func_t *ip_m_v4mapinfo; - ip_v6mapinfo_func_t *ip_m_v6mapinfo; + ip_v4mapinfo_func_t *ip_m_v4mapping; + ip_v6mapinfo_func_t *ip_m_v6mapping; ip_v6intfid_func_t *ip_m_v6intfid; ip_v6intfid_func_t *ip_m_v6destintfid; } ip_m_t; @@ -591,20 +434,14 @@ typedef struct ip_m_s { /* * The following functions attempt to reduce the link layer dependency * of the IP stack. The current set of link specific operations are: - * a. map from IPv4 class D (224.0/4) multicast address range to the link - * layer multicast address range. - * b. map from IPv6 multicast address range (ff00::/8) to the link - * layer multicast address range. - * c. derive the default IPv6 interface identifier from the interface. - * d. derive the default IPv6 destination interface identifier from + * a. map from IPv4 class D (224.0/4) multicast address range or the + * IPv6 multicast address range (ff00::/8) to the link layer multicast + * address. + * b. derive the default IPv6 interface identifier from the interface. + * c. derive the default IPv6 destination interface identifier from * the interface (point-to-point only). 
*/ -#define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ - (((ip_m)->ip_m_v4mapinfo != NULL) && \ - (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) -#define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ - (((ip_m)->ip_m_v6mapinfo != NULL) && \ - (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) +extern void ip_mcast_mapping(struct ill_s *, uchar_t *, uchar_t *); /* ip_m_v6*intfid return void and are never NULL */ #define MEDIA_V6INTFID(ip_m, ill, v6ptr) (ip_m)->ip_m_v6intfid(ill, v6ptr) #define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \ @@ -616,107 +453,38 @@ typedef struct ip_m_s { #define IRE_LOCAL 0x0004 /* Route entry for local address */ #define IRE_LOOPBACK 0x0008 /* Route entry for loopback address */ #define IRE_PREFIX 0x0010 /* Route entry for prefix routes */ +#ifndef _KERNEL +/* Keep so user-level still compiles */ #define IRE_CACHE 0x0020 /* Cached Route entry */ +#endif #define IRE_IF_NORESOLVER 0x0040 /* Route entry for local interface */ /* net without any address mapping. */ #define IRE_IF_RESOLVER 0x0080 /* Route entry for local interface */ /* net with resolver. */ #define IRE_HOST 0x0100 /* Host route entry */ +/* Keep so user-level still compiles */ #define IRE_HOST_REDIRECT 0x0200 /* only used for T_SVR4_OPTMGMT_REQ */ +#define IRE_IF_CLONE 0x0400 /* Per host clone of IRE_IF */ +#define IRE_MULTICAST 0x0800 /* Special - not in table */ +#define IRE_NOROUTE 0x1000 /* Special - not in table */ #define IRE_INTERFACE (IRE_IF_NORESOLVER | IRE_IF_RESOLVER) -#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST) -#define IRE_CACHETABLE (IRE_CACHE | IRE_BROADCAST | IRE_LOCAL | \ - IRE_LOOPBACK) -#define IRE_FORWARDTABLE (IRE_INTERFACE | IRE_OFFSUBNET) - -/* - * If an IRE is marked with IRE_MARK_CONDEMNED, the last walker of - * the bucket should delete this IRE from this bucket. - */ -#define IRE_MARK_CONDEMNED 0x0001 - -/* - * An IRE with IRE_MARK_PMTU has ire_max_frag set from an ICMP error. 
- */ -#define IRE_MARK_PMTU 0x0002 - -/* - * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It - * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN. - */ -#define IRE_MARK_TESTHIDDEN 0x0004 - -/* - * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing - * interface is specified by e.g. IP_PKTINFO. The IRE is not added to the IRE - * cache table. - */ -#define IRE_MARK_NOADD 0x0008 /* Mark not to add ire in cache */ - -/* - * IRE marked with IRE_MARK_TEMPORARY means that this IRE has been used - * either for forwarding a packet or has not been used for sending - * traffic on TCP connections terminated on this system. In both - * cases, this IRE is the first to go when IRE is being cleaned up. - */ -#define IRE_MARK_TEMPORARY 0x0010 - -/* - * IRE marked with IRE_MARK_USESRC_CHECK means that while adding an IRE with - * this mark, additional atomic checks need to be performed. For eg: by the - * time an IRE_CACHE is created, sent up to ARP and then comes back to IP; the - * usesrc grouping could have changed in which case we want to fail adding - * the IRE_CACHE entry - */ -#define IRE_MARK_USESRC_CHECK 0x0020 - -/* - * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the - * routing table lookup for the destination is bypassed and the packet is - * sent directly to the specified nexthop. The associated IRE_CACHE entries - * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up - * in regular ire cache lookups. - */ -#define IRE_MARK_PRIVATE_ADDR 0x0040 +#define IRE_IF_ALL (IRE_IF_NORESOLVER | IRE_IF_RESOLVER | \ + IRE_IF_CLONE) +#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST) +#define IRE_OFFLINK IRE_OFFSUBNET /* - * When we send an ARP resolution query for the nexthop gateway's ire, - * we use esballoc to create the ire_t in the AR_ENTRY_QUERY mblk - * chain, and mark its ire_marks with IRE_MARK_UNCACHED. 
This flag - * indicates that information from ARP has not been transferred to a - * permanent IRE_CACHE entry. The flag is reset only when the - * information is successfully transferred to an ire_cache entry (in - * ire_add()). Attempting to free the AR_ENTRY_QUERY mblk chain prior - * to ire_add (e.g., from arp, or from ip`ip_wput_nondata) will - * require that the resources (incomplete ire_cache and/or nce) must - * be cleaned up. The free callback routine (ire_freemblk()) checks - * for IRE_MARK_UNCACHED to see if any resources that are pinned down - * will need to be cleaned up or not. + * Note that we view IRE_NOROUTE as ONLINK since we can "send" to them without + * going through a router; the result of sending will be an error/icmp error. */ - -#define IRE_MARK_UNCACHED 0x0080 - -/* - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. - */ -typedef struct ire_expire_arg_s { - int iea_flush_flag; - ip_stack_t *iea_ipst; /* Does not have a netstack_hold */ -} ire_expire_arg_t; - -/* Flags with ire_expire routine */ -#define FLUSH_ARP_TIME 0x0001 /* ARP info potentially stale timer */ -#define FLUSH_REDIRECT_TIME 0x0002 /* Redirects potentially stale */ -#define FLUSH_MTU_TIME 0x0004 /* Include path MTU per RFC 1191 */ +#define IRE_ONLINK (IRE_IF_ALL|IRE_LOCAL|IRE_LOOPBACK| \ + IRE_BROADCAST|IRE_MULTICAST|IRE_NOROUTE) /* Arguments to ire_flush_cache() */ #define IRE_FLUSH_DELETE 0 #define IRE_FLUSH_ADD 1 +#define IRE_FLUSH_GWCHANGE 2 /* * Open/close synchronization flags. @@ -724,31 +492,21 @@ typedef struct ire_expire_arg_s { * depends on the atomic 32 bit access to that field. 
*/ #define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */ -#define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */ -#define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */ -#define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */ -#define CONN_QUIESCED 0x10 /* conn is now quiescent */ - -/* Used to check connection state flags before caching the IRE */ -#define CONN_CACHE_IRE(connp) \ - (!((connp)->conn_state_flags & (CONN_CLOSING|CONN_CONDEMNED))) - -/* - * Parameter to ip_output giving the identity of the caller. - * IP_WSRV means the packet was enqueued in the STREAMS queue - * due to flow control and is now being reprocessed in the context of - * the STREAMS service procedure, consequent to flow control relief. - * IRE_SEND means the packet is being reprocessed consequent to an - * ire cache creation and addition and this may or may not be happening - * in the service procedure context. Anything other than the above 2 - * cases is identified as IP_WPUT. Most commonly this is the case of - * packets coming down from the application. +#define CONN_CONDEMNED 0x02 /* conn is closing, no more refs */ +#define CONN_INCIPIENT 0x04 /* conn not yet visible, no refs */ +#define CONN_QUIESCED 0x08 /* conn is now quiescent */ +#define CONN_UPDATE_ILL 0x10 /* conn_update_ill in progress */ + +/* + * Flags for dce_flags field. Specifies which information has been set. + * dce_ident is always present, but the other ones are identified by the flags. 
*/ -#ifdef _KERNEL -#define IP_WSRV 1 /* Called from ip_wsrv */ -#define IP_WPUT 2 /* Called from ip_wput */ -#define IRE_SEND 3 /* Called from ire_send */ +#define DCEF_DEFAULT 0x0001 /* Default DCE - no pmtu or uinfo */ +#define DCEF_PMTU 0x0002 /* Different than interface MTU */ +#define DCEF_UINFO 0x0004 /* dce_uinfo set */ +#define DCEF_TOO_SMALL_PMTU 0x0008 /* Smaller than IPv4/IPv6 MIN */ +#ifdef _KERNEL /* * Extra structures need for per-src-addr filtering (IGMPv3/MLDv2) */ @@ -786,90 +544,80 @@ typedef struct mrec_s { } mrec_t; /* Group membership list per upper conn */ + /* - * XXX add ilg info for ifaddr/ifindex. - * XXX can we make ilg survive an ifconfig unplumb + plumb - * by setting the ipif/ill to NULL and recover that later? + * We record the multicast information from the socket option in + * ilg_ifaddr/ilg_ifindex. This allows rejoining the group in the case when + * the ifaddr (or ifindex) disappears and later reappears, potentially on + * a different ill. The IPv6 multicast socket options and ioctls all specify + * the interface using an ifindex. For IPv4 some socket options/ioctls use + * the interface address and others use the index. We record here the method + * that was actually used (and leave the other of ilg_ifaddr or ilg_ifindex) + * at zero so that we can rejoin the way the application intended. * - * ilg_ipif is used by IPv4 as multicast groups are joined using an interface - * address (ipif). - * ilg_ill is used by IPv6 as multicast groups are joined using an interface - * index (phyint->phyint_ifindex). - * ilg_ill is NULL for IPv4 and ilg_ipif is NULL for IPv6. + * We track the ill on which we will or already have joined an ilm using + * ilg_ill. When we have succeeded joining the ilm and have a refhold on it + * then we set ilg_ilm. Thus intentionally there is a window where ilg_ill is + * set and ilg_ilm is not set. This allows clearing ilg_ill as a signal that + * the ill is being unplumbed and the ilm should be discarded. 
* * ilg records the state of multicast memberships of a socket end point. * ilm records the state of multicast memberships with the driver and is * maintained per interface. * - * There is no direct link between a given ilg and ilm. If the - * application has joined a group G with ifindex I, we will have - * an ilg with ilg_v6group and ilg_ill. There will be a corresponding - * ilm with ilm_ill/ilm_v6addr recording the multicast membership. - * To delete the membership: - * - * a) Search for ilg matching on G and I with ilg_v6group - * and ilg_ill. Delete ilg_ill. - * b) Search the corresponding ilm matching on G and I with - * ilm_v6addr and ilm_ill. Delete ilm. - * - * For IPv4 the only difference is that we look using ipifs, not ills. + * The ilg state is protected by conn_ilg_lock. + * The ilg will not be freed until ilg_refcnt drops to zero. */ - -/* - * The ilg_t and ilm_t members are protected by ipsq. They can be changed only - * by a thread executing in the ipsq. In other words add/delete of a - * multicast group has to execute in the ipsq. - */ -#define ILG_DELETED 0x1 /* ilg_flags */ typedef struct ilg_s { + struct ilg_s *ilg_next; + struct ilg_s **ilg_ptpn; + struct conn_s *ilg_connp; /* Back pointer to get lock */ in6_addr_t ilg_v6group; - struct ipif_s *ilg_ipif; /* Logical interface we are member on */ - struct ill_s *ilg_ill; /* Used by IPv6 */ - uint_t ilg_flags; + ipaddr_t ilg_ifaddr; /* For some IPv4 cases */ + uint_t ilg_ifindex; /* IPv6 and some other IPv4 cases */ + struct ill_s *ilg_ill; /* Where ilm is joined. No refhold */ + struct ilm_s *ilg_ilm; /* With ilm_refhold */ + uint_t ilg_refcnt; mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilg_filter; + boolean_t ilg_condemned; /* Conceptually deleted */ } ilg_t; /* * Multicast address list entry for ill. - * ilm_ipif is used by IPv4 as multicast groups are joined using ipif. - * ilm_ill is used by IPv6 as multicast groups are joined using ill. 
- * ilm_ill is NULL for IPv4 and ilm_ipif is NULL for IPv6. + * ilm_ill is used by IPv4 and IPv6 + * + * The ilm state (and other multicast state on the ill) is protected by + * ill_mcast_lock. Operations that change state on both an ilg and ilm + * in addition use ill_mcast_serializer to ensure that we can't have + * interleaving between e.g., add and delete operations for the same conn_t, + * group, and ill. * * The comment below (and for other netstack_t references) refers * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's + * such as the references from open endpoints (ill_t and conn_t's * pointers). Internally within IP we rely on IP's ability to cleanup e.g. * ire_t's when an ill goes away. */ -#define ILM_DELETED 0x1 /* ilm_flags */ typedef struct ilm_s { in6_addr_t ilm_v6addr; int ilm_refcnt; uint_t ilm_timer; /* IGMP/MLD query resp timer, in msec */ - struct ipif_s *ilm_ipif; /* Back pointer to ipif for IPv4 */ struct ilm_s *ilm_next; /* Linked list for each ill */ uint_t ilm_state; /* state of the membership */ - struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ - uint_t ilm_flags; - boolean_t ilm_notify_driver; /* Need to notify the driver */ + struct ill_s *ilm_ill; /* Back pointer to ill - ill_ilm_cnt */ zoneid_t ilm_zoneid; int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ mcast_record_t ilm_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilm_filter; /* source filter list */ slist_t *ilm_pendsrcs; /* relevant src addrs for pending req */ rtx_state_t ilm_rtx; /* SCR retransmission state */ + ipaddr_t ilm_ifaddr; /* For IPv4 netstat */ ip_stack_t *ilm_ipst; /* Does not have a netstack_hold */ } ilm_t; #define ilm_addr V4_PART_OF_V6(ilm_v6addr) -typedef struct ilm_walker { - struct ill_s *ilw_ill; /* associated ill */ - struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */ - struct ill_s *ilw_walk_ill; /* current ill being walked */ -} ilm_walker_t; - 
/* * Soft reference to an IPsec SA. * @@ -898,40 +646,28 @@ typedef struct ipsa_ref_s * In the presence of IPsec policy, fully-bound conn's bind a connection * to more than just the 5-tuple, but also a specific IPsec action and * identity-pair. - * - * As an optimization, we also cache soft references to IPsec SA's - * here so that we can fast-path around most of the work needed for + * The identity pair is accessed from both the receive and transmit side + * hence it is maintained in the ipsec_latch_t structure. conn_latch and + * ixa_ipsec_latch points to it. + * The policy and actions are stored in conn_latch_in_policy and + * conn_latch_in_action for the inbound side, and in ixa_ipsec_policy and + * ixa_ipsec_action for the transmit side. + * + * As an optimization, we also cache soft references to IPsec SA's in + * ip_xmit_attr_t so that we can fast-path around most of the work needed for * outbound IPsec SA selection. - * - * Were it not for TCP's detached connections, this state would be - * in-line in conn_t; instead, this is in a separate structure so it - * can be handed off to TCP when a connection is detached. 
*/ typedef struct ipsec_latch_s { kmutex_t ipl_lock; uint32_t ipl_refcnt; - uint64_t ipl_unique; - struct ipsec_policy_s *ipl_in_policy; /* latched policy (in) */ - struct ipsec_policy_s *ipl_out_policy; /* latched policy (out) */ - struct ipsec_action_s *ipl_in_action; /* latched action (in) */ - struct ipsec_action_s *ipl_out_action; /* latched action (out) */ - cred_t *ipl_local_id; struct ipsid_s *ipl_local_cid; struct ipsid_s *ipl_remote_cid; unsigned int - ipl_out_action_latched : 1, - ipl_in_action_latched : 1, - ipl_out_policy_latched : 1, - ipl_in_policy_latched : 1, - ipl_ids_latched : 1, - ipl_pad_to_bit_31 : 27; - - ipsa_ref_t ipl_ref[2]; /* 0: ESP, 1: AH */ - + ipl_pad_to_bit_31 : 31; } ipsec_latch_t; #define IPLATCH_REFHOLD(ipl) { \ @@ -939,97 +675,19 @@ typedef struct ipsec_latch_s ASSERT((ipl)->ipl_refcnt != 0); \ } -#define IPLATCH_REFRELE(ipl, ns) { \ +#define IPLATCH_REFRELE(ipl) { \ ASSERT((ipl)->ipl_refcnt != 0); \ membar_exit(); \ if (atomic_add_32_nv(&(ipl)->ipl_refcnt, -1) == 0) \ - iplatch_free(ipl, ns); \ + iplatch_free(ipl); \ } /* * peer identity structure. */ - typedef struct conn_s conn_t; /* - * The old IP client structure "ipc_t" is gone. All the data is stored in the - * connection structure "conn_t" now. 
The mapping of old and new fields looks - * like this: - * - * ipc_ulp conn_ulp - * ipc_rq conn_rq - * ipc_wq conn_wq - * - * ipc_laddr conn_src - * ipc_faddr conn_rem - * ipc_v6laddr conn_srcv6 - * ipc_v6faddr conn_remv6 - * - * ipc_lport conn_lport - * ipc_fport conn_fport - * ipc_ports conn_ports - * - * ipc_policy conn_policy - * ipc_latch conn_latch - * - * ipc_irc_lock conn_lock - * ipc_ire_cache conn_ire_cache - * - * ipc_state_flags conn_state_flags - * ipc_outgoing_ill conn_outgoing_ill - * - * ipc_dontroute conn_dontroute - * ipc_loopback conn_loopback - * ipc_broadcast conn_broadcast - * ipc_reuseaddr conn_reuseaddr - * - * ipc_multicast_loop conn_multicast_loop - * ipc_multi_router conn_multi_router - * ipc_draining conn_draining - * - * ipc_did_putbq conn_did_putbq - * ipc_unspec_src conn_unspec_src - * ipc_policy_cached conn_policy_cached - * - * ipc_in_enforce_policy conn_in_enforce_policy - * ipc_out_enforce_policy conn_out_enforce_policy - * ipc_af_isv6 conn_af_isv6 - * ipc_pkt_isv6 conn_pkt_isv6 - * - * ipc_ipv6_recvpktinfo conn_ipv6_recvpktinfo - * - * ipc_ipv6_recvhoplimit conn_ipv6_recvhoplimit - * ipc_ipv6_recvhopopts conn_ipv6_recvhopopts - * ipc_ipv6_recvdstopts conn_ipv6_recvdstopts - * - * ipc_ipv6_recvrthdr conn_ipv6_recvrthdr - * ipc_ipv6_recvrtdstopts conn_ipv6_recvrtdstopts - * ipc_fully_bound conn_fully_bound - * - * ipc_recvif conn_recvif - * - * ipc_recvslla conn_recvslla - * ipc_acking_unbind conn_acking_unbind - * ipc_pad_to_bit_31 conn_pad_to_bit_31 - * - * ipc_proto conn_proto - * ipc_incoming_ill conn_incoming_ill - * ipc_pending_ill conn_pending_ill - * ipc_unbind_mp conn_unbind_mp - * ipc_ilg conn_ilg - * ipc_ilg_allocated conn_ilg_allocated - * ipc_ilg_inuse conn_ilg_inuse - * ipc_ilg_walker_cnt conn_ilg_walker_cnt - * ipc_refcv conn_refcv - * ipc_multicast_ipif conn_multicast_ipif - * ipc_multicast_ill conn_multicast_ill - * ipc_drain_next conn_drain_next - * ipc_drain_prev conn_drain_prev - * ipc_idl conn_idl - */ - -/* * 
This is used to match an inbound/outbound datagram with policy. */ typedef struct ipsec_selector { @@ -1069,22 +727,6 @@ typedef struct ipsec_selector { #define IPSEC_POLICY_MAX 5 /* Always max + 1. */ /* - * Folowing macro is used whenever the code does not know whether there - * is a M_CTL present in the front and it needs to examine the actual mp - * i.e the IP header. As a M_CTL message could be in the front, this - * extracts the packet into mp and the M_CTL mp into first_mp. If M_CTL - * mp is not present, both first_mp and mp point to the same message. - */ -#define EXTRACT_PKT_MP(mp, first_mp, mctl_present) \ - (first_mp) = (mp); \ - if ((mp)->b_datap->db_type == M_CTL) { \ - (mp) = (mp)->b_cont; \ - (mctl_present) = B_TRUE; \ - } else { \ - (mctl_present) = B_FALSE; \ - } - -/* * Check with IPSEC inbound policy if * * 1) per-socket policy is present - indicated by conn_in_enforce_policy. @@ -1113,11 +755,6 @@ typedef struct ipsec_selector { /* * Information cached in IRE for upper layer protocol (ULP). - * - * Notice that ire_max_frag is not included in the iulp_t structure, which - * it may seem that it should. But ire_max_frag cannot really be cached. It - * is fixed for each interface. For MTU found by PMTUd, we may want to cache - * it. But currently, we do not do that. */ typedef struct iulp_s { boolean_t iulp_set; /* Is any metric set? */ @@ -1128,17 +765,21 @@ typedef struct iulp_s { uint32_t iulp_rpipe; /* Receive pipe size. */ uint32_t iulp_rtomax; /* Max round trip timeout. */ uint32_t iulp_sack; /* Use SACK option (TCP)? */ + uint32_t iulp_mtu; /* Setable with routing sockets */ + uint32_t iulp_tstamp_ok : 1, /* Use timestamp option (TCP)? */ iulp_wscale_ok : 1, /* Use window scale option (TCP)? */ iulp_ecn_ok : 1, /* Enable ECN (for TCP)? */ iulp_pmtud_ok : 1, /* Enable PMTUd? 
*/ - iulp_not_used : 28; -} iulp_t; + /* These three are passed out by ip_set_destination */ + iulp_localnet: 1, /* IRE_ONLINK */ + iulp_loopback: 1, /* IRE_LOOPBACK */ + iulp_local: 1, /* IRE_LOCAL */ -/* Zero iulp_t. */ -extern const iulp_t ire_uinfo_null; + iulp_not_used : 25; +} iulp_t; /* * The conn drain list structure (idl_t). @@ -1173,7 +814,6 @@ struct idl_tx_list_s { struct idl_s { conn_t *idl_conn; /* Head of drain list */ kmutex_t idl_lock; /* Lock for this list */ - conn_t *idl_conn_draining; /* conn that is draining */ uint32_t idl_repeat : 1, /* Last conn must re-enable */ /* drain list again */ @@ -1182,36 +822,38 @@ struct idl_s { }; #define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock)) + /* * Interface route structure which holds the necessary information to recreate - * routes that are tied to an interface (namely where ire_ipif != NULL). + * routes that are tied to an interface i.e. have ire_ill set. + * * These routes which were initially created via a routing socket or via the * SIOCADDRT ioctl may be gateway routes (RTF_GATEWAY being set) or may be - * traditional interface routes. When an interface comes back up after being - * marked down, this information will be used to recreate the routes. These - * are part of an mblk_t chain that hangs off of the IPIF (ipif_saved_ire_mp). + * traditional interface routes. When an ill comes back up after being + * down, this information will be used to recreate the routes. These + * are part of an mblk_t chain that hangs off of the ILL (ill_saved_ire_mp). */ typedef struct ifrt_s { ushort_t ifrt_type; /* Type of IRE */ in6_addr_t ifrt_v6addr; /* Address IRE represents. */ - in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFSUBNET */ - in6_addr_t ifrt_v6src_addr; /* Src addr if RTF_SETSRC */ + in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFLINK */ + in6_addr_t ifrt_v6setsrc_addr; /* Src addr if RTF_SETSRC */ in6_addr_t ifrt_v6mask; /* Mask for matching IRE. 
*/ uint32_t ifrt_flags; /* flags related to route */ - uint_t ifrt_max_frag; /* MTU (next hop or path). */ - iulp_t ifrt_iulp_info; /* Cached IRE ULP info. */ + iulp_t ifrt_metrics; /* Routing socket metrics */ + zoneid_t ifrt_zoneid; /* zoneid for route */ } ifrt_t; #define ifrt_addr V4_PART_OF_V6(ifrt_v6addr) #define ifrt_gateway_addr V4_PART_OF_V6(ifrt_v6gateway_addr) -#define ifrt_src_addr V4_PART_OF_V6(ifrt_v6src_addr) #define ifrt_mask V4_PART_OF_V6(ifrt_v6mask) +#define ifrt_setsrc_addr V4_PART_OF_V6(ifrt_v6setsrc_addr) /* Number of IP addresses that can be hosted on a physical interface */ #define MAX_ADDRS_PER_IF 8192 /* * Number of Source addresses to be considered for source address - * selection. Used by ipif_select_source[_v6]. + * selection. Used by ipif_select_source_v4/v6. */ #define MAX_IPIF_SELECT_SOURCE 50 @@ -1245,16 +887,13 @@ typedef struct th_hash_s { #define IPIF_CONDEMNED 0x1 /* The ipif is being removed */ #define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */ #define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */ -#define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */ /* IP interface structure, one per local address */ typedef struct ipif_s { struct ipif_s *ipif_next; struct ill_s *ipif_ill; /* Back pointer to our ill */ int ipif_id; /* Logical unit number */ - uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */ in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */ - in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */ in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */ in6_addr_t ipif_v6net_mask; /* Net mask for this interface. */ in6_addr_t ipif_v6brd_addr; /* Broadcast addr for this interface. */ @@ -1262,47 +901,29 @@ typedef struct ipif_s { uint64_t ipif_flags; /* Interface flags. */ uint_t ipif_metric; /* BSD if metric, for compatibility. 
*/ uint_t ipif_ire_type; /* IRE_LOCAL or IRE_LOOPBACK */ - mblk_t *ipif_arp_del_mp; /* Allocated at time arp comes up, to */ - /* prevent awkward out of mem */ - /* condition later */ - mblk_t *ipif_saved_ire_mp; /* Allocated for each extra */ - /* IRE_IF_NORESOLVER/IRE_IF_RESOLVER */ - /* on this interface so that they */ - /* can survive ifconfig down. */ - kmutex_t ipif_saved_ire_lock; /* Protects ipif_saved_ire_mp */ - - mrec_t *ipif_igmp_rpt; /* List of group memberships which */ - /* will be reported on. Used when */ - /* handling an igmp timeout. */ /* - * The packet counts in the ipif contain the sum of the - * packet counts in dead IREs that were affiliated with - * this ipif. + * The packet count in the ipif contain the sum of the + * packet counts in dead IRE_LOCAL/LOOPBACK for this ipif. */ - uint_t ipif_fo_pkt_count; /* Forwarded thru our dead IREs */ uint_t ipif_ib_pkt_count; /* Inbound packets for our dead IREs */ - uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */ + /* Exclusive bit fields, protected by ipsq_t */ unsigned int - ipif_multicast_up : 1, /* ipif_multicast_up() successful */ ipif_was_up : 1, /* ipif was up before */ ipif_addr_ready : 1, /* DAD is done */ ipif_was_dup : 1, /* DAD had failed */ - - ipif_joined_allhosts : 1, /* allhosts joined */ ipif_added_nce : 1, /* nce added for local address */ - ipif_pad_to_31 : 26; + + ipif_pad_to_31 : 28; + + ilm_t *ipif_allhosts_ilm; /* For all-nodes join */ + ilm_t *ipif_solmulti_ilm; /* For IPv6 solicited multicast join */ uint_t ipif_seqid; /* unique index across all ills */ uint_t ipif_state_flags; /* See IPIF_* flag defs above */ uint_t ipif_refcnt; /* active consistent reader cnt */ - /* Number of ire's and ilm's referencing this ipif */ - uint_t ipif_ire_cnt; - uint_t ipif_ilm_cnt; - - uint_t ipif_saved_ire_cnt; zoneid_t ipif_zoneid; /* zone ID number */ timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ boolean_t ipif_trace_disable; /* True when alloc fails */ @@ 
-1313,40 +934,12 @@ typedef struct ipif_s { * part of a group will be pointed to, and an ill cannot disappear * while it's in a group. */ - struct ill_s *ipif_bound_ill; - struct ipif_s *ipif_bound_next; /* bound ipif chain */ - boolean_t ipif_bound; /* B_TRUE if we successfully bound */ -} ipif_t; + struct ill_s *ipif_bound_ill; + struct ipif_s *ipif_bound_next; /* bound ipif chain */ + boolean_t ipif_bound; /* B_TRUE if we successfully bound */ -/* - * IPIF_FREE_OK() means that there are no incoming references - * to the ipif. Incoming refs would prevent the ipif from being freed. - */ -#define IPIF_FREE_OK(ipif) \ - ((ipif)->ipif_ire_cnt == 0 && (ipif)->ipif_ilm_cnt == 0) -/* - * IPIF_DOWN_OK() determines whether the incoming pointer reference counts - * would permit the ipif to be considered quiescent. In order for - * an ipif or ill to be considered quiescent, the ire and nce references - * to that ipif/ill must be zero. - * - * We do not require the ilm references to go to zero for quiescence - * because the quiescence checks are done to ensure that - * outgoing packets do not use addresses from the ipif/ill after it - * has been marked down, and incoming packets to addresses on a - * queiscent interface are rejected. This implies that all the - * ire/nce's using that source address need to be deleted and future - * creation of any ires using that source address must be prevented. - * Similarly incoming unicast packets destined to the 'down' address - * will not be accepted once that ire is gone. However incoming - * multicast packets are not destined to the downed address. - * They are only related to the ill in question. Furthermore - * the current API behavior allows applications to join or leave - * multicast groups, i.e., IP_ADD_MEMBERSHIP / LEAVE_MEMBERSHIP, using a - * down address. Therefore the ilm references are not included in - * the _DOWN_OK macros. 
- */ -#define IPIF_DOWN_OK(ipif) ((ipif)->ipif_ire_cnt == 0) + struct ire_s *ipif_ire_local; /* Our IRE_LOCAL or LOOPBACK */ +} ipif_t; /* * The following table lists the protection levels of the various members @@ -1371,9 +964,7 @@ typedef struct ipif_s { * ill_g_lock ill_g_lock * ipif_ill ipsq + down ipif write once * ipif_id ipsq + down ipif write once - * ipif_mtu ipsq * ipif_v6lcl_addr ipsq + down ipif up ipif - * ipif_v6src_addr ipsq + down ipif up ipif * ipif_v6subnet ipsq + down ipif up ipif * ipif_v6net_mask ipsq + down ipif up ipif * @@ -1383,28 +974,30 @@ typedef struct ipif_s { * ipif_metric * ipif_ire_type ipsq + down ill up ill * - * ipif_arp_del_mp ipsq ipsq - * ipif_saved_ire_mp ipif_saved_ire_lock ipif_saved_ire_lock - * ipif_igmp_rpt ipsq ipsq - * - * ipif_fo_pkt_count Approx * ipif_ib_pkt_count Approx - * ipif_ob_pkt_count Approx * * bit fields ill_lock ill_lock * + * ipif_allhosts_ilm ipsq ipsq + * ipif_solmulti_ilm ipsq ipsq + * * ipif_seqid ipsq Write once * * ipif_state_flags ill_lock ill_lock * ipif_refcnt ill_lock ill_lock - * ipif_ire_cnt ill_lock ill_lock - * ipif_ilm_cnt ill_lock ill_lock - * ipif_saved_ire_cnt - * * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock * ipif_bound_next ipsq ipsq * ipif_bound ipsq ipsq + * + * ipif_ire_local ipsq + ips_ill_g_lock ipsq OR ips_ill_g_lock + */ + +/* + * Return values from ip_laddr_verify_{v4,v6} */ +typedef enum { IPVL_UNICAST_UP, IPVL_UNICAST_DOWN, IPVL_MCAST, IPVL_BCAST, + IPVL_BAD} ip_laddr_t; + #define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1)) @@ -1422,18 +1015,12 @@ typedef struct ipif_s { /* IPv4 compatibility macros */ #define ipif_lcl_addr V4_PART_OF_V6(ipif_v6lcl_addr) -#define ipif_src_addr V4_PART_OF_V6(ipif_v6src_addr) #define ipif_subnet V4_PART_OF_V6(ipif_v6subnet) #define ipif_net_mask V4_PART_OF_V6(ipif_v6net_mask) #define ipif_brd_addr V4_PART_OF_V6(ipif_v6brd_addr) #define ipif_pp_dst_addr V4_PART_OF_V6(ipif_v6pp_dst_addr) /* Macros for easy 
backreferences to the ill. */ -#define ipif_wq ipif_ill->ill_wq -#define ipif_rq ipif_ill->ill_rq -#define ipif_net_type ipif_ill->ill_net_type -#define ipif_ipif_up_count ipif_ill->ill_ipif_up_count -#define ipif_type ipif_ill->ill_type #define ipif_isv6 ipif_ill->ill_isv6 #define SIOCLIFADDR_NDX 112 /* ndx of SIOCLIFADDR in the ndx ioctl table */ @@ -1524,7 +1111,7 @@ typedef struct ipxop_s { boolean_t ipx_current_done; /* is the current operation done? */ int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */ ipif_t *ipx_current_ipif; /* ipif for current op */ - ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */ + ipif_t *ipx_pending_ipif; /* ipif for ipx_pending_mp */ mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */ boolean_t ipx_forced; /* debugging aid */ #ifdef DEBUG @@ -1642,24 +1229,62 @@ typedef struct irb { krwlock_t irb_lock; /* Protect this bucket */ uint_t irb_refcnt; /* Protected by irb_lock */ uchar_t irb_marks; /* CONDEMNED ires in this bucket ? */ -#define IRB_MARK_CONDEMNED 0x0001 -#define IRB_MARK_FTABLE 0x0002 +#define IRB_MARK_CONDEMNED 0x0001 /* Contains some IRE_IS_CONDEMNED */ +#define IRB_MARK_DYNAMIC 0x0002 /* Dynamically allocated */ + /* Once IPv6 uses radix then IRB_MARK_DYNAMIC will be always be set */ uint_t irb_ire_cnt; /* Num of active IRE in this bucket */ - uint_t irb_tmp_ire_cnt; /* Num of temporary IRE */ - struct ire_s *irb_rr_origin; /* origin for round-robin */ int irb_nire; /* Num of ftable ire's that ref irb */ ip_stack_t *irb_ipst; /* Does not have a netstack_hold */ } irb_t; #define IRB2RT(irb) (rt_t *)((caddr_t)(irb) - offsetof(rt_t, rt_irb)) -/* The following are return values of ip_xmit_v4() */ -typedef enum { - SEND_PASSED = 0, /* sent packet out on wire */ - SEND_FAILED, /* sending of packet failed */ - LOOKUP_IN_PROGRESS, /* ire cache found, ARP resolution in progress */ - LLHDR_RESLV_FAILED /* macaddr resl of onlink dst or nexthop failed */ -} ipxmit_state_t; +/* Forward declarations */ 
+struct dce_s; +typedef struct dce_s dce_t; +struct ire_s; +typedef struct ire_s ire_t; +struct ncec_s; +typedef struct ncec_s ncec_t; +struct nce_s; +typedef struct nce_s nce_t; +struct ip_recv_attr_s; +typedef struct ip_recv_attr_s ip_recv_attr_t; +struct ip_xmit_attr_s; +typedef struct ip_xmit_attr_s ip_xmit_attr_t; + +struct tsol_ire_gw_secattr_s; +typedef struct tsol_ire_gw_secattr_s tsol_ire_gw_secattr_t; + +/* + * This is a structure for a one-element route cache that is passed + * by reference between ip_input and ill_inputfn. + */ +typedef struct { + ire_t *rtc_ire; + ipaddr_t rtc_ipaddr; + in6_addr_t rtc_ip6addr; +} rtc_t; + +/* + * Note: Temporarily use 64 bits, and will probably go back to 32 bits after + * more cleanup work is done. + */ +typedef uint64_t iaflags_t; + +/* The ill input function pointer type */ +typedef void (*pfillinput_t)(mblk_t *, void *, void *, ip_recv_attr_t *, + rtc_t *); + +/* The ire receive function pointer type */ +typedef void (*pfirerecv_t)(ire_t *, mblk_t *, void *, ip_recv_attr_t *); + +/* The ire send and postfrag function pointer types */ +typedef int (*pfiresend_t)(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +typedef int (*pfirepostfrag_t)(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + zoneid_t, zoneid_t, uintptr_t *); + #define IP_V4_G_HEAD 0 #define IP_V6_G_HEAD 1 @@ -1733,26 +1358,12 @@ typedef union ill_g_head_u { /* * Capabilities, possible flags for ill_capabilities. 
*/ - -#define ILL_CAPAB_AH 0x01 /* IPsec AH acceleration */ -#define ILL_CAPAB_ESP 0x02 /* IPsec ESP acceleration */ -#define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */ +#define ILL_CAPAB_LSO 0x04 /* Large Send Offload */ #define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */ #define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */ #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ -#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */ - -/* - * Per-ill Multidata Transmit capabilities. - */ -typedef struct ill_mdt_capab_s ill_mdt_capab_t; - -/* - * Per-ill IPsec capabilities. - */ -typedef struct ill_ipsec_capab_s ill_ipsec_capab_t; /* * Per-ill Hardware Checksumming capbilities. @@ -1775,15 +1386,18 @@ typedef struct ill_dld_capab_s ill_dld_capab_t; typedef struct ill_rx_ring ill_rx_ring_t; /* - * Per-ill Large Segment Offload capabilities. + * Per-ill Large Send Offload capabilities. */ typedef struct ill_lso_capab_s ill_lso_capab_t; /* The following are ill_state_flags */ #define ILL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */ #define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */ -#define ILL_CHANGING 0x04 /* ILL not globally visible */ -#define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */ +#define ILL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */ +#define ILL_DOWN_IN_PROGRESS 0x08 /* ILL is going down - no new nce's */ +#define ILL_LL_BIND_PENDING 0x0020 /* XXX Reuse ILL_LL_SUBNET_PENDING ? */ +#define ILL_LL_UP 0x0040 +#define ILL_LL_DOWN 0x0080 /* Is this an ILL whose source address is used by other ILL's ? */ #define IS_USESRC_ILL(ill) \ @@ -1796,10 +1410,9 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; ((ill)->ill_usesrc_grp_next != NULL)) /* Is this an virtual network interface (vni) ILL ? 
*/ -#define IS_VNI(ill) \ - (((ill) != NULL) && \ +#define IS_VNI(ill) \ (((ill)->ill_phyint->phyint_flags & (PHYI_LOOPBACK|PHYI_VIRTUAL)) == \ - PHYI_VIRTUAL)) + PHYI_VIRTUAL) /* Is this a loopback ILL? */ #define IS_LOOPBACK(ill) \ @@ -1900,18 +1513,41 @@ typedef struct ipmp_grp_s { * ARP up-to-date as the active set of interfaces in the group changes. */ typedef struct ipmp_arpent_s { - mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */ ipaddr_t ia_ipaddr; /* IP address for this entry */ boolean_t ia_proxyarp; /* proxy ARP entry? */ boolean_t ia_notified; /* ARP notified about this entry? */ list_node_t ia_node; /* next ARP entry in list */ + uint16_t ia_flags; /* nce_flags for the address */ + size_t ia_lladdr_len; + uchar_t *ia_lladdr; } ipmp_arpent_t; +struct arl_s; + +/* + * Per-ill capabilities. + */ +struct ill_hcksum_capab_s { + uint_t ill_hcksum_version; /* interface version */ + uint_t ill_hcksum_txflags; /* capabilities on transmit */ +}; + +struct ill_zerocopy_capab_s { + uint_t ill_zerocopy_version; /* interface version */ + uint_t ill_zerocopy_flags; /* capabilities */ +}; + +struct ill_lso_capab_s { + uint_t ill_lso_flags; /* capabilities */ + uint_t ill_lso_max; /* maximum size of payload */ +}; + /* * IP Lower level Structure. * Instance data structure in ip_open when there is a device below us. */ typedef struct ill_s { + pfillinput_t ill_inputfn; /* Fast input function selector */ ill_if_t *ill_ifptr; /* pointer to interface type */ queue_t *ill_rq; /* Read queue. */ queue_t *ill_wq; /* Write queue. */ @@ -1922,6 +1558,8 @@ typedef struct ill_s { uint_t ill_ipif_up_count; /* Number of IPIFs currently up. */ uint_t ill_max_frag; /* Max IDU from DLPI. */ + uint_t ill_current_frag; /* Current IDU from DLPI. */ + uint_t ill_mtu; /* User-specified MTU; SIOCSLIFMTU */ char *ill_name; /* Our name. */ uint_t ill_ipif_dup_count; /* Number of duplicate addresses. */ uint_t ill_name_length; /* Name length, incl. terminator. 
*/ @@ -1941,8 +1579,9 @@ typedef struct ill_s { uint8_t *ill_frag_ptr; /* Reassembly state. */ timeout_id_t ill_frag_timer_id; /* timeout id for the frag timer */ ipfb_t *ill_frag_hash_tbl; /* Fragment hash list head. */ - ipif_t *ill_pending_ipif; /* IPIF waiting for DL operation. */ + krwlock_t ill_mcast_lock; /* Protects multicast state */ + kmutex_t ill_mcast_serializer; /* Serialize across ilg and ilm state */ ilm_t *ill_ilm; /* Multicast membership for ill */ uint_t ill_global_timer; /* for IGMPv3/MLDv2 general queries */ int ill_mcast_type; /* type of router which is querier */ @@ -1955,22 +1594,20 @@ typedef struct ill_s { uint8_t ill_mcast_rv; /* IGMPv3/MLDv2 robustness variable */ int ill_mcast_qi; /* IGMPv3/MLDv2 query interval var */ - mblk_t *ill_pending_mp; /* IOCTL/DLPI awaiting completion. */ /* * All non-NULL cells between 'ill_first_mp_to_free' and * 'ill_last_mp_to_free' are freed in ill_delete. */ #define ill_first_mp_to_free ill_bcast_mp mblk_t *ill_bcast_mp; /* DLPI header for broadcasts. */ - mblk_t *ill_resolver_mp; /* Resolver template. 
*/ mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */ mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */ mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */ - mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */ mblk_t *ill_dest_addr_mp; /* mblk which holds ill_dest_addr */ mblk_t *ill_replumb_mp; /* replumb mp from ill_replumb() */ mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */ -#define ill_last_mp_to_free ill_phys_addr_mp + mblk_t *ill_mcast_deferred; /* b_next chain of IGMP/MLD packets */ +#define ill_last_mp_to_free ill_mcast_deferred cred_t *ill_credp; /* opener's credentials */ uint8_t *ill_phys_addr; /* ill_phys_addr_mp->b_rptr + off */ @@ -1986,37 +1623,33 @@ typedef struct ill_s { ill_dlpi_style_set : 1, ill_ifname_pending : 1, - ill_join_allmulti : 1, ill_logical_down : 1, ill_dl_up : 1, - ill_up_ipifs : 1, + ill_note_link : 1, /* supports link-up notification */ ill_capab_reneg : 1, /* capability renegotiation to be done */ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ - ill_need_recover_multicast : 1, - ill_pad_to_bit_31 : 19; + + ill_replumbing : 1, + ill_arl_dlpi_pending : 1, + + ill_pad_to_bit_31 : 18; /* Following bit fields protected by ill_lock */ uint_t ill_fragtimer_executing : 1, ill_fragtimer_needrestart : 1, - ill_ilm_cleanup_reqd : 1, - ill_arp_closing : 1, - - ill_arp_bringup_pending : 1, - ill_arp_extend : 1, /* ARP has DAD extensions */ ill_manual_token : 1, /* system won't override ill_token */ ill_manual_linklocal : 1, /* system won't auto-conf linklocal */ - ill_pad_bit_31 : 24; + ill_pad_bit_31 : 28; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. */ - int ill_arp_muxid; /* muxid returned from plink for arp */ - int ill_ip_muxid; /* muxid returned from plink for ip */ + int ill_muxid; /* muxid returned from plink */ /* Used for IP frag reassembly throttling on a per ILL basis. 
*/ uint_t ill_ipf_gen; /* Generation of next fragment queue */ @@ -2033,20 +1666,13 @@ typedef struct ill_s { uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */ uint_t ill_capab_pending_cnt; uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */ - ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */ - ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */ - ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */ ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */ ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */ - /* - * Fields for IPv6 - */ uint8_t ill_max_hops; /* Maximum hops for any logical interface */ - uint_t ill_max_mtu; /* Maximum MTU for any logical interface */ uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */ uint32_t ill_reachable_time; /* Value for ND algorithm in msec */ uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */ @@ -2057,20 +1683,6 @@ typedef struct ill_s { uint32_t ill_xmit_count; /* ndp max multicast xmits */ mib2_ipIfStatsEntry_t *ill_ip_mib; /* ver indep. interface mib */ mib2_ipv6IfIcmpEntry_t *ill_icmp6_mib; /* Per interface mib */ - /* - * Following two mblks are allocated common to all - * the ipifs when the first interface is coming up. - * It is sent up to arp when the last ipif is coming - * down. - */ - mblk_t *ill_arp_down_mp; - mblk_t *ill_arp_del_mapping_mp; - /* - * Used for implementing IFF_NOARP. As IFF_NOARP is used - * to turn off for all the logicals, it is here instead - * of the ipif. 
- */ - mblk_t *ill_arp_on_mp; phyint_t *ill_phyint; uint64_t ill_flags; @@ -2094,11 +1706,11 @@ typedef struct ill_s { */ uint_t ill_ifname_pending_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ - void *ill_fastpath_list; /* both ire and nce hang off this */ + list_t ill_nce; /* pointer to nce_s list */ uint_t ill_refcnt; /* active refcnt by threads */ uint_t ill_ire_cnt; /* ires associated with this ill */ kcondvar_t ill_cv; - uint_t ill_ilm_walker_cnt; /* snmp ilm walkers */ + uint_t ill_ncec_cnt; /* ncecs associated with this ill */ uint_t ill_nce_cnt; /* nces associated with this ill */ uint_t ill_waiters; /* threads waiting in ipsq_enter */ /* @@ -2119,6 +1731,17 @@ typedef struct ill_s { void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ + ilm_t *ill_ipallmulti_ilm; + + mblk_t *ill_saved_ire_mp; /* Allocated for each extra IRE */ + /* with ire_ill set so they can */ + /* survive the ill going down and up. */ + kmutex_t ill_saved_ire_lock; /* Protects ill_saved_ire_mp, cnt */ + uint_t ill_saved_ire_cnt; /* # entries */ + struct arl_ill_common_s *ill_common; + ire_t *ill_ire_multicast; /* IRE_MULTICAST for ill */ + clock_t ill_defend_start; /* start of 1 hour period */ + uint_t ill_defend_count; /* # of announce/defends per ill */ /* * IPMP fields. 
*/ @@ -2131,6 +1754,8 @@ typedef struct ill_s { uint_t ill_bound_cnt; /* # of data addresses bound to ill */ ipif_t *ill_bound_ipif; /* ipif chain bound to ill */ timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */ + + uint32_t ill_mrouter_cnt; /* mrouter allmulti joins */ } ill_t; /* @@ -2139,15 +1764,17 @@ typedef struct ill_s { */ #define ILL_FREE_OK(ill) \ ((ill)->ill_ire_cnt == 0 && (ill)->ill_ilm_cnt == 0 && \ - (ill)->ill_nce_cnt == 0) + (ill)->ill_ncec_cnt == 0 && (ill)->ill_nce_cnt == 0) /* - * An ipif/ill can be marked down only when the ire and nce references + * An ipif/ill can be marked down only when the ire and ncec references * to that ipif/ill goes to zero. ILL_DOWN_OK() is a necessary condition * quiescence checks. See comments above IPIF_DOWN_OK for details * on why ires and nces are selectively considered for this macro. */ -#define ILL_DOWN_OK(ill) (ill->ill_ire_cnt == 0 && ill->ill_nce_cnt == 0) +#define ILL_DOWN_OK(ill) \ + (ill->ill_ire_cnt == 0 && ill->ill_ncec_cnt == 0 && \ + ill->ill_nce_cnt == 0) /* * The following table lists the protection levels of the various members @@ -2162,7 +1789,8 @@ typedef struct ill_s { * ill_error ipsq None * ill_ipif ill_g_lock + ipsq ill_g_lock OR ipsq * ill_ipif_up_count ill_lock + ipsq ill_lock OR ipsq - * ill_max_frag ipsq Write once + * ill_max_frag ill_lock ill_lock + * ill_current_frag ill_lock ill_lock * * ill_name ill_g_lock + ipsq Write once * ill_name_length ill_g_lock + ipsq Write once @@ -2179,23 +1807,22 @@ typedef struct ill_s { * * ill_frag_timer_id ill_lock ill_lock * ill_frag_hash_tbl ipsq up ill - * ill_ilm ipsq + ill_lock ill_lock - * ill_mcast_type ill_lock ill_lock - * ill_mcast_v1_time ill_lock ill_lock - * ill_mcast_v2_time ill_lock ill_lock - * ill_mcast_v1_tset ill_lock ill_lock - * ill_mcast_v2_tset ill_lock ill_lock - * ill_mcast_rv ill_lock ill_lock - * ill_mcast_qi ill_lock ill_lock - * ill_pending_mp ill_lock ill_lock - * - * ill_bcast_mp ipsq ipsq - * 
ill_resolver_mp ipsq only when ill is up + * ill_ilm ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_global_timer ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_type ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v1_time ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v2_time ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v1_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v2_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_rv ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_qi ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * * ill_down_mp ipsq ipsq * ill_dlpi_deferred ill_lock ill_lock * ill_dlpi_pending ipsq + ill_lock ipsq or ill_lock or * absence of ipsq writer. * ill_phys_addr_mp ipsq + down ill only when ill is up + * ill_mcast_deferred ill_lock ill_lock * ill_phys_addr ipsq + down ill only when ill is up * ill_dest_addr_mp ipsq + down ill only when ill is up * ill_dest_addr ipsq + down ill only when ill is up @@ -2204,8 +1831,7 @@ typedef struct ill_s { * exclusive bit flags ipsq_t ipsq_t * shared bit flags ill_lock ill_lock * - * ill_arp_muxid ipsq Not atomic - * ill_ip_muxid ipsq Not atomic + * ill_muxid ipsq Not atomic * * ill_ipf_gen Not atomic * ill_frag_count atomics atomics @@ -2215,7 +1841,7 @@ typedef struct ill_s { * ill_dlpi_capab_state ipsq ipsq * ill_max_hops ipsq Not atomic * - * ill_max_mtu + * ill_mtu ill_lock None * * ill_user_mtu ipsq + ill_lock ill_lock * ill_reachable_time ipsq + ill_lock ill_lock @@ -2230,9 +1856,6 @@ typedef struct ill_s { * ill_xmit_count ipsq + down ill write once * ill_ip6_mib ipsq + down ill only when ill is up * ill_icmp6_mib ipsq + down ill only when ill is up - * ill_arp_down_mp ipsq ipsq - * ill_arp_del_mapping_mp ipsq ipsq - * ill_arp_on_mp ipsq ipsq * * ill_phyint ipsq, ill_g_lock, ill_lock Any of them * ill_flags ill_lock ill_lock @@ -2247,7 +1870,7 @@ typedef struct ill_s { * ill_refcnt ill_lock ill_lock * 
ill_ire_cnt ill_lock ill_lock * ill_cv ill_lock ill_lock - * ill_ilm_walker_cnt ill_lock ill_lock + * ill_ncec_cnt ill_lock ill_lock * ill_nce_cnt ill_lock ill_lock * ill_ilm_cnt ill_lock ill_lock * ill_src_ipif ill_g_lock ill_g_lock @@ -2256,8 +1879,12 @@ typedef struct ill_s { * ill_dhcpinit atomics atomics * ill_flownotify_mh write once write once * ill_capab_pending_cnt ipsq ipsq - * - * ill_bound_cnt ipsq ipsq + * ill_ipallmulti_cnt ill_lock ill_lock + * ill_ipallmulti_ilm ill_lock ill_lock + * ill_saved_ire_mp ill_saved_ire_lock ill_saved_ire_lock + * ill_saved_ire_cnt ill_saved_ire_lock ill_saved_ire_lock + * ill_arl ??? ??? + * ill_ire_multicast ipsq + quiescent none * ill_bound_ipif ipsq ipsq * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock @@ -2267,6 +1894,7 @@ typedef struct ill_s { * ill_refresh_tid ill_lock ill_lock * ill_grp (for IPMP ill) write once write once * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock + * ill_mrouter_cnt atomics atomics * * NOTE: It's OK to make heuristic decisions on an underlying interface * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value. 
@@ -2311,7 +1939,6 @@ enum { IF_CMD = 1, LIF_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD }; #define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */ /* unused 0x10 */ #define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */ -#define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */ extern ip_ioctl_cmd_t ip_ndx_ioctl_table[]; extern ip_ioctl_cmd_t ip_misc_ioctl_table[]; @@ -2362,6 +1989,430 @@ typedef struct ipndp_s { char *ip_ndp_name; } ipndp_t; +/* IXA Notification types */ +typedef enum { + IXAN_LSO, /* LSO capability change */ + IXAN_PMTU, /* PMTU change */ + IXAN_ZCOPY /* ZEROCOPY capability change */ +} ixa_notify_type_t; + +typedef uint_t ixa_notify_arg_t; + +typedef void (*ixa_notify_t)(void *, ip_xmit_attr_t *ixa, ixa_notify_type_t, + ixa_notify_arg_t); + +/* + * Attribute flags that are common to the transmit and receive attributes + */ +#define IAF_IS_IPV4 0x80000000 /* ipsec_*_v4 */ +#define IAF_TRUSTED_ICMP 0x40000000 /* ipsec_*_icmp_loopback */ +#define IAF_NO_LOOP_ZONEID_SET 0x20000000 /* Zone that shouldn't have */ + /* a copy */ +#define IAF_LOOPBACK_COPY 0x10000000 /* For multi and broadcast */ + +#define IAF_MASK 0xf0000000 /* Flags that are common */ + +/* + * Transmit side attributes used between the transport protocols and IP as + * well as inside IP. It is also used to cache information in the conn_t i.e. + * replaces conn_ire and the IPsec caching in the conn_t. + */ +struct ip_xmit_attr_s { + iaflags_t ixa_flags; /* IXAF_*. See below */ + + uint32_t ixa_free_flags; /* IXA_FREE_*. See below */ + uint32_t ixa_refcnt; /* Using atomics */ + + /* + * Always initialized independently of ixa_flags settings. + * Used by ip_xmit so we keep them up front for cache locality. + */ + uint32_t ixa_xmit_hint; /* For ECMP and GLD TX ring fanout */ + uint_t ixa_pktlen; /* Always set. 
For frag and stats */ + zoneid_t ixa_zoneid; /* Assumed always set */ + + /* Always set for conn_ip_output(); might be stale */ + /* + * Since TCP keeps the conn_t around past the process going away + * we need to use the "notr" (e.g, ire_refhold_notr) for ixa_ire, + * ixa_nce, and ixa_dce. + */ + ire_t *ixa_ire; /* Forwarding table entry */ + uint_t ixa_ire_generation; + nce_t *ixa_nce; /* Neighbor cache entry */ + dce_t *ixa_dce; /* Destination cache entry */ + uint_t ixa_dce_generation; + uint_t ixa_src_generation; /* If IXAF_VERIFY_SOURCE */ + + uint32_t ixa_src_preferences; /* prefs for src addr select */ + uint32_t ixa_pmtu; /* IXAF_VERIFY_PMTU */ + + /* Set by ULP if IXAF_VERIFY_PMTU; otherwise set by IP */ + uint32_t ixa_fragsize; + + int8_t ixa_use_min_mtu; /* IXAF_USE_MIN_MTU values */ + + pfirepostfrag_t ixa_postfragfn; /* Set internally in IP */ + + in6_addr_t ixa_nexthop_v6; /* IXAF_NEXTHOP_SET */ +#define ixa_nexthop_v4 V4_PART_OF_V6(ixa_nexthop_v6) + + zoneid_t ixa_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ + + uint_t ixa_scopeid; /* For IPv6 link-locals */ + + uint_t ixa_broadcast_ttl; /* IXAF_BROACAST_TTL_SET */ + + uint_t ixa_multicast_ttl; /* Assumed set for multicast */ + uint_t ixa_multicast_ifindex; /* Assumed set for multicast */ + ipaddr_t ixa_multicast_ifaddr; /* Assumed set for multicast */ + + int ixa_raw_cksum_offset; /* If IXAF_SET_RAW_CKSUM */ + + uint32_t ixa_ident; /* For IPv6 fragment header */ + + /* + * Cached LSO information. + */ + ill_lso_capab_t ixa_lso_capab; /* Valid when IXAF_LSO_CAPAB */ + + uint64_t ixa_ipsec_policy_gen; /* Generation from iph_gen */ + /* + * The following IPsec fields are only initialized when + * IXAF_IPSEC_SECURE is set. Otherwise they contain garbage. 
+ */ + ipsec_latch_t *ixa_ipsec_latch; /* Just the ids */ + struct ipsa_s *ixa_ipsec_ah_sa; /* Hard reference SA for AH */ + struct ipsa_s *ixa_ipsec_esp_sa; /* Hard reference SA for ESP */ + struct ipsec_policy_s *ixa_ipsec_policy; /* why are we here? */ + struct ipsec_action_s *ixa_ipsec_action; /* For reflected packets */ + ipsa_ref_t ixa_ipsec_ref[2]; /* Soft reference to SA */ + /* 0: ESP, 1: AH */ + + /* + * The selectors here are potentially different than the SPD rule's + * selectors, and we need to have both available for IKEv2. + * + * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can + * be zero, and the protocol number is needed to make the ports + * significant. + */ + uint16_t ixa_ipsec_src_port; /* Source port number of d-gram. */ + uint16_t ixa_ipsec_dst_port; /* Destination port number of d-gram. */ + uint8_t ixa_ipsec_icmp_type; /* ICMP type of d-gram */ + uint8_t ixa_ipsec_icmp_code; /* ICMP code of d-gram */ + + sa_family_t ixa_ipsec_inaf; /* Inner address family */ +#define IXA_MAX_ADDRLEN 4 /* Max addr len. (in 32-bit words) */ + uint32_t ixa_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ + uint32_t ixa_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ + uint8_t ixa_ipsec_insrcpfx; /* Inner source prefix */ + uint8_t ixa_ipsec_indstpfx; /* Inner destination prefix */ + + uint8_t ixa_ipsec_proto; /* IP protocol number for d-gram. */ + + /* Always initialized independently of ixa_flags settings */ + uint_t ixa_ifindex; /* Assumed always set */ + uint16_t ixa_ip_hdr_length; /* Points to ULP header */ + uint8_t ixa_protocol; /* Protocol number for ULP cksum */ + ts_label_t *ixa_tsl; /* Always set. 
NULL if not TX */ + ip_stack_t *ixa_ipst; /* Always set */ + uint32_t ixa_extra_ident; /* Set if LSO */ + cred_t *ixa_cred; /* For getpeerucred */ + pid_t ixa_cpid; /* For getpeerucred */ + +#ifdef DEBUG + kthread_t *ixa_curthread; /* For serialization assert */ +#endif + squeue_t *ixa_sqp; /* Set from conn_sqp as a hint */ + uintptr_t ixa_cookie; /* cookie to use for tx flow control */ + + /* + * Must be set by ULP if any of IXAF_VERIFY_LSO, IXAF_VERIFY_PMTU, + * or IXAF_VERIFY_ZCOPY is set. + */ + ixa_notify_t ixa_notify; /* Registered upcall notify function */ + void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ +}; + +/* + * Flags to indicate which transmit attributes are set. + * Split into "xxx_SET" ones which indicate that the "xxx" field it set, and + * single flags. + */ +#define IXAF_REACH_CONF 0x00000001 /* Reachability confirmation */ +#define IXAF_BROADCAST_TTL_SET 0x00000002 /* ixa_broadcast_ttl valid */ +#define IXAF_SET_SOURCE 0x00000004 /* Replace if broadcast */ +#define IXAF_USE_MIN_MTU 0x00000008 /* IPV6_USE_MIN_MTU */ + +#define IXAF_DONTFRAG 0x00000010 /* IP*_DONTFRAG */ +#define IXAF_VERIFY_PMTU 0x00000020 /* ixa_pmtu/ixa_fragsize set */ +#define IXAF_PMTU_DISCOVERY 0x00000040 /* Create/use PMTU state */ +#define IXAF_MULTICAST_LOOP 0x00000080 /* IP_MULTICAST_LOOP */ + +#define IXAF_IPSEC_SECURE 0x00000100 /* Need IPsec processing */ +#define IXAF_UCRED_TSL 0x00000200 /* ixa_tsl from SCM_UCRED */ +#define IXAF_DONTROUTE 0x00000400 /* SO_DONTROUTE */ +#define IXAF_NO_IPSEC 0x00000800 /* Ignore policy */ + +#define IXAF_PMTU_TOO_SMALL 0x00001000 /* PMTU too small */ +#define IXAF_SET_ULP_CKSUM 0x00002000 /* Calculate ULP checksum */ +#define IXAF_VERIFY_SOURCE 0x00004000 /* Check that source is ok */ +#define IXAF_NEXTHOP_SET 0x00008000 /* ixa_nexthop set */ + +#define IXAF_PMTU_IPV4_DF 0x00010000 /* Set IPv4 DF */ +#define IXAF_NO_DEV_FLOW_CTL 0x00020000 /* Protocol needs no flow ctl */ +#define IXAF_NO_TTL_CHANGE 0x00040000 /* Internal 
to IP */ +#define IXAF_IPV6_ADD_FRAGHDR 0x00080000 /* Add fragment header */ + +#define IXAF_IPSEC_TUNNEL 0x00100000 /* Tunnel mode */ +#define IXAF_NO_PFHOOK 0x00200000 /* Skip xmit pfhook */ +#define IXAF_NO_TRACE 0x00400000 /* When back from ARP/ND */ +#define IXAF_SCOPEID_SET 0x00800000 /* ixa_scopeid set */ + +#define IXAF_MULTIRT_MULTICAST 0x01000000 /* MULTIRT for multicast */ +#define IXAF_NO_HW_CKSUM 0x02000000 /* Force software cksum */ +#define IXAF_SET_RAW_CKSUM 0x04000000 /* Use ixa_raw_cksum_offset */ +#define IXAF_IPSEC_GLOBAL_POLICY 0x08000000 /* Policy came from global */ + +/* Note the following uses bits 0x10000000 through 0x80000000 */ +#define IXAF_IS_IPV4 IAF_IS_IPV4 +#define IXAF_TRUSTED_ICMP IAF_TRUSTED_ICMP +#define IXAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET +#define IXAF_LOOPBACK_COPY IAF_LOOPBACK_COPY + +/* Note: use the upper 32 bits */ +#define IXAF_VERIFY_LSO 0x100000000 /* Check LSO capability */ +#define IXAF_LSO_CAPAB 0x200000000 /* Capable of LSO */ +#define IXAF_VERIFY_ZCOPY 0x400000000 /* Check Zero Copy capability */ +#define IXAF_ZCOPY_CAPAB 0x800000000 /* Capable of ZEROCOPY */ + +/* + * The normal flags for sending packets e.g., icmp errors + */ +#define IXAF_BASIC_SIMPLE_V4 (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4) +#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM) + +/* + * Normally these fields do not have a hold. But in some cases they do, for + * instance when we've gone through ip_*_attr_to/from_mblk. + * We use ixa_free_flags to indicate that they have a hold and need to be + * released on cleanup. + */ +#define IXA_FREE_CRED 0x00000001 /* ixa_cred needs to be rele */ +#define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ + +/* + * Simplistic way to set the ixa_xmit_hint for locally generated traffic + * and forwarded traffic. 
The shift amount are based on the size of the + * structs to discard the low order bits which don't have much if any variation + * (coloring in kmem_cache_alloc might provide some variation). + * + * Basing the locally generated hint on the address of the conn_t means that + * the packets from the same socket/connection do not get reordered. + * Basing the hint for forwarded traffic on the ill_ring_t means that + * packets from the same NIC+ring are likely to use the same outbound ring + * hence we get low contention on the ring in the transmitting driver. + */ +#define CONN_TO_XMIT_HINT(connp) ((uint32_t)(((uintptr_t)connp) >> 11)) +#define ILL_RING_TO_XMIT_HINT(ring) ((uint32_t)(((uintptr_t)ring) >> 7)) + +/* + * IP set Destination Flags used by function ip_set_destination, + * ip_attr_connect, and conn_connect. + */ +#define IPDF_ALLOW_MCBC 0x1 /* Allow multi/broadcast */ +#define IPDF_VERIFY_DST 0x2 /* Verify destination addr */ +#define IPDF_SELECT_SRC 0x4 /* Select source address */ +#define IPDF_LSO 0x8 /* Try LSO */ +#define IPDF_IPSEC 0x10 /* Set IPsec policy */ +#define IPDF_ZONE_IS_GLOBAL 0x20 /* From conn_zone_is_global */ +#define IPDF_ZCOPY 0x40 /* Try ZEROCOPY */ +#define IPDF_UNIQUE_DCE 0x80 /* Get a per-destination DCE */ + +/* + * Receive side attributes used between the transport protocols and IP as + * well as inside IP. + */ +struct ip_recv_attr_s { + iaflags_t ira_flags; /* See below */ + + uint32_t ira_free_flags; /* IRA_FREE_*. See below */ + + /* + * This is a hint for TCP SYN packets. 
+ * Always initialized independently of ira_flags settings + */ + squeue_t *ira_sqp; + ill_rx_ring_t *ira_ring; /* Internal to IP */ + + /* For ip_accept_tcp when IRAF_TARGET_SQP is set */ + squeue_t *ira_target_sqp; + mblk_t *ira_target_sqp_mp; + + /* Always initialized independently of ira_flags settings */ + uint32_t ira_xmit_hint; /* For ECMP and GLD TX ring fanout */ + zoneid_t ira_zoneid; /* ALL_ZONES unless local delivery */ + uint_t ira_pktlen; /* Always set. For frag and stats */ + uint16_t ira_ip_hdr_length; /* Points to ULP header */ + uint8_t ira_protocol; /* Protocol number for ULP cksum */ + uint_t ira_rifindex; /* Received ifindex */ + uint_t ira_ruifindex; /* Received upper ifindex */ + ts_label_t *ira_tsl; /* Always set. NULL if not TX */ + /* + * ira_rill and ira_ill is set inside IP, but not when conn_recv is + * called; ULPs should use ira_ruifindex instead. + */ + ill_t *ira_rill; /* ill where packet came */ + ill_t *ira_ill; /* ill where IP address hosted */ + cred_t *ira_cred; /* For getpeerucred */ + pid_t ira_cpid; /* For getpeerucred */ + + /* Used when IRAF_VERIFIED_SRC is set; this source was ok */ + ipaddr_t ira_verified_src; + + /* + * The following IPsec fields are only initialized when + * IRAF_IPSEC_SECURE is set. Otherwise they contain garbage. + */ + struct ipsec_action_s *ira_ipsec_action; /* how we made it in.. */ + struct ipsa_s *ira_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *ira_ipsec_esp_sa; /* SA for ESP */ + + ipaddr_t ira_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ + + zoneid_t ira_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ + + uint32_t ira_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ + + /* + * For IP_RECVSLLA and ip_ndp_conflict/find_solicitation. + * Same size as max for sockaddr_dl + */ +#define IRA_L2SRC_SIZE 244 + uint8_t ira_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ + + /* + * Local handle that we use to do lazy setting of ira_l2src. 
+ * We defer setting l2src until needed but we do before any + * ip_input pullupmsg or copymsg. + */ + struct mac_header_info_s *ira_mhip; /* Could be NULL */ +}; + +/* + * Flags to indicate which receive attributes are set. + */ +#define IRAF_SYSTEM_LABELED 0x00000001 /* is_system_labeled() */ +#define IRAF_IPV4_OPTIONS 0x00000002 /* Performance */ +#define IRAF_MULTICAST 0x00000004 /* Was multicast at L3 */ +#define IRAF_BROADCAST 0x00000008 /* Was broadcast at L3 */ +#define IRAF_MULTIBROADCAST (IRAF_MULTICAST|IRAF_BROADCAST) + +#define IRAF_LOOPBACK 0x00000010 /* Looped back by IP */ +#define IRAF_VERIFY_IP_CKSUM 0x00000020 /* Need to verify IP */ +#define IRAF_VERIFY_ULP_CKSUM 0x00000040 /* Need to verify TCP,UDP,etc */ +#define IRAF_SCTP_CSUM_ERR 0x00000080 /* sctp pkt has failed chksum */ + +#define IRAF_IPSEC_SECURE 0x00000100 /* Passed AH and/or ESP */ +#define IRAF_DHCP_UNICAST 0x00000200 +#define IRAF_IPSEC_DECAPS 0x00000400 /* Was packet decapsulated */ + /* from a matching inner packet? 
*/ +#define IRAF_TARGET_SQP 0x00000800 /* ira_target_sqp is set */ +#define IRAF_VERIFIED_SRC 0x00001000 /* ira_verified_src set */ +#define IRAF_RSVP 0x00002000 /* RSVP packet for rsvpd */ +#define IRAF_MROUTE_TUNNEL_SET 0x00004000 /* From ip_mroute_decap */ +#define IRAF_PIM_REGISTER 0x00008000 /* From register_mforward */ + +#define IRAF_TX_MAC_EXEMPTABLE 0x00010000 /* Allow MAC_EXEMPT readdown */ +#define IRAF_TX_SHARED_ADDR 0x00020000 /* Arrived on ALL_ZONES addr */ +#define IRAF_ESP_UDP_PORTS 0x00040000 /* NAT-traversal packet */ +#define IRAF_NO_HW_CKSUM 0x00080000 /* Force software cksum */ + +#define IRAF_ICMP_ERROR 0x00100000 /* Send to conn_recvicmp */ +#define IRAF_ROUTER_ALERT 0x00200000 /* IPv6 router alert */ +#define IRAF_L2SRC_SET 0x00400000 /* ira_l2src has been set */ +#define IRAF_L2SRC_LOOPBACK 0x00800000 /* Came from us */ + +#define IRAF_L2DST_MULTICAST 0x01000000 /* Multicast at L2 */ +#define IRAF_L2DST_BROADCAST 0x02000000 /* Broadcast at L2 */ +/* Unused 0x04000000 */ +/* Unused 0x08000000 */ + +/* Below starts with 0x10000000 */ +#define IRAF_IS_IPV4 IAF_IS_IPV4 +#define IRAF_TRUSTED_ICMP IAF_TRUSTED_ICMP +#define IRAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET +#define IRAF_LOOPBACK_COPY IAF_LOOPBACK_COPY + +/* + * Normally these fields do not have a hold. But in some cases they do, for + * instance when we've gone through ip_*_attr_to/from_mblk. + * We use ira_free_flags to indicate that they have a hold and need to be + * released on cleanup. + */ +#define IRA_FREE_CRED 0x00000001 /* ira_cred needs to be rele */ +#define IRA_FREE_TSL 0x00000002 /* ira_tsl needs to be rele */ + +/* + * Optional destination cache entry for path MTU information, + * and ULP metrics. + */ +struct dce_s { + uint_t dce_generation; /* Changed since cached? */ + uint_t dce_flags; /* See below */ + uint_t dce_ipversion; /* IPv4/IPv6 version */ + uint32_t dce_pmtu; /* Path MTU if DCEF_PMTU */ + uint32_t dce_ident; /* Per destination IP ident. 
*/ + iulp_t dce_uinfo; /* Metrics if DCEF_UINFO */ + + struct dce_s *dce_next; + struct dce_s **dce_ptpn; + struct dcb_s *dce_bucket; + + union { + in6_addr_t dceu_v6addr; + ipaddr_t dceu_v4addr; + } dce_u; +#define dce_v4addr dce_u.dceu_v4addr +#define dce_v6addr dce_u.dceu_v6addr + /* Note that for IPv6+IPMP we use the ifindex for the upper interface */ + uint_t dce_ifindex; /* For IPv6 link-locals */ + + kmutex_t dce_lock; + uint_t dce_refcnt; + uint64_t dce_last_change_time; /* Path MTU. In seconds */ + + ip_stack_t *dce_ipst; /* Does not have a netstack_hold */ +}; + +/* + * Values for dce_generation. + * + * If a DCE has DCE_GENERATION_CONDEMNED, the last dce_refrele should delete + * it. + * + * DCE_GENERATION_VERIFY is never stored in dce_generation but it is + * stored in places that cache DCE (such as ixa_dce_generation). + * It is used as a signal that the cache is stale and needs to be reverified. + */ +#define DCE_GENERATION_CONDEMNED 0 +#define DCE_GENERATION_VERIFY 1 +#define DCE_GENERATION_INITIAL 2 +#define DCE_IS_CONDEMNED(dce) \ + ((dce)->dce_generation == DCE_GENERATION_CONDEMNED) + + +/* + * Values for ips_src_generation. + * + * SRC_GENERATION_VERIFY is never stored in ips_src_generation but it is + * stored in places that cache IREs (ixa_src_generation). It is used as a + * signal that the cache is stale and needs to be reverified. + */ +#define SRC_GENERATION_VERIFY 0 +#define SRC_GENERATION_INITIAL 1 + /* * The kernel stores security attributes of all gateways in a database made * up of one or more tsol_gcdb_t elements. 
Each tsol_gcdb_t contains the @@ -2453,183 +2504,28 @@ extern kmutex_t gcgrp_lock; */ struct tsol_tnrhc; -typedef struct tsol_ire_gw_secattr_s { +struct tsol_ire_gw_secattr_s { kmutex_t igsa_lock; /* lock to protect following */ struct tsol_tnrhc *igsa_rhc; /* host entry for gateway */ tsol_gc_t *igsa_gc; /* for prefix IREs */ - tsol_gcgrp_t *igsa_gcgrp; /* for cache IREs */ -} tsol_ire_gw_secattr_t; - -/* - * Following are the macros to increment/decrement the reference - * count of the IREs and IRBs (ire bucket). - * - * 1) We bump up the reference count of an IRE to make sure that - * it does not get deleted and freed while we are using it. - * Typically all the lookup functions hold the bucket lock, - * and look for the IRE. If it finds an IRE, it bumps up the - * reference count before dropping the lock. Sometimes we *may* want - * to bump up the reference count after we *looked* up i.e without - * holding the bucket lock. So, the IRE_REFHOLD macro does not assert - * on the bucket lock being held. Any thread trying to delete from - * the hash bucket can still do so but cannot free the IRE if - * ire_refcnt is not 0. - * - * 2) We bump up the reference count on the bucket where the IRE resides - * (IRB), when we want to prevent the IREs getting deleted from a given - * hash bucket. This makes life easier for ire_walk type functions which - * wants to walk the IRE list, call a function, but needs to drop - * the bucket lock to prevent recursive rw_enters. While the - * lock is dropped, the list could be changed by other threads or - * the same thread could end up deleting the ire or the ire pointed by - * ire_next. IRE_REFHOLDing the ire or ire_next is not sufficient as - * a delete will still remove the ire from the bucket while we have - * dropped the lock and hence the ire_next would be NULL. Thus, we - * need a mechanism to prevent deletions from a given bucket. - * - * To prevent deletions, we bump up the reference count on the - * bucket. 
If the bucket is held, ire_delete just marks IRE_MARK_CONDEMNED - * both on the ire's ire_marks and the bucket's irb_marks. When the - * reference count on the bucket drops to zero, all the CONDEMNED ires - * are deleted. We don't have to bump up the reference count on the - * bucket if we are walking the bucket and never have to drop the bucket - * lock. Note that IRB_REFHOLD does not prevent addition of new ires - * in the list. It is okay because addition of new ires will not cause - * ire_next to point to freed memory. We do IRB_REFHOLD only when - * all of the 3 conditions are true : - * - * 1) The code needs to walk the IRE bucket from start to end. - * 2) It may have to drop the bucket lock sometimes while doing (1) - * 3) It does not want any ires to be deleted meanwhile. - */ - -/* - * Bump up the reference count on the IRE. We cannot assert that the - * bucket lock is being held as it is legal to bump up the reference - * count after the first lookup has returned the IRE without - * holding the lock. Currently ip_wput does this for caching IRE_CACHEs. - */ - -#ifdef DEBUG -#define IRE_UNTRACE_REF(ire) ire_untrace_ref(ire); -#define IRE_TRACE_REF(ire) ire_trace_ref(ire); -#else -#define IRE_UNTRACE_REF(ire) -#define IRE_TRACE_REF(ire) -#endif - -#define IRE_REFHOLD_NOTR(ire) { \ - atomic_add_32(&(ire)->ire_refcnt, 1); \ - ASSERT((ire)->ire_refcnt != 0); \ -} - -#define IRE_REFHOLD(ire) { \ - IRE_REFHOLD_NOTR(ire); \ - IRE_TRACE_REF(ire); \ -} - -#define IRE_REFHOLD_LOCKED(ire) { \ - IRE_TRACE_REF(ire); \ - (ire)->ire_refcnt++; \ -} - -/* - * Decrement the reference count on the IRE. - * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - * - * NOTE : This macro is used only in places where we want performance. 
- * To avoid bloating the code, we use the function "ire_refrele" - * which essentially calls the macro. - */ -#define IRE_REFRELE_NOTR(ire) { \ - ASSERT((ire)->ire_refcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) \ - ire_inactive(ire); \ -} - -#define IRE_REFRELE(ire) { \ - if (ire->ire_bucket != NULL) { \ - IRE_UNTRACE_REF(ire); \ - } \ - IRE_REFRELE_NOTR(ire); \ -} - -/* - * Bump up the reference count on the hash bucket - IRB to - * prevent ires from being deleted in this bucket. - */ -#define IRB_REFHOLD(irb) { \ - rw_enter(&(irb)->irb_lock, RW_WRITER); \ - (irb)->irb_refcnt++; \ - ASSERT((irb)->irb_refcnt != 0); \ - rw_exit(&(irb)->irb_lock); \ -} -#define IRB_REFHOLD_LOCKED(irb) { \ - ASSERT(RW_WRITE_HELD(&(irb)->irb_lock)); \ - (irb)->irb_refcnt++; \ - ASSERT((irb)->irb_refcnt != 0); \ -} +}; void irb_refrele_ftable(irb_t *); -/* - * Note: when IRB_MARK_FTABLE (i.e., IRE_CACHETABLE entry), the irb_t - * is statically allocated, so that when the irb_refcnt goes to 0, - * we simply clean up the ire list and continue. 
- */ -#define IRB_REFRELE(irb) { \ - if ((irb)->irb_marks & IRB_MARK_FTABLE) { \ - irb_refrele_ftable((irb)); \ - } else { \ - rw_enter(&(irb)->irb_lock, RW_WRITER); \ - ASSERT((irb)->irb_refcnt != 0); \ - if (--(irb)->irb_refcnt == 0 && \ - ((irb)->irb_marks & IRE_MARK_CONDEMNED)) { \ - ire_t *ire_list; \ - \ - ire_list = ire_unlink(irb); \ - rw_exit(&(irb)->irb_lock); \ - ASSERT(ire_list != NULL); \ - ire_cleanup(ire_list); \ - } else { \ - rw_exit(&(irb)->irb_lock); \ - } \ - } \ -} extern struct kmem_cache *rt_entry_cache; -/* - * Lock the fast path mp for access, since the fp_mp can be deleted - * due a DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST - */ - -#define LOCK_IRE_FP_MP(ire) { \ - if ((ire)->ire_type == IRE_BROADCAST) \ - mutex_enter(&ire->ire_nce->nce_lock); \ - } -#define UNLOCK_IRE_FP_MP(ire) { \ - if ((ire)->ire_type == IRE_BROADCAST) \ - mutex_exit(&ire->ire_nce->nce_lock); \ - } - typedef struct ire4 { - ipaddr_t ire4_src_addr; /* Source address to use. */ ipaddr_t ire4_mask; /* Mask for matching this IRE. */ ipaddr_t ire4_addr; /* Address this IRE represents. */ - ipaddr_t ire4_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */ - ipaddr_t ire4_cmask; /* Mask from parent prefix route */ + ipaddr_t ire4_gateway_addr; /* Gateway including for IRE_ONLINK */ + ipaddr_t ire4_setsrc_addr; /* RTF_SETSRC */ } ire4_t; typedef struct ire6 { - in6_addr_t ire6_src_addr; /* Source address to use. */ in6_addr_t ire6_mask; /* Mask for matching this IRE. */ in6_addr_t ire6_addr; /* Address this IRE represents. 
*/ - in6_addr_t ire6_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */ - in6_addr_t ire6_cmask; /* Mask from parent prefix route */ + in6_addr_t ire6_gateway_addr; /* Gateway including for IRE_ONLINK */ + in6_addr_t ire6_setsrc_addr; /* RTF_SETSRC */ } ire6_t; typedef union ire_addr { @@ -2637,115 +2533,131 @@ typedef union ire_addr { ire4_t ire4_u; } ire_addr_u_t; -/* Internet Routing Entry */ -typedef struct ire_s { +/* + * Internet Routing Entry + * When we have multiple identical IREs we logically add them by manipulating + * ire_identical_ref and ire_delete first decrements + * that and when it reaches 1 we know it is the last IRE. + * "identical" is defined as being the same for: + * ire_addr, ire_netmask, ire_gateway, ire_ill, ire_zoneid, and ire_type + * For instance, multiple IRE_BROADCASTs for the same subnet number are + * viewed as identical, and so are the IRE_INTERFACEs when there are + * multiple logical interfaces (on the same ill) with the same subnet prefix. + */ +struct ire_s { struct ire_s *ire_next; /* The hash chain must be first. */ struct ire_s **ire_ptpn; /* Pointer to previous next. */ uint32_t ire_refcnt; /* Number of references */ - mblk_t *ire_mp; /* Non-null if allocated as mblk */ - queue_t *ire_rfq; /* recv from this queue */ - queue_t *ire_stq; /* send to this queue */ - union { - uint_t *max_fragp; /* Used only during ire creation */ - uint_t max_frag; /* MTU (next hop or path). */ - } imf_u; -#define ire_max_frag imf_u.max_frag -#define ire_max_fragp imf_u.max_fragp - uint32_t ire_frag_flag; /* IPH_DF or zero. */ - uint32_t ire_ident; /* Per IRE IP ident. */ - uint32_t ire_tire_mark; /* Used for reclaim of unused. */ + ill_t *ire_ill; + uint32_t ire_identical_ref; /* IRE_INTERFACE, IRE_BROADCAST */ uchar_t ire_ipversion; /* IPv4/IPv6 version */ - uchar_t ire_marks; /* IRE_MARK_CONDEMNED etc. 
*/ ushort_t ire_type; /* Type of IRE */ + uint_t ire_generation; /* Generation including CONDEMNED */ uint_t ire_ib_pkt_count; /* Inbound packets for ire_addr */ uint_t ire_ob_pkt_count; /* Outbound packets to ire_addr */ - uint_t ire_ll_hdr_length; /* Non-zero if we do M_DATA prepends */ time_t ire_create_time; /* Time (in secs) IRE was created. */ - uint32_t ire_phandle; /* Associate prefix IREs to cache */ - uint32_t ire_ihandle; /* Associate interface IREs to cache */ - ipif_t *ire_ipif; /* the interface that this ire uses */ uint32_t ire_flags; /* flags related to route (RTF_*) */ /* - * Neighbor Cache Entry for IPv6; arp info for IPv4 + * ire_testhidden is TRUE for INTERFACE IREs of IS_UNDER_IPMP(ill) + * interfaces */ - struct nce_s *ire_nce; + boolean_t ire_testhidden; + pfirerecv_t ire_recvfn; /* Receive side handling */ + pfiresend_t ire_sendfn; /* Send side handling */ + pfirepostfrag_t ire_postfragfn; /* Bottom end of send handling */ + uint_t ire_masklen; /* # bits in ire_mask{,_v6} */ ire_addr_u_t ire_u; /* IPv4/IPv6 address info. */ irb_t *ire_bucket; /* Hash bucket when ire_ptphn is set */ - iulp_t ire_uinfo; /* Upper layer protocol info. */ - /* - * Protects ire_uinfo, ire_max_frag, and ire_frag_flag. - */ kmutex_t ire_lock; - uint_t ire_ipif_seqid; /* ipif_seqid of ire_ipif */ - uint_t ire_ipif_ifindex; /* ifindex associated with ipif */ - clock_t ire_last_used_time; /* Last used time */ + clock_t ire_last_used_time; /* For IRE_LOCAL reception */ tsol_ire_gw_secattr_t *ire_gw_secattr; /* gateway security attributes */ - zoneid_t ire_zoneid; /* for local address discrimination */ + zoneid_t ire_zoneid; + + /* + * Cached information of where to send packets that match this route. + * The ire_dep_* information is used to determine when ire_nce_cache + * needs to be updated. 
+ * ire_nce_cache is the fastpath for the Neighbor Cache Entry + * for IPv6; arp info for IPv4 + * Since this is a cache setup and torn down independently of + * applications we need to use nce_ref{rele,hold}_notr for it. + */ + nce_t *ire_nce_cache; + + /* + * Quick check whether the ire_type and ire_masklen indicates + * that the IRE can have ire_nce_cache set i.e., whether it is + * IRE_ONLINK and for a single destination. + */ + boolean_t ire_nce_capable; + /* - * ire's that are embedded inside mblk_t and sent to the external - * resolver use the ire_stq_ifindex to track the ifindex of the - * ire_stq, so that the ill (if it exists) can be correctly recovered - * for cleanup in the esbfree routine when arp failure occurs. - * Similarly, the ire_stackid is used to recover the ip_stack_t. + * Dependency tracking so we can safely cache IRE and NCE pointers + * in offlink and onlink IREs. + * These are locked under the ips_ire_dep_lock rwlock. Write held + * when modifying the linkage. + * ire_dep_parent (Also chain towards IRE for nexthop) + * ire_dep_parent_generation: ire_generation of ire_dep_parent + * ire_dep_children (From parent to first child) + * ire_dep_sib_next (linked list of siblings) + * ire_dep_sib_ptpn (linked list of siblings) + * + * The parent has a ire_refhold on each child, and each child has + * an ire_refhold on its parent. + * Since ire_dep_parent is a cache setup and torn down independently of + * applications we need to use ire_ref{rele,hold}_notr for it. 
*/ - uint_t ire_stq_ifindex; - netstackid_t ire_stackid; + ire_t *ire_dep_parent; + ire_t *ire_dep_children; + ire_t *ire_dep_sib_next; + ire_t **ire_dep_sib_ptpn; /* Pointer to previous next */ + uint_t ire_dep_parent_generation; + + uint_t ire_badcnt; /* Number of times ND_UNREACHABLE */ + uint64_t ire_last_badcnt; /* In seconds */ + + /* ire_defense* and ire_last_used_time are only used on IRE_LOCALs */ uint_t ire_defense_count; /* number of ARP conflicts */ uint_t ire_defense_time; /* last time defended (secs) */ + boolean_t ire_trace_disable; /* True when alloc fails */ ip_stack_t *ire_ipst; /* Does not have a netstack_hold */ -} ire_t; + iulp_t ire_metrics; +}; /* IPv4 compatibility macros */ -#define ire_src_addr ire_u.ire4_u.ire4_src_addr #define ire_mask ire_u.ire4_u.ire4_mask #define ire_addr ire_u.ire4_u.ire4_addr #define ire_gateway_addr ire_u.ire4_u.ire4_gateway_addr -#define ire_cmask ire_u.ire4_u.ire4_cmask +#define ire_setsrc_addr ire_u.ire4_u.ire4_setsrc_addr -#define ire_src_addr_v6 ire_u.ire6_u.ire6_src_addr #define ire_mask_v6 ire_u.ire6_u.ire6_mask #define ire_addr_v6 ire_u.ire6_u.ire6_addr #define ire_gateway_addr_v6 ire_u.ire6_u.ire6_gateway_addr -#define ire_cmask_v6 ire_u.ire6_u.ire6_cmask - -/* Convenient typedefs for sockaddrs */ -typedef struct sockaddr_in sin_t; -typedef struct sockaddr_in6 sin6_t; - -/* Address structure used for internal bind with IP */ -typedef struct ipa_conn_s { - ipaddr_t ac_laddr; - ipaddr_t ac_faddr; - uint16_t ac_fport; - uint16_t ac_lport; -} ipa_conn_t; - -typedef struct ipa6_conn_s { - in6_addr_t ac6_laddr; - in6_addr_t ac6_faddr; - uint16_t ac6_fport; - uint16_t ac6_lport; -} ipa6_conn_t; +#define ire_setsrc_addr_v6 ire_u.ire6_u.ire6_setsrc_addr /* - * Using ipa_conn_x_t or ipa6_conn_x_t allows us to modify the behavior of IP's - * bind handler. + * Values for ire_generation. + * + * If an IRE is marked with IRE_IS_CONDEMNED, the last walker of + * the bucket should delete this IRE from this bucket. 
+ * + * IRE_GENERATION_VERIFY is never stored in ire_generation but it is + * stored in places that cache IREs (such as ixa_ire_generation and + * ire_dep_parent_generation). It is used as a signal that the cache is + * stale and needs to be reverified. */ -typedef struct ipa_conn_extended_s { - uint64_t acx_flags; - ipa_conn_t acx_conn; -} ipa_conn_x_t; +#define IRE_GENERATION_CONDEMNED 0 +#define IRE_GENERATION_VERIFY 1 +#define IRE_GENERATION_INITIAL 2 +#define IRE_IS_CONDEMNED(ire) \ + ((ire)->ire_generation == IRE_GENERATION_CONDEMNED) -typedef struct ipa6_conn_extended_s { - uint64_t ac6x_flags; - ipa6_conn_t ac6x_conn; -} ipa6_conn_x_t; - -/* flag values for ipa_conn_x_t and ipa6_conn_x_t. */ -#define ACX_VERIFY_DST 0x1ULL /* verify destination address is reachable */ +/* Convenient typedefs for sockaddrs */ +typedef struct sockaddr_in sin_t; +typedef struct sockaddr_in6 sin6_t; /* Name/Value Descriptor. */ typedef struct nv_s { @@ -2784,110 +2696,83 @@ extern uint_t ip_max_frag_dups; * to support the needs of such tools and private definitions moved to * private headers. 
*/ -struct ip6_pkt_s { +struct ip_pkt_s { uint_t ipp_fields; /* Which fields are valid */ - uint_t ipp_sticky_ignored; /* sticky fields to ignore */ - uint_t ipp_ifindex; /* pktinfo ifindex */ in6_addr_t ipp_addr; /* pktinfo src/dst addr */ - uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS */ - uint_t ipp_multicast_hops; /* IPV6_MULTICAST_HOPS */ +#define ipp_addr_v4 V4_PART_OF_V6(ipp_addr) + uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS, IP_TTL */ uint_t ipp_hoplimit; /* IPV6_HOPLIMIT */ uint_t ipp_hopoptslen; - uint_t ipp_rtdstoptslen; + uint_t ipp_rthdrdstoptslen; uint_t ipp_rthdrlen; uint_t ipp_dstoptslen; - uint_t ipp_pathmtulen; uint_t ipp_fraghdrlen; ip6_hbh_t *ipp_hopopts; - ip6_dest_t *ipp_rtdstopts; + ip6_dest_t *ipp_rthdrdstopts; ip6_rthdr_t *ipp_rthdr; ip6_dest_t *ipp_dstopts; ip6_frag_t *ipp_fraghdr; - struct ip6_mtuinfo *ipp_pathmtu; - in6_addr_t ipp_nexthop; /* Transmit only */ - uint8_t ipp_tclass; - int8_t ipp_use_min_mtu; + uint8_t ipp_tclass; /* IPV6_TCLASS */ + uint8_t ipp_type_of_service; /* IP_TOS */ + uint_t ipp_ipv4_options_len; /* Len of IPv4 options */ + uint8_t *ipp_ipv4_options; /* Ptr to IPv4 options */ + uint_t ipp_label_len_v4; /* Len of TX label for IPv4 */ + uint8_t *ipp_label_v4; /* TX label for IPv4 */ + uint_t ipp_label_len_v6; /* Len of TX label for IPv6 */ + uint8_t *ipp_label_v6; /* TX label for IPv6 */ }; -typedef struct ip6_pkt_s ip6_pkt_t; - -extern void ip6_pkt_free(ip6_pkt_t *); /* free storage inside ip6_pkt_t */ - -/* - * This struct is used by ULP_opt_set() functions to return value of IPv4 - * ancillary options. Currently this is only used by udp and icmp and only - * IP_PKTINFO option is supported. - */ -typedef struct ip4_pkt_s { - uint_t ip4_ill_index; /* interface index */ - ipaddr_t ip4_addr; /* source address */ -} ip4_pkt_t; - -/* - * Used by ULP's to pass options info to ip_output - * currently only IP_PKTINFO is supported. 
- */ -typedef struct ip_opt_info_s { - uint_t ip_opt_ill_index; - uint_t ip_opt_flags; -} ip_opt_info_t; - -/* - * value for ip_opt_flags - */ -#define IP_VERIFY_SRC 0x1 +typedef struct ip_pkt_s ip_pkt_t; -/* - * This structure is used to convey information from IP and the ULP. - * Currently used for the IP_RECVSLLA, IP_RECVIF and IP_RECVPKTINFO options. - * The type of information field is set to IN_PKTINFO (i.e inbound pkt info) - */ -typedef struct ip_pktinfo { - uint32_t ip_pkt_ulp_type; /* type of info sent */ - uint32_t ip_pkt_flags; /* what is sent up by IP */ - uint32_t ip_pkt_ifindex; /* inbound interface index */ - struct sockaddr_dl ip_pkt_slla; /* has source link layer addr */ - struct in_addr ip_pkt_match_addr; /* matched address */ -} ip_pktinfo_t; - -/* - * flags to tell UDP what IP is sending; in_pkt_flags - */ -#define IPF_RECVIF 0x01 /* inbound interface index */ -#define IPF_RECVSLLA 0x02 /* source link layer address */ -/* - * Inbound interface index + matched address. - * Used only by IPV4. 
- */ -#define IPF_RECVADDR 0x04 +extern void ip_pkt_free(ip_pkt_t *); /* free storage inside ip_pkt_t */ +extern ipaddr_t ip_pkt_source_route_v4(const ip_pkt_t *); +extern in6_addr_t *ip_pkt_source_route_v6(const ip_pkt_t *); +extern int ip_pkt_copy(ip_pkt_t *, ip_pkt_t *, int); +extern void ip_pkt_source_route_reverse_v4(ip_pkt_t *); /* ipp_fields values */ -#define IPPF_IFINDEX 0x0001 /* Part of in6_pktinfo: ifindex */ -#define IPPF_ADDR 0x0002 /* Part of in6_pktinfo: src/dst addr */ -#define IPPF_SCOPE_ID 0x0004 /* Add xmit ip6i_t for sin6_scope_id */ -#define IPPF_NO_CKSUM 0x0008 /* Add xmit ip6i_t for IP6I_NO_*_CKSUM */ - -#define IPPF_RAW_CKSUM 0x0010 /* Add xmit ip6i_t for IP6I_RAW_CHECKSUM */ -#define IPPF_HOPLIMIT 0x0020 -#define IPPF_HOPOPTS 0x0040 -#define IPPF_RTHDR 0x0080 - -#define IPPF_RTDSTOPTS 0x0100 -#define IPPF_DSTOPTS 0x0200 -#define IPPF_NEXTHOP 0x0400 -#define IPPF_PATHMTU 0x0800 - -#define IPPF_TCLASS 0x1000 -#define IPPF_DONTFRAG 0x2000 -#define IPPF_USE_MIN_MTU 0x04000 -#define IPPF_MULTICAST_HOPS 0x08000 - -#define IPPF_UNICAST_HOPS 0x10000 -#define IPPF_FRAGHDR 0x20000 - -#define IPPF_HAS_IP6I \ - (IPPF_IFINDEX|IPPF_ADDR|IPPF_NEXTHOP|IPPF_SCOPE_ID| \ - IPPF_NO_CKSUM|IPPF_RAW_CKSUM|IPPF_HOPLIMIT|IPPF_DONTFRAG| \ - IPPF_USE_MIN_MTU|IPPF_MULTICAST_HOPS|IPPF_UNICAST_HOPS) +#define IPPF_ADDR 0x0001 /* Part of in6_pktinfo: src/dst addr */ +#define IPPF_HOPLIMIT 0x0002 /* Overrides unicast and multicast */ +#define IPPF_TCLASS 0x0004 /* Overrides class in sin6_flowinfo */ + +#define IPPF_HOPOPTS 0x0010 /* ipp_hopopts set */ +#define IPPF_RTHDR 0x0020 /* ipp_rthdr set */ +#define IPPF_RTHDRDSTOPTS 0x0040 /* ipp_rthdrdstopts set */ +#define IPPF_DSTOPTS 0x0080 /* ipp_dstopts set */ + +#define IPPF_IPV4_OPTIONS 0x0100 /* ipp_ipv4_options set */ +#define IPPF_LABEL_V4 0x0200 /* ipp_label_v4 set */ +#define IPPF_LABEL_V6 0x0400 /* ipp_label_v6 set */ + +#define IPPF_FRAGHDR 0x0800 /* Used for IPsec receive side */ + +/* + * Data structure which is 
passed to conn_opt_get/set. + * The conn_t is included even though it can be inferred from queue_t. + * setsockopt and getsockopt use conn_ixa and conn_xmit_ipp. However, + * when handling ancillary data we use separate ixa and ipps. + */ +typedef struct conn_opt_arg_s { + conn_t *coa_connp; + ip_xmit_attr_t *coa_ixa; + ip_pkt_t *coa_ipp; + boolean_t coa_ancillary; /* Ancillary data and not setsockopt */ + uint_t coa_changed; /* See below */ +} conn_opt_arg_t; + +/* + * Flags for what changed. + * If we want to be more efficient in the future we can have more fine + * grained flags e.g., a flag for just IP_TOS changing. + * For now we either call ip_set_destination (for "route changed") + * and/or conn_build_hdr_template/conn_prepend_hdr (for "header changed"). + */ +#define COA_HEADER_CHANGED 0x0001 +#define COA_ROUTE_CHANGED 0x0002 +#define COA_RCVBUF_CHANGED 0x0004 /* SO_RCVBUF */ +#define COA_SNDBUF_CHANGED 0x0008 /* SO_SNDBUF */ +#define COA_WROFF_CHANGED 0x0010 /* Header size changed */ +#define COA_ICMP_BIND_NEEDED 0x0020 +#define COA_OOBINLINE_CHANGED 0x0040 #define TCP_PORTS_OFFSET 0 #define UDP_PORTS_OFFSET 0 @@ -2902,32 +2787,21 @@ typedef struct ip_pktinfo { #define IPIF_LOOKUP_FAILED 2 /* Used as error code */ #define ILL_CAN_LOOKUP(ill) \ - (!((ill)->ill_state_flags & (ILL_CONDEMNED | ILL_CHANGING)) || \ + (!((ill)->ill_state_flags & ILL_CONDEMNED) || \ IAM_WRITER_ILL(ill)) -#define ILL_CAN_WAIT(ill, q) \ - (((q) != NULL) && !((ill)->ill_state_flags & (ILL_CONDEMNED))) +#define ILL_IS_CONDEMNED(ill) \ + ((ill)->ill_state_flags & ILL_CONDEMNED) #define IPIF_CAN_LOOKUP(ipif) \ - (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED | IPIF_CHANGING)) || \ + (!((ipif)->ipif_state_flags & IPIF_CONDEMNED) || \ IAM_WRITER_IPIF(ipif)) -/* - * If the parameter 'q' is NULL, the caller is not interested in wait and - * restart of the operation if the ILL or IPIF cannot be looked up when it is - * marked as 'CHANGING'. 
Typically a thread that tries to send out data will - * end up passing NULLs as the last 4 parameters to ill_lookup_on_ifindex and - * in this case 'q' is NULL - */ -#define IPIF_CAN_WAIT(ipif, q) \ - (((q) != NULL) && !((ipif)->ipif_state_flags & (IPIF_CONDEMNED))) - -#define IPIF_CAN_LOOKUP_WALKER(ipif) \ - (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \ - IAM_WRITER_IPIF(ipif)) +#define IPIF_IS_CONDEMNED(ipif) \ + ((ipif)->ipif_state_flags & IPIF_CONDEMNED) -#define ILL_UNMARK_CHANGING(ill) \ - (ill)->ill_state_flags &= ~ILL_CHANGING; +#define IPIF_IS_CHANGING(ipif) \ + ((ipif)->ipif_state_flags & IPIF_CHANGING) /* Macros used to assert that this thread is a writer */ #define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread) @@ -2956,9 +2830,9 @@ typedef struct ip_pktinfo { #define RELEASE_ILL_LOCKS(ill_1, ill_2) \ { \ if (ill_1 != NULL) \ - mutex_exit(&(ill_1)->ill_lock); \ + mutex_exit(&(ill_1)->ill_lock); \ if (ill_2 != NULL && ill_2 != ill_1) \ - mutex_exit(&(ill_2)->ill_lock); \ + mutex_exit(&(ill_2)->ill_lock); \ } /* Get the other protocol instance ill */ @@ -2975,20 +2849,13 @@ typedef struct cmd_info_s struct lifreq *ci_lifr; /* the lifreq struct passed down */ } cmd_info_t; -/* - * List of AH and ESP IPsec acceleration capable ills - */ -typedef struct ipsec_capab_ill_s { - uint_t ill_index; - boolean_t ill_isv6; - struct ipsec_capab_ill_s *next; -} ipsec_capab_ill_t; - extern struct kmem_cache *ire_cache; extern ipaddr_t ip_g_all_ones; -extern uint_t ip_loopback_mtu; /* /etc/system */ +extern uint_t ip_loopback_mtu; /* /etc/system */ +extern uint_t ip_loopback_mtuplus; +extern uint_t ip_loopback_mtu_v6plus; extern vmem_t *ip_minor_arena_sa; extern vmem_t *ip_minor_arena_la; @@ -3014,18 +2881,18 @@ extern vmem_t *ip_minor_arena_la; #define ips_ip_g_send_redirects ips_param_arr[5].ip_param_value #define ips_ip_g_forward_directed_bcast ips_param_arr[6].ip_param_value #define ips_ip_mrtdebug ips_param_arr[7].ip_param_value -#define 
ips_ip_timer_interval ips_param_arr[8].ip_param_value -#define ips_ip_ire_arp_interval ips_param_arr[9].ip_param_value -#define ips_ip_ire_redir_interval ips_param_arr[10].ip_param_value +#define ips_ip_ire_reclaim_fraction ips_param_arr[8].ip_param_value +#define ips_ip_nce_reclaim_fraction ips_param_arr[9].ip_param_value +#define ips_ip_dce_reclaim_fraction ips_param_arr[10].ip_param_value #define ips_ip_def_ttl ips_param_arr[11].ip_param_value #define ips_ip_forward_src_routed ips_param_arr[12].ip_param_value #define ips_ip_wroff_extra ips_param_arr[13].ip_param_value -#define ips_ip_ire_pathmtu_interval ips_param_arr[14].ip_param_value +#define ips_ip_pathmtu_interval ips_param_arr[14].ip_param_value #define ips_ip_icmp_return ips_param_arr[15].ip_param_value #define ips_ip_path_mtu_discovery ips_param_arr[16].ip_param_value -#define ips_ip_ignore_delete_time ips_param_arr[17].ip_param_value +#define ips_ip_pmtu_min ips_param_arr[17].ip_param_value #define ips_ip_ignore_redirect ips_param_arr[18].ip_param_value -#define ips_ip_output_queue ips_param_arr[19].ip_param_value +#define ips_ip_arp_icmp_error ips_param_arr[19].ip_param_value #define ips_ip_broadcast_ttl ips_param_arr[20].ip_param_value #define ips_ip_icmp_err_interval ips_param_arr[21].ip_param_value #define ips_ip_icmp_err_burst ips_param_arr[22].ip_param_value @@ -3046,7 +2913,7 @@ extern vmem_t *ip_minor_arena_la; #define ips_ipv6_send_redirects ips_param_arr[35].ip_param_value #define ips_ipv6_ignore_redirect ips_param_arr[36].ip_param_value #define ips_ipv6_strict_dst_multihoming ips_param_arr[37].ip_param_value -#define ips_ip_ire_reclaim_fraction ips_param_arr[38].ip_param_value +#define ips_src_check ips_param_arr[38].ip_param_value #define ips_ipsec_policy_log_interval ips_param_arr[39].ip_param_value #define ips_pim_accept_clear_messages ips_param_arr[40].ip_param_value #define ips_ip_ndp_unsolicit_interval ips_param_arr[41].ip_param_value @@ -3055,21 +2922,37 @@ extern vmem_t 
*ip_minor_arena_la; /* Misc IP configuration knobs */ #define ips_ip_policy_mask ips_param_arr[44].ip_param_value -#define ips_ip_multirt_resolution_interval ips_param_arr[45].ip_param_value +#define ips_ip_ecmp_behavior ips_param_arr[45].ip_param_value #define ips_ip_multirt_ttl ips_param_arr[46].ip_param_value -#define ips_ip_multidata_outbound ips_param_arr[47].ip_param_value -#define ips_ip_ndp_defense_interval ips_param_arr[48].ip_param_value -#define ips_ip_max_temp_idle ips_param_arr[49].ip_param_value -#define ips_ip_max_temp_defend ips_param_arr[50].ip_param_value -#define ips_ip_max_defend ips_param_arr[51].ip_param_value -#define ips_ip_defend_interval ips_param_arr[52].ip_param_value -#define ips_ip_dup_recovery ips_param_arr[53].ip_param_value -#define ips_ip_restrict_interzone_loopback ips_param_arr[54].ip_param_value -#define ips_ip_lso_outbound ips_param_arr[55].ip_param_value -#define ips_igmp_max_version ips_param_arr[56].ip_param_value -#define ips_mld_max_version ips_param_arr[57].ip_param_value -#define ips_ip_pmtu_min ips_param_arr[58].ip_param_value -#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[59].ip_param_value +#define ips_ip_ire_badcnt_lifetime ips_param_arr[47].ip_param_value +#define ips_ip_max_temp_idle ips_param_arr[48].ip_param_value +#define ips_ip_max_temp_defend ips_param_arr[49].ip_param_value +#define ips_ip_max_defend ips_param_arr[50].ip_param_value +#define ips_ip_defend_interval ips_param_arr[51].ip_param_value +#define ips_ip_dup_recovery ips_param_arr[52].ip_param_value +#define ips_ip_restrict_interzone_loopback ips_param_arr[53].ip_param_value +#define ips_ip_lso_outbound ips_param_arr[54].ip_param_value +#define ips_igmp_max_version ips_param_arr[55].ip_param_value +#define ips_mld_max_version ips_param_arr[56].ip_param_value +#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[57].ip_param_value +#define ips_arp_probe_delay ips_param_arr[58].ip_param_value +#define ips_arp_fastprobe_delay 
ips_param_arr[59].ip_param_value +#define ips_arp_probe_interval ips_param_arr[60].ip_param_value +#define ips_arp_fastprobe_interval ips_param_arr[61].ip_param_value +#define ips_arp_probe_count ips_param_arr[62].ip_param_value +#define ips_arp_fastprobe_count ips_param_arr[63].ip_param_value +#define ips_ipv4_dad_announce_interval ips_param_arr[64].ip_param_value +#define ips_ipv6_dad_announce_interval ips_param_arr[65].ip_param_value +#define ips_arp_defend_interval ips_param_arr[66].ip_param_value +#define ips_arp_defend_rate ips_param_arr[67].ip_param_value +#define ips_ndp_defend_interval ips_param_arr[68].ip_param_value +#define ips_ndp_defend_rate ips_param_arr[69].ip_param_value +#define ips_arp_defend_period ips_param_arr[70].ip_param_value +#define ips_ndp_defend_period ips_param_arr[71].ip_param_value +#define ips_ipv4_icmp_return_pmtu ips_param_arr[72].ip_param_value +#define ips_ipv6_icmp_return_pmtu ips_param_arr[73].ip_param_value +#define ips_ip_arp_publish_count ips_param_arr[74].ip_param_value +#define ips_ip_arp_publish_interval ips_param_arr[75].ip_param_value extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST @@ -3102,13 +2985,13 @@ extern struct module_info ip_mod_info; ((ipst)->ips_ip4_loopback_out_event.he_interested) #define HOOKS6_INTERESTED_LOOPBACK_OUT(ipst) \ ((ipst)->ips_ip6_loopback_out_event.he_interested) - /* - * Hooks macros used inside of ip + * Hooks marcos used inside of ip + * The callers use the above INTERESTED macros first, hence + * the he_interested check is superflous. 
*/ -#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ - \ - if ((_hook).he_interested) { \ +#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \ + if ((_hook).he_interested) { \ hook_pkt_event_t info; \ \ _NOTE(CONSTCOND) \ @@ -3121,12 +3004,15 @@ extern struct module_info ip_mod_info; info.hpe_mp = &(_fm); \ info.hpe_mb = _m; \ info.hpe_flags = _llm; \ - if (hook_run(ipst->ips_ipv4_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ + _err = hook_run(ipst->ips_ipv4_net_data->netd_hooks, \ + _event, (hook_data_t)&info); \ + if (_err != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - freemsg(_fm); \ - _fm = NULL; \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3135,9 +3021,8 @@ extern struct module_info ip_mod_info; } \ } -#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ - \ - if ((_hook).he_interested) { \ +#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \ + if ((_hook).he_interested) { \ hook_pkt_event_t info; \ \ _NOTE(CONSTCOND) \ @@ -3150,12 +3035,15 @@ extern struct module_info ip_mod_info; info.hpe_mp = &(_fm); \ info.hpe_mb = _m; \ info.hpe_flags = _llm; \ - if (hook_run(ipst->ips_ipv6_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ + _err = hook_run(ipst->ips_ipv6_net_data->netd_hooks, \ + _event, (hook_data_t)&info); \ + if (_err != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - freemsg(_fm); \ - _fm = NULL; \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3194,24 +3082,6 @@ extern struct module_info ip_mod_info; #define IP_LOOPBACK_ADDR(addr) \ (((addr) & N_IN_CLASSA_NET == N_IN_LOOPBACK_NET)) -#ifdef DEBUG -/* IPsec HW acceleration debugging support */ - -#define IPSECHW_CAPAB 0x0001 /* 
capability negotiation */ -#define IPSECHW_SADB 0x0002 /* SADB exchange */ -#define IPSECHW_PKT 0x0004 /* general packet flow */ -#define IPSECHW_PKTIN 0x0008 /* driver in pkt processing details */ -#define IPSECHW_PKTOUT 0x0010 /* driver out pkt processing details */ - -#define IPSECHW_DEBUG(f, x) if (ipsechw_debug & (f)) { (void) printf x; } -#define IPSECHW_CALL(f, r, x) if (ipsechw_debug & (f)) { (void) r x; } - -extern uint32_t ipsechw_debug; -#else -#define IPSECHW_DEBUG(f, x) {} -#define IPSECHW_CALL(f, r, x) {} -#endif - extern int ip_debug; extern uint_t ip_thread_data; extern krwlock_t ip_thread_rwlock; @@ -3235,8 +3105,6 @@ extern list_t ip_thread_list; /* Default MAC-layer address string length for mac_colon_addr */ #define MAC_STR_LEN 128 -struct ipsec_out_s; - struct mac_header_info_s; extern void ill_frag_timer(void *); @@ -3252,86 +3120,173 @@ extern char *ip_dot_addr(ipaddr_t, char *); extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); extern boolean_t icmp_err_rate_limit(ip_stack_t *); -extern void icmp_time_exceeded(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -extern void icmp_unreachable(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -extern mblk_t *ip_add_info(mblk_t *, ill_t *, uint_t, zoneid_t, ip_stack_t *); -cred_t *ip_best_cred(mblk_t *, conn_t *, pid_t *); -extern mblk_t *ip_bind_v4(queue_t *, mblk_t *, conn_t *); -extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *); -extern int ip_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, - uint16_t, boolean_t); -extern int ip_proto_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, - uint16_t, boolean_t); -extern int ip_proto_bind_connected_v4(conn_t *, mblk_t **, - uint8_t, ipaddr_t *, uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t, - cred_t *); -extern int ip_bind_connected_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t *, - uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t, cred_t *); 
+extern void icmp_frag_needed(mblk_t *, int, ip_recv_attr_t *); +extern mblk_t *icmp_inbound_v4(mblk_t *, ip_recv_attr_t *); +extern void icmp_time_exceeded(mblk_t *, uint8_t, ip_recv_attr_t *); +extern void icmp_unreachable(mblk_t *, uint8_t, ip_recv_attr_t *); +extern boolean_t ip_ipsec_policy_inherit(conn_t *, conn_t *, ip_recv_attr_t *); +extern void *ip_pullup(mblk_t *, ssize_t, ip_recv_attr_t *); +extern void ip_setl2src(mblk_t *, ip_recv_attr_t *, ill_t *); +extern mblk_t *ip_check_and_align_header(mblk_t *, uint_t, ip_recv_attr_t *); +extern mblk_t *ip_check_length(mblk_t *, uchar_t *, ssize_t, uint_t, uint_t, + ip_recv_attr_t *); +extern mblk_t *ip_check_optlen(mblk_t *, ipha_t *, uint_t, uint_t, + ip_recv_attr_t *); +extern mblk_t *ip_fix_dbref(mblk_t *, ip_recv_attr_t *); extern uint_t ip_cksum(mblk_t *, int, uint32_t); extern int ip_close(queue_t *, int); extern uint16_t ip_csum_hdr(ipha_t *); -extern void ip_proto_not_sup(queue_t *, mblk_t *, uint_t, zoneid_t, - ip_stack_t *); +extern void ip_forward_xmit_v4(nce_t *, ill_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *, uint32_t, uint32_t); +extern boolean_t ip_forward_options(mblk_t *, ipha_t *, ill_t *, + ip_recv_attr_t *); +extern int ip_fragment_v4(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn, + uintptr_t *cookie); +extern void ip_proto_not_sup(mblk_t *, ip_recv_attr_t *); extern void ip_ire_g_fini(void); extern void ip_ire_g_init(void); extern void ip_ire_fini(ip_stack_t *); extern void ip_ire_init(ip_stack_t *); +extern void ip_mdata_to_mhi(ill_t *, mblk_t *, struct mac_header_info_s *); extern int ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); extern int ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); extern int ip_reassemble(mblk_t *, ipf_t *, uint_t, boolean_t, ill_t *, size_t); -extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t, - int, int, mblk_t *); extern void 
ip_rput(queue_t *, mblk_t *); extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, struct mac_header_info_s *); +extern void ip_input_v6(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *); +extern mblk_t *ip_input_common_v4(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *); +extern mblk_t *ip_input_common_v6(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *); +extern void ill_input_full_v4(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_short_v4(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_full_v6(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_short_v6(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern ipaddr_t ip_input_options(ipha_t *, ipaddr_t, mblk_t *, + ip_recv_attr_t *, int *); +extern boolean_t ip_input_local_options(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern mblk_t *ip_input_fragment(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern mblk_t *ip_input_fragment_v6(mblk_t *, ip6_t *, ip6_frag_t *, uint_t, + ip_recv_attr_t *); +extern void ip_input_post_ipsec(mblk_t *, ip_recv_attr_t *); +extern void ip_fanout_v4(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern void ip_fanout_v6(mblk_t *, ip6_t *, ip_recv_attr_t *); +extern void ip_fanout_proto_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *, + ip_recv_attr_t *); +extern void ip_fanout_proto_v4(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern void ip_fanout_send_icmp_v4(mblk_t *, uint_t, uint_t, + ip_recv_attr_t *); +extern void ip_fanout_udp_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *, + ip_recv_attr_t *); +extern void ip_fanout_udp_multi_v4(mblk_t *, ipha_t *, uint16_t, uint16_t, + ip_recv_attr_t *); +extern mblk_t *zero_spi_check(mblk_t *, ip_recv_attr_t *); +extern void ip_build_hdrs_v4(uchar_t *, uint_t, const ip_pkt_t *, uint8_t); +extern int ip_find_hdr_v4(ipha_t *, ip_pkt_t *, boolean_t); 
+extern int ip_total_hdrs_len_v4(const ip_pkt_t *); + extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *, mblk_t *, mblk_t **, uint_t *cnt); -extern void ip_rput_dlpi(queue_t *, mblk_t *); -extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); -extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *); +extern void ip_rput_dlpi(ill_t *, mblk_t *); +extern void ip_rput_notdata(ill_t *, mblk_t *); extern void ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *, mib2_ipIfStatsEntry_t *); extern void ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *, mib2_ipv6IfIcmpEntry_t *); -extern void ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *); -extern void ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *, - uint32_t); extern void ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *); extern ire_t *ip_check_multihome(void *, ire_t *, ill_t *); -extern void ip_setpktversion(conn_t *, boolean_t, boolean_t, ip_stack_t *); -extern void ip_trash_ire_reclaim(void *); -extern void ip_trash_timer_expire(void *); -extern void ip_wput(queue_t *, mblk_t *); -extern void ip_output(void *, mblk_t *, void *, int); -extern void ip_output_options(void *, mblk_t *, void *, int, - ip_opt_info_t *); - -extern void ip_wput_ire(queue_t *, mblk_t *, ire_t *, conn_t *, int, - zoneid_t); -extern void ip_wput_local(queue_t *, ill_t *, ipha_t *, mblk_t *, ire_t *, - int, zoneid_t); -extern void ip_wput_multicast(queue_t *, mblk_t *, ipif_t *, zoneid_t); -extern void ip_wput_nondata(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ip_send_potential_redirect_v4(mblk_t *, ipha_t *, ire_t *, + ip_recv_attr_t *); +extern int ip_set_destination_v4(ipaddr_t *, ipaddr_t, ipaddr_t, + ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t); +extern int ip_set_destination_v6(in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t); + +extern int ip_output_simple(mblk_t *, ip_xmit_attr_t *); +extern int 
ip_output_simple_v4(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_simple_v6(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_options(mblk_t *, ipha_t *, ip_xmit_attr_t *, + ill_t *); +extern void ip_output_local_options(ipha_t *, ip_stack_t *); + +extern ip_xmit_attr_t *conn_get_ixa(conn_t *, boolean_t); +extern ip_xmit_attr_t *conn_get_ixa_tryhard(conn_t *, boolean_t); +extern ip_xmit_attr_t *conn_replace_ixa(conn_t *, ip_xmit_attr_t *); +extern ip_xmit_attr_t *conn_get_ixa_exclusive(conn_t *); +extern ip_xmit_attr_t *ip_xmit_attr_duplicate(ip_xmit_attr_t *); +extern void ip_xmit_attr_replace_tsl(ip_xmit_attr_t *, ts_label_t *); +extern void ip_xmit_attr_restore_tsl(ip_xmit_attr_t *, cred_t *); +boolean_t ip_recv_attr_replace_label(ip_recv_attr_t *, ts_label_t *); +extern void ixa_inactive(ip_xmit_attr_t *); +extern void ixa_refrele(ip_xmit_attr_t *); +extern boolean_t ixa_check_drain_insert(conn_t *, ip_xmit_attr_t *); +extern void ixa_cleanup(ip_xmit_attr_t *); +extern void ira_cleanup(ip_recv_attr_t *, boolean_t); +extern void ixa_safe_copy(ip_xmit_attr_t *, ip_xmit_attr_t *); + +extern int conn_ip_output(mblk_t *, ip_xmit_attr_t *); +extern boolean_t ip_output_verify_local(ip_xmit_attr_t *); +extern mblk_t *ip_output_process_local(mblk_t *, ip_xmit_attr_t *, boolean_t, + boolean_t, conn_t *); + +extern int conn_opt_get(conn_opt_arg_t *, t_scalar_t, t_scalar_t, + uchar_t *); +extern int conn_opt_set(conn_opt_arg_t *, t_scalar_t, t_scalar_t, uint_t, + uchar_t *, boolean_t, cred_t *); +extern boolean_t conn_same_as_last_v4(conn_t *, sin_t *); +extern boolean_t conn_same_as_last_v6(conn_t *, sin6_t *); +extern int conn_update_label(const conn_t *, const ip_xmit_attr_t *, + const in6_addr_t *, ip_pkt_t *); + +extern int ip_opt_set_multicast_group(conn_t *, t_scalar_t, + uchar_t *, boolean_t, boolean_t); +extern int ip_opt_set_multicast_sources(conn_t *, t_scalar_t, + uchar_t *, boolean_t, boolean_t); +extern int conn_getsockname(conn_t *, struct sockaddr *, 
uint_t *); +extern int conn_getpeername(conn_t *, struct sockaddr *, uint_t *); + +extern int conn_build_hdr_template(conn_t *, uint_t, uint_t, + const in6_addr_t *, const in6_addr_t *, uint32_t); +extern mblk_t *conn_prepend_hdr(ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, uint8_t, uint32_t, uint_t, + mblk_t *, uint_t, uint_t, uint32_t *, int *); +extern void ip_attr_newdst(ip_xmit_attr_t *); +extern void ip_attr_nexthop(const ip_pkt_t *, const ip_xmit_attr_t *, + const in6_addr_t *, in6_addr_t *); +extern int conn_connect(conn_t *, iulp_t *, uint32_t); +extern int ip_attr_connect(const conn_t *, ip_xmit_attr_t *, + const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, in_port_t, + in6_addr_t *, iulp_t *, uint32_t); +extern int conn_inherit_parent(conn_t *, conn_t *); + +extern void conn_ixa_cleanup(conn_t *connp, void *arg); + +extern boolean_t conn_wantpacket(conn_t *, ip_recv_attr_t *, ipha_t *); +extern uint_t ip_type_v4(ipaddr_t, ip_stack_t *); +extern uint_t ip_type_v6(const in6_addr_t *, ip_stack_t *); + +extern void ip_wput_nondata(queue_t *, mblk_t *); extern void ip_wsrv(queue_t *); extern char *ip_nv_lookup(nv_t *, int); extern boolean_t ip_local_addr_ok_v6(const in6_addr_t *, const in6_addr_t *); extern boolean_t ip_remote_addr_ok_v6(const in6_addr_t *, const in6_addr_t *); extern ipaddr_t ip_massage_options(ipha_t *, netstack_t *); extern ipaddr_t ip_net_mask(ipaddr_t); -extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, - ip_stack_t *); -extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *, - boolean_t, conn_t *); -extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); +extern void arp_bringup_done(ill_t *, int); +extern void arp_replumb_done(ill_t *, int); extern struct qinit iprinitv6; -extern struct qinit ipwinitv6; extern void ipmp_init(ip_stack_t *); extern void ipmp_destroy(ip_stack_t *); @@ -3347,12 +3302,11 @@ extern ill_t 
*ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *); extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *); extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *); -extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *); extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *); -extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *, - boolean_t); +extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, + boolean_t, ipaddr_t, uchar_t *, size_t, uint16_t); extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *); extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *); @@ -3373,19 +3327,25 @@ extern ill_t *ipmp_ipif_bound_ill(const ipif_t *); extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); +extern boolean_t ipmp_packet_is_probe(mblk_t *, ill_t *); +extern ill_t *ipmp_ill_get_xmit_ill(ill_t *, boolean_t); +extern void ipmp_ncec_flush_nce(ncec_t *); +extern void ipmp_ncec_fastpath(ncec_t *, ill_t *); extern void conn_drain_insert(conn_t *, idl_tx_list_t *); +extern void conn_setqfull(conn_t *, boolean_t *); +extern void conn_clrqfull(conn_t *, boolean_t *); extern int conn_ipsec_length(conn_t *); -extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, - ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); -extern int ipsec_out_extra_length(mblk_t *); -extern int ipsec_in_extra_length(mblk_t *); -extern mblk_t *ipsec_in_alloc(boolean_t, netstack_t *); -extern boolean_t ipsec_in_is_secure(mblk_t *); -extern void ipsec_out_process(queue_t *, mblk_t *, ire_t *, uint_t); -extern void ipsec_out_to_in(mblk_t *); -extern void ip_fanout_proto_again(mblk_t *, ill_t *, 
ill_t *, ire_t *); +extern uint_t ip_get_pmtu(ip_xmit_attr_t *); +extern uint_t ip_get_base_mtu(ill_t *, ire_t *); +extern mblk_t *ip_output_attach_policy(mblk_t *, ipha_t *, ip6_t *, + const conn_t *, ip_xmit_attr_t *); +extern int ipsec_out_extra_length(ip_xmit_attr_t *); +extern int ipsec_out_process(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_post_ipsec(mblk_t *, ip_xmit_attr_t *); +extern void ipsec_out_to_in(ip_xmit_attr_t *, ill_t *ill, + ip_recv_attr_t *); extern void ire_cleanup(ire_t *); extern void ire_inactive(ire_t *); @@ -3407,14 +3367,13 @@ extern uint_t ip_srcid_find_addr(const in6_addr_t *, zoneid_t, netstack_t *); extern uint8_t ipoptp_next(ipoptp_t *); extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *); -extern int ip_opt_get_user(const ipha_t *, uchar_t *); +extern int ip_opt_get_user(conn_t *, uchar_t *); extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); extern void ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_quiesce_conn(conn_t *); extern void ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); -extern void ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipsq_t *); extern boolean_t ip_cmpbuf(const void *, uint_t, boolean_t, const void *, @@ -3425,32 +3384,36 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t); extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *); extern void conn_ioctl_cleanup(conn_t *); -extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *); - -struct tcp_stack; -extern void ip_xmit_reset_serialize(mblk_t *, int, zoneid_t, struct tcp_stack *, - conn_t *); - -struct multidata_s; -struct pdesc_s; - -extern mblk_t *ip_mdinfo_alloc(ill_mdt_capab_t *); -extern mblk_t *ip_mdinfo_return(ire_t *, conn_t *, char *, ill_mdt_capab_t *); -extern mblk_t 
*ip_lsoinfo_alloc(ill_lso_capab_t *); -extern mblk_t *ip_lsoinfo_return(ire_t *, conn_t *, char *, - ill_lso_capab_t *); -extern uint_t ip_md_cksum(struct pdesc_s *, int, uint_t); -extern boolean_t ip_md_addr_attr(struct multidata_s *, struct pdesc_s *, - const mblk_t *); -extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *, - uint32_t, uint32_t, uint32_t, uint32_t); -extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, - uint_t); + extern void ip_unbind(conn_t *); extern void tnet_init(void); extern void tnet_fini(void); +/* + * Hook functions to enable cluster networking + * On non-clustered systems these vectors must always be NULL. + */ +extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, void *args); +extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp, + void *args); +extern int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, + boolean_t is_outgoing, sa_family_t addr_family, uint8_t *laddrp, + in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args); +extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, + void *); +extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol, + uint8_t *ptr, size_t len, void *args); +extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol, + uint32_t spi, void *args); +extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol, + uint32_t spi, void *args); +extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, + sa_family_t, in6_addr_t, in6_addr_t, void *); + + /* Hooks for CGTP (multirt routes) filtering module */ #define CGTP_FILTER_REV_1 1 #define CGTP_FILTER_REV_2 2 @@ -3491,73 +3454,6 @@ extern int ip_cgtp_filter_register(netstackid_t, cgtp_filter_ops_t *); extern int ip_cgtp_filter_unregister(netstackid_t); extern int 
ip_cgtp_filter_is_registered(netstackid_t); -/* Flags for ire_multirt_lookup() */ - -#define MULTIRT_USESTAMP 0x0001 -#define MULTIRT_SETSTAMP 0x0002 -#define MULTIRT_CACHEGW 0x0004 - -/* Debug stuff for multirt route resolution. */ -#if defined(DEBUG) && !defined(__lint) -/* Our "don't send, rather drop" flag. */ -#define MULTIRT_DEBUG_FLAG 0x8000 - -#define MULTIRT_TRACE(x) ip2dbg(x) - -#define MULTIRT_DEBUG_TAG(mblk) \ - do { \ - ASSERT(mblk != NULL); \ - MULTIRT_TRACE(("%s[%d]: tagging mblk %p, tag was %d\n", \ - __FILE__, __LINE__, \ - (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \ - (mblk)->b_flag |= MULTIRT_DEBUG_FLAG; \ - } while (0) - -#define MULTIRT_DEBUG_UNTAG(mblk) \ - do { \ - ASSERT(mblk != NULL); \ - MULTIRT_TRACE(("%s[%d]: untagging mblk %p, tag was %d\n", \ - __FILE__, __LINE__, \ - (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \ - (mblk)->b_flag &= ~MULTIRT_DEBUG_FLAG; \ - } while (0) - -#define MULTIRT_DEBUG_TAGGED(mblk) \ - (((mblk)->b_flag & MULTIRT_DEBUG_FLAG) ? B_TRUE : B_FALSE) -#else -#define MULTIRT_DEBUG_TAG(mblk) ASSERT(mblk != NULL) -#define MULTIRT_DEBUG_UNTAG(mblk) ASSERT(mblk != NULL) -#define MULTIRT_DEBUG_TAGGED(mblk) B_FALSE -#endif - -/* - * Per-ILL Multidata Transmit capabilities. 
- */ -struct ill_mdt_capab_s { - uint_t ill_mdt_version; /* interface version */ - uint_t ill_mdt_on; /* on/off switch for MDT on this ILL */ - uint_t ill_mdt_hdr_head; /* leading header fragment extra space */ - uint_t ill_mdt_hdr_tail; /* trailing header fragment extra space */ - uint_t ill_mdt_max_pld; /* maximum payload buffers per Multidata */ - uint_t ill_mdt_span_limit; /* maximum payload span per packet */ -}; - -struct ill_hcksum_capab_s { - uint_t ill_hcksum_version; /* interface version */ - uint_t ill_hcksum_txflags; /* capabilities on transmit */ -}; - -struct ill_zerocopy_capab_s { - uint_t ill_zerocopy_version; /* interface version */ - uint_t ill_zerocopy_flags; /* capabilities */ -}; - -struct ill_lso_capab_s { - uint_t ill_lso_on; /* on/off switch for LSO on this ILL */ - uint_t ill_lso_flags; /* capabilities */ - uint_t ill_lso_max; /* maximum size of payload */ -}; - /* * rr_ring_state cycles in the order shown below from RR_FREE through * RR_FREE_IN_PROG and back to RR_FREE. 
@@ -3669,18 +3565,61 @@ extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_clean_all(ill_t *); +extern boolean_t ip_source_routed(ipha_t *, ip_stack_t *); extern void tcp_wput(queue_t *, mblk_t *); -extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t, - struct ip6_mtuinfo *, netstack_t *); -extern ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *); +extern int ip_fill_mtuinfo(conn_t *, ip_xmit_attr_t *, + struct ip6_mtuinfo *); extern hook_t *ipobs_register_hook(netstack_t *, pfv_t); extern void ipobs_unregister_hook(netstack_t *, hook_t *); extern void ipobs_hook(mblk_t *, int, zoneid_t, zoneid_t, const ill_t *, ip_stack_t *); typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); +extern void dce_g_init(void); +extern void dce_g_destroy(void); +extern void dce_stack_init(ip_stack_t *); +extern void dce_stack_destroy(ip_stack_t *); +extern void dce_cleanup(uint_t, ip_stack_t *); +extern dce_t *dce_get_default(ip_stack_t *); +extern dce_t *dce_lookup_pkt(mblk_t *, ip_xmit_attr_t *, uint_t *); +extern dce_t *dce_lookup_v4(ipaddr_t, ip_stack_t *, uint_t *); +extern dce_t *dce_lookup_v6(const in6_addr_t *, uint_t, ip_stack_t *, + uint_t *); +extern dce_t *dce_lookup_and_add_v4(ipaddr_t, ip_stack_t *); +extern dce_t *dce_lookup_and_add_v6(const in6_addr_t *, uint_t, + ip_stack_t *); +extern int dce_update_uinfo_v4(ipaddr_t, iulp_t *, ip_stack_t *); +extern int dce_update_uinfo_v6(const in6_addr_t *, uint_t, iulp_t *, + ip_stack_t *); +extern int dce_update_uinfo(const in6_addr_t *, uint_t, iulp_t *, + ip_stack_t *); +extern void dce_increment_generation(dce_t *); +extern void dce_increment_all_generations(boolean_t, ip_stack_t *); +extern void dce_refrele(dce_t *); +extern void dce_refhold(dce_t *); +extern void dce_refrele_notr(dce_t *); +extern void dce_refhold_notr(dce_t *); +mblk_t 
*ip_snmp_get_mib2_ip_dce(queue_t *, mblk_t *, ip_stack_t *ipst); + +extern ip_laddr_t ip_laddr_verify_v4(ipaddr_t, zoneid_t, + ip_stack_t *, boolean_t); +extern ip_laddr_t ip_laddr_verify_v6(const in6_addr_t *, zoneid_t, + ip_stack_t *, boolean_t, uint_t); +extern int ip_laddr_fanout_insert(conn_t *); + +extern boolean_t ip_verify_src(mblk_t *, ip_xmit_attr_t *, uint_t *); +extern int ip_verify_ire(mblk_t *, ip_xmit_attr_t *); + +extern mblk_t *ip_xmit_attr_to_mblk(ip_xmit_attr_t *); +extern boolean_t ip_xmit_attr_from_mblk(mblk_t *, ip_xmit_attr_t *); +extern mblk_t *ip_xmit_attr_free_mblk(mblk_t *); +extern mblk_t *ip_recv_attr_to_mblk(ip_recv_attr_t *); +extern boolean_t ip_recv_attr_from_mblk(mblk_t *, ip_recv_attr_t *); +extern mblk_t *ip_recv_attr_free_mblk(mblk_t *); +extern boolean_t ip_recv_attr_is_mblk(mblk_t *); + /* * Squeue tags. Tags only need to be unique when the callback function is the * same to distinguish between different calls, but we use unique tags for @@ -3729,16 +3668,8 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_CONNECT_FINISH 41 #define SQTAG_SYNCHRONOUS_OP 42 #define SQTAG_TCP_SHUTDOWN_OUTPUT 43 -#define SQTAG_XMIT_EARLY_RESET 44 - -#define NOT_OVER_IP(ip_wq) \ - (ip_wq->q_next != NULL || \ - (ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL || \ - strcmp(ip_wq->q_qinfo->qi_minfo->mi_idname, \ - IP_MOD_NAME) != 0 || \ - ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) +#define SQTAG_TCP_IXA_CLEANUP 44 -#define PROTO_FLOW_CNTRLD(connp) (connp->conn_flow_cntrld) #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c new file mode 100644 index 0000000000..a46d7c4cd0 --- /dev/null +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -0,0 +1,2933 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ucred.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> +#include <sys/policy.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/ethernet.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include 
<inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <netinet/udp.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +static sin_t sin_null; /* Zero address for quick clears */ +static sin6_t sin6_null; /* Zero address for quick clears */ + +/* + * Return how much size is needed for the different ancillary data items + */ +uint_t +conn_recvancillary_size(conn_t *connp, crb_t recv_ancillary, + ip_recv_attr_t *ira, mblk_t *mp, ip_pkt_t *ipp) +{ + uint_t ancil_size; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + /* + * If IP_RECVDSTADDR is set we include the destination IP + * address as an option. With IP_RECVOPTS we include all + * the IP options. 
+ */ + ancil_size = 0; + if (recv_ancillary.crb_recvdstaddr && + (ira->ira_flags & IRAF_IS_IPV4)) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in_addr); + IP_STAT(ipst, conn_in_recvdstaddr); + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in_pktinfo); + IP_STAT(ipst, conn_in_recvpktinfo); + } + + if ((recv_ancillary.crb_recvopts) && + (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) { + ancil_size += sizeof (struct T_opthdr) + + ipp->ipp_ipv4_options_len; + IP_STAT(ipst, conn_in_recvopts); + } + + if (recv_ancillary.crb_recvslla) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + ill_t *ill; + + /* Make sure ira_l2src is setup if not already */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) { + ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE, + ipst); + if (ill != NULL) { + ip_setl2src(mp, ira, ill); + ill_refrele(ill); + } + } + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct sockaddr_dl); + IP_STAT(ipst, conn_in_recvslla); + } + + if (recv_ancillary.crb_recvif) { + ancil_size += sizeof (struct T_opthdr) + sizeof (uint_t); + IP_STAT(ipst, conn_in_recvif); + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET6) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in6_pktinfo); + IP_STAT(ipst, conn_in_recvpktinfo); + } + + if (recv_ancillary.crb_ipv6_recvhoplimit) { + ancil_size += sizeof (struct T_opthdr) + sizeof (int); + IP_STAT(ipst, conn_in_recvhoplimit); + } + + if (recv_ancillary.crb_ipv6_recvtclass) { + ancil_size += sizeof (struct T_opthdr) + sizeof (int); + IP_STAT(ipst, conn_in_recvtclass); + } + + if (recv_ancillary.crb_ipv6_recvhopopts && + (ipp->ipp_fields & IPPF_HOPOPTS)) { + ancil_size += sizeof (struct T_opthdr) + 
ipp->ipp_hopoptslen; + IP_STAT(ipst, conn_in_recvhopopts); + } + /* + * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS + * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination + * options that appear before a routing header. + * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set. + */ + if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) { + if (recv_ancillary.crb_ipv6_recvrthdrdstopts || + (recv_ancillary.crb_ipv6_recvdstopts && + recv_ancillary.crb_ipv6_recvrthdr)) { + ancil_size += sizeof (struct T_opthdr) + + ipp->ipp_rthdrdstoptslen; + IP_STAT(ipst, conn_in_recvrthdrdstopts); + } + } + if ((recv_ancillary.crb_ipv6_recvrthdr) && + (ipp->ipp_fields & IPPF_RTHDR)) { + ancil_size += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; + IP_STAT(ipst, conn_in_recvrthdr); + } + if ((recv_ancillary.crb_ipv6_recvdstopts || + recv_ancillary.crb_old_ipv6_recvdstopts) && + (ipp->ipp_fields & IPPF_DSTOPTS)) { + ancil_size += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; + IP_STAT(ipst, conn_in_recvdstopts); + } + if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) { + ancil_size += sizeof (struct T_opthdr) + ucredsize; + IP_STAT(ipst, conn_in_recvucred); + } + + /* + * If SO_TIMESTAMP is set allocate the appropriate sized + * buffer. Since gethrestime() expects a pointer aligned + * argument, we allocate space necessary for extra + * alignment (even though it might not be used). + */ + if (recv_ancillary.crb_timestamp) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (timestruc_t) + _POINTER_ALIGNMENT; + IP_STAT(ipst, conn_in_timestamp); + } + + /* + * If IP_RECVTTL is set allocate the appropriate sized buffer + */ + if (recv_ancillary.crb_recvttl && + (ira->ira_flags & IRAF_IS_IPV4)) { + ancil_size += sizeof (struct T_opthdr) + sizeof (uint8_t); + IP_STAT(ipst, conn_in_recvttl); + } + + return (ancil_size); +} + +/* + * Lay down the ancillary data items at "ancil_buf". 
+ * Assumes caller has used conn_recvancillary_size to allocate a sufficiently + * large buffer - ancil_size. + */ +void +conn_recvancillary_add(conn_t *connp, crb_t recv_ancillary, + ip_recv_attr_t *ira, ip_pkt_t *ipp, uchar_t *ancil_buf, uint_t ancil_size) +{ + /* + * Copy in destination address before options to avoid + * any padding issues. + */ + if (recv_ancillary.crb_recvdstaddr && + (ira->ira_flags & IRAF_IS_IPV4)) { + struct T_opthdr *toh; + ipaddr_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVDSTADDR; + toh->len = sizeof (struct T_opthdr) + sizeof (ipaddr_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (ipaddr_t *)ancil_buf; + *dstptr = ipp->ipp_addr_v4; + ancil_buf += sizeof (ipaddr_t); + ancil_size -= toh->len; + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct T_opthdr *toh; + struct in_pktinfo *pktinfop; + ill_t *ill; + ipif_t *ipif; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_PKTINFO; + toh->len = sizeof (struct T_opthdr) + sizeof (*pktinfop); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + pktinfop = (struct in_pktinfo *)ancil_buf; + + pktinfop->ipi_ifindex = ira->ira_ruifindex; + pktinfop->ipi_spec_dst.s_addr = INADDR_ANY; + + /* Find a good address to report */ + ill = ill_lookup_on_ifindex(ira->ira_ruifindex, B_FALSE, ipst); + if (ill != NULL) { + ipif = ipif_good_addr(ill, IPCL_ZONEID(connp)); + if (ipif != NULL) { + pktinfop->ipi_spec_dst.s_addr = + ipif->ipif_lcl_addr; + ipif_refrele(ipif); + } + ill_refrele(ill); + } + pktinfop->ipi_addr.s_addr = ipp->ipp_addr_v4; + ancil_buf += sizeof (struct in_pktinfo); + ancil_size -= toh->len; + } + + if ((recv_ancillary.crb_recvopts) && + (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) { + struct 
T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_ipv4_options_len; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_ipv4_options, ancil_buf, + ipp->ipp_ipv4_options_len); + ancil_buf += ipp->ipp_ipv4_options_len; + ancil_size -= toh->len; + } + + if (recv_ancillary.crb_recvslla) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct T_opthdr *toh; + struct sockaddr_dl *dstptr; + ill_t *ill; + int alen = 0; + + ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE, ipst); + if (ill != NULL) + alen = ill->ill_phys_addr_length; + + /* + * For loopback multicast and broadcast the packet arrives + * with ira_ruifdex being the physical interface, but + * ira_l2src is all zero since ip_postfrag_loopback doesn't + * know our l2src. We don't report the address in that case. + */ + if (ira->ira_flags & IRAF_LOOPBACK) + alen = 0; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVSLLA; + toh->len = sizeof (struct T_opthdr) + + sizeof (struct sockaddr_dl); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (struct sockaddr_dl *)ancil_buf; + dstptr->sdl_family = AF_LINK; + dstptr->sdl_index = ira->ira_ruifindex; + if (ill != NULL) + dstptr->sdl_type = ill->ill_type; + else + dstptr->sdl_type = 0; + dstptr->sdl_nlen = 0; + dstptr->sdl_alen = alen; + dstptr->sdl_slen = 0; + bcopy(ira->ira_l2src, dstptr->sdl_data, alen); + ancil_buf += sizeof (struct sockaddr_dl); + ancil_size -= toh->len; + if (ill != NULL) + ill_refrele(ill); + } + + if (recv_ancillary.crb_recvif) { + struct T_opthdr *toh; + uint_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVIF; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (uint_t *)ancil_buf; + *dstptr = ira->ira_ruifindex; 
+ ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET6) { + struct T_opthdr *toh; + struct in6_pktinfo *pkti; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_PKTINFO; + toh->len = sizeof (struct T_opthdr) + sizeof (*pkti); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + pkti = (struct in6_pktinfo *)ancil_buf; + if (ira->ira_flags & IRAF_IS_IPV4) { + IN6_IPADDR_TO_V4MAPPED(ipp->ipp_addr_v4, + &pkti->ipi6_addr); + } else { + pkti->ipi6_addr = ipp->ipp_addr; + } + pkti->ipi6_ifindex = ira->ira_ruifindex; + + ancil_buf += sizeof (*pkti); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvhoplimit) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPLIMIT; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + *(uint_t *)ancil_buf = ipp->ipp_hoplimit; + ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvtclass) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_TCLASS; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + + if (ira->ira_flags & IRAF_IS_IPV4) + *(uint_t *)ancil_buf = ipp->ipp_type_of_service; + else + *(uint_t *)ancil_buf = ipp->ipp_tclass; + ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvhopopts && + (ipp->ipp_fields & IPPF_HOPOPTS)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + 
bcopy(ipp->ipp_hopopts, ancil_buf, ipp->ipp_hopoptslen); + ancil_buf += ipp->ipp_hopoptslen; + ancil_size -= toh->len; + } + /* + * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS + * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination + * options that appear before a routing header. + * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set. + */ + if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) { + if (recv_ancillary.crb_ipv6_recvrthdrdstopts || + (recv_ancillary.crb_ipv6_recvdstopts && + recv_ancillary.crb_ipv6_recvrthdr)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_DSTOPTS; + toh->len = sizeof (struct T_opthdr) + + ipp->ipp_rthdrdstoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_rthdrdstopts, ancil_buf, + ipp->ipp_rthdrdstoptslen); + ancil_buf += ipp->ipp_rthdrdstoptslen; + ancil_size -= toh->len; + } + } + if (recv_ancillary.crb_ipv6_recvrthdr && + (ipp->ipp_fields & IPPF_RTHDR)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_RTHDR; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_rthdr, ancil_buf, ipp->ipp_rthdrlen); + ancil_buf += ipp->ipp_rthdrlen; + ancil_size -= toh->len; + } + if ((recv_ancillary.crb_ipv6_recvdstopts || + recv_ancillary.crb_old_ipv6_recvdstopts) && + (ipp->ipp_fields & IPPF_DSTOPTS)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_DSTOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_dstopts, ancil_buf, ipp->ipp_dstoptslen); + ancil_buf += ipp->ipp_dstoptslen; + ancil_size -= toh->len; + } + + if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) { + struct T_opthdr *toh; + cred_t *rcr = 
connp->conn_cred; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = SOL_SOCKET; + toh->name = SCM_UCRED; + toh->len = sizeof (struct T_opthdr) + ucredsize; + toh->status = 0; + (void) cred2ucred(ira->ira_cred, ira->ira_cpid, &toh[1], rcr); + ancil_buf += toh->len; + ancil_size -= toh->len; + } + if (recv_ancillary.crb_timestamp) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = SOL_SOCKET; + toh->name = SCM_TIMESTAMP; + toh->len = sizeof (struct T_opthdr) + + sizeof (timestruc_t) + _POINTER_ALIGNMENT; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + /* Align for gethrestime() */ + ancil_buf = (uchar_t *)P2ROUNDUP((intptr_t)ancil_buf, + sizeof (intptr_t)); + gethrestime((timestruc_t *)ancil_buf); + ancil_buf = (uchar_t *)toh + toh->len; + ancil_size -= toh->len; + } + + /* + * CAUTION: + * Due to aligment issues + * Processing of IP_RECVTTL option + * should always be the last. Adding + * any option processing after this will + * cause alignment panic. + */ + if (recv_ancillary.crb_recvttl && + (ira->ira_flags & IRAF_IS_IPV4)) { + struct T_opthdr *toh; + uint8_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVTTL; + toh->len = sizeof (struct T_opthdr) + sizeof (uint8_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (uint8_t *)ancil_buf; + *dstptr = ipp->ipp_hoplimit; + ancil_buf += sizeof (uint8_t); + ancil_size -= toh->len; + } + + /* Consumed all of allocated space */ + ASSERT(ancil_size == 0); + +} + +/* + * This routine retrieves the current status of socket options. + * It returns the size of the option retrieved, or -1. 
+ */ +int +conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, + uchar_t *ptr) +{ + int *i1 = (int *)ptr; + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t len; + + ASSERT(MUTEX_HELD(&coa->coa_connp->conn_lock)); + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_DEBUG: + *i1 = connp->conn_debug ? SO_DEBUG : 0; + break; /* goto sizeof (int) option return */ + case SO_KEEPALIVE: + *i1 = connp->conn_keepalive ? SO_KEEPALIVE : 0; + break; + case SO_LINGER: { + struct linger *lgr = (struct linger *)ptr; + + lgr->l_onoff = connp->conn_linger ? SO_LINGER : 0; + lgr->l_linger = connp->conn_lingertime; + } + return (sizeof (struct linger)); + + case SO_OOBINLINE: + *i1 = connp->conn_oobinline ? SO_OOBINLINE : 0; + break; + case SO_REUSEADDR: + *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; + break; /* goto sizeof (int) option return */ + case SO_TYPE: + *i1 = connp->conn_so_type; + break; /* goto sizeof (int) option return */ + case SO_DONTROUTE: + *i1 = (ixa->ixa_flags & IXAF_DONTROUTE) ? + SO_DONTROUTE : 0; + break; /* goto sizeof (int) option return */ + case SO_USELOOPBACK: + *i1 = connp->conn_useloopback ? SO_USELOOPBACK : 0; + break; /* goto sizeof (int) option return */ + case SO_BROADCAST: + *i1 = connp->conn_broadcast ? SO_BROADCAST : 0; + break; /* goto sizeof (int) option return */ + + case SO_SNDBUF: + *i1 = connp->conn_sndbuf; + break; /* goto sizeof (int) option return */ + case SO_RCVBUF: + *i1 = connp->conn_rcvbuf; + break; /* goto sizeof (int) option return */ + case SO_RCVTIMEO: + case SO_SNDTIMEO: + /* + * Pass these two options in order for third part + * protocol usage. Here just return directly. + */ + *i1 = 0; + break; + case SO_DGRAM_ERRIND: + *i1 = connp->conn_dgram_errind ? 
SO_DGRAM_ERRIND : 0; + break; /* goto sizeof (int) option return */ + case SO_RECVUCRED: + *i1 = connp->conn_recv_ancillary.crb_recvucred; + break; /* goto sizeof (int) option return */ + case SO_TIMESTAMP: + *i1 = connp->conn_recv_ancillary.crb_timestamp; + break; /* goto sizeof (int) option return */ +#ifdef SO_VRRP + case SO_VRRP: + *i1 = connp->conn_isvrrp; + break; /* goto sizeof (int) option return */ +#endif + case SO_ANON_MLP: + *i1 = connp->conn_anon_mlp; + break; /* goto sizeof (int) option return */ + case SO_MAC_EXEMPT: + *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); + break; /* goto sizeof (int) option return */ + case SO_MAC_IMPLICIT: + *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); + break; /* goto sizeof (int) option return */ + case SO_ALLZONES: + *i1 = connp->conn_allzones; + break; /* goto sizeof (int) option return */ + case SO_EXCLBIND: + *i1 = connp->conn_exclbind ? SO_EXCLBIND : 0; + break; + case SO_PROTOTYPE: + *i1 = connp->conn_proto; + break; + + case SO_DOMAIN: + *i1 = connp->conn_family; + break; + default: + return (-1); + } + break; + case IPPROTO_IP: + if (connp->conn_family != AF_INET) + return (-1); + switch (name) { + case IP_OPTIONS: + case T_IP_OPTIONS: + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (0); + + len = ipp->ipp_ipv4_options_len; + if (len > 0) { + bcopy(ipp->ipp_ipv4_options, ptr, len); + } + return (len); + + case IP_PKTINFO: { + /* + * This also handles IP_RECVPKTINFO. + * IP_PKTINFO and IP_RECVPKTINFO have same value. + * Differentiation is based on the size of the + * argument passed in. + */ + struct in_pktinfo *pktinfo; + +#ifdef notdef + /* optcom doesn't provide a length with "get" */ + if (inlen == sizeof (int)) { + /* This is IP_RECVPKTINFO option. */ + *i1 = connp->conn_recv_ancillary. + crb_ip_recvpktinfo; + return (sizeof (int)); + } +#endif + /* XXX assumes that caller has room for max size! 
*/ + + pktinfo = (struct in_pktinfo *)ptr; + pktinfo->ipi_ifindex = ixa->ixa_ifindex; + if (ipp->ipp_fields & IPPF_ADDR) + pktinfo->ipi_spec_dst.s_addr = ipp->ipp_addr_v4; + else + pktinfo->ipi_spec_dst.s_addr = INADDR_ANY; + return (sizeof (struct in_pktinfo)); + } + case IP_DONTFRAG: + *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0; + return (sizeof (int)); + case IP_TOS: + case T_IP_TOS: + *i1 = (int)ipp->ipp_type_of_service; + break; /* goto sizeof (int) option return */ + case IP_TTL: + *i1 = (int)ipp->ipp_unicast_hops; + break; /* goto sizeof (int) option return */ + case IP_DHCPINIT_IF: + return (-1); + case IP_NEXTHOP: + if (ixa->ixa_flags & IXAF_NEXTHOP_SET) { + *(ipaddr_t *)ptr = ixa->ixa_nexthop_v4; + return (sizeof (ipaddr_t)); + } else { + return (0); + } + + case IP_MULTICAST_IF: + /* 0 address if not set */ + *(ipaddr_t *)ptr = ixa->ixa_multicast_ifaddr; + return (sizeof (ipaddr_t)); + case IP_MULTICAST_TTL: + *(uchar_t *)ptr = ixa->ixa_multicast_ttl; + return (sizeof (uchar_t)); + case IP_MULTICAST_LOOP: + *ptr = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 
1 : 0; + return (sizeof (uint8_t)); + case IP_RECVOPTS: + *i1 = connp->conn_recv_ancillary.crb_recvopts; + break; /* goto sizeof (int) option return */ + case IP_RECVDSTADDR: + *i1 = connp->conn_recv_ancillary.crb_recvdstaddr; + break; /* goto sizeof (int) option return */ + case IP_RECVIF: + *i1 = connp->conn_recv_ancillary.crb_recvif; + break; /* goto sizeof (int) option return */ + case IP_RECVSLLA: + *i1 = connp->conn_recv_ancillary.crb_recvslla; + break; /* goto sizeof (int) option return */ + case IP_RECVTTL: + *i1 = connp->conn_recv_ancillary.crb_recvttl; + break; /* goto sizeof (int) option return */ + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + /* cannot "get" the value for these */ + return (-1); + case MRT_VERSION: + case MRT_ASSERT: + (void) ip_mrouter_get(name, connp, ptr); + return (sizeof (int)); + case IP_SEC_OPT: + return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr, + IPSEC_AF_V4)); + case IP_BOUND_IF: + /* Zero if not set */ + *i1 = connp->conn_bound_if; + break; /* goto sizeof (int) option return */ + case IP_UNSPEC_SRC: + *i1 = connp->conn_unspec_src; + break; /* goto sizeof (int) option return */ + case IP_BROADCAST_TTL: + if (ixa->ixa_flags & IXAF_BROADCAST_TTL_SET) + *(uchar_t *)ptr = ixa->ixa_broadcast_ttl; + else + *(uchar_t *)ptr = ipst->ips_ip_broadcast_ttl; + return (sizeof (uchar_t)); + default: + return (-1); + } + break; + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) + return (-1); + switch (name) { + case IPV6_UNICAST_HOPS: + *i1 = (int)ipp->ipp_unicast_hops; + break; /* goto sizeof (int) option return */ + case 
IPV6_MULTICAST_IF: + /* 0 index if not set */ + *i1 = ixa->ixa_multicast_ifindex; + break; /* goto sizeof (int) option return */ + case IPV6_MULTICAST_HOPS: + *i1 = ixa->ixa_multicast_ttl; + break; /* goto sizeof (int) option return */ + case IPV6_MULTICAST_LOOP: + *i1 = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 1 : 0; + break; /* goto sizeof (int) option return */ + case IPV6_JOIN_GROUP: + case IPV6_LEAVE_GROUP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + /* cannot "get" the value for these */ + return (-1); + case IPV6_BOUND_IF: + /* Zero if not set */ + *i1 = connp->conn_bound_if; + break; /* goto sizeof (int) option return */ + case IPV6_UNSPEC_SRC: + *i1 = connp->conn_unspec_src; + break; /* goto sizeof (int) option return */ + case IPV6_RECVPKTINFO: + *i1 = connp->conn_recv_ancillary.crb_ip_recvpktinfo; + break; /* goto sizeof (int) option return */ + case IPV6_RECVTCLASS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvtclass; + break; /* goto sizeof (int) option return */ + case IPV6_RECVPATHMTU: + *i1 = connp->conn_ipv6_recvpathmtu; + break; /* goto sizeof (int) option return */ + case IPV6_RECVHOPLIMIT: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhoplimit; + break; /* goto sizeof (int) option return */ + case IPV6_RECVHOPOPTS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhopopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVDSTOPTS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvdstopts; + break; /* goto sizeof (int) option return */ + case _OLD_IPV6_RECVDSTOPTS: + *i1 = + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVRTHDRDSTOPTS: + *i1 = connp->conn_recv_ancillary. 
+ crb_ipv6_recvrthdrdstopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVRTHDR: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvrthdr; + break; /* goto sizeof (int) option return */ + case IPV6_PKTINFO: { + /* XXX assumes that caller has room for max size! */ + struct in6_pktinfo *pkti; + + pkti = (struct in6_pktinfo *)ptr; + pkti->ipi6_ifindex = ixa->ixa_ifindex; + if (ipp->ipp_fields & IPPF_ADDR) + pkti->ipi6_addr = ipp->ipp_addr; + else + pkti->ipi6_addr = ipv6_all_zeros; + return (sizeof (struct in6_pktinfo)); + } + case IPV6_TCLASS: + *i1 = ipp->ipp_tclass; + break; /* goto sizeof (int) option return */ + case IPV6_NEXTHOP: { + sin6_t *sin6 = (sin6_t *)ptr; + + if (ixa->ixa_flags & IXAF_NEXTHOP_SET) + return (0); + + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = ixa->ixa_nexthop_v6; + + return (sizeof (sin6_t)); + } + case IPV6_HOPOPTS: + if (!(ipp->ipp_fields & IPPF_HOPOPTS)) + return (0); + bcopy(ipp->ipp_hopopts, ptr, + ipp->ipp_hopoptslen); + return (ipp->ipp_hopoptslen); + case IPV6_RTHDRDSTOPTS: + if (!(ipp->ipp_fields & IPPF_RTHDRDSTOPTS)) + return (0); + bcopy(ipp->ipp_rthdrdstopts, ptr, + ipp->ipp_rthdrdstoptslen); + return (ipp->ipp_rthdrdstoptslen); + case IPV6_RTHDR: + if (!(ipp->ipp_fields & IPPF_RTHDR)) + return (0); + bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); + return (ipp->ipp_rthdrlen); + case IPV6_DSTOPTS: + if (!(ipp->ipp_fields & IPPF_DSTOPTS)) + return (0); + bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); + return (ipp->ipp_dstoptslen); + case IPV6_PATHMTU: + return (ip_fill_mtuinfo(connp, ixa, + (struct ip6_mtuinfo *)ptr)); + case IPV6_SEC_OPT: + return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr, + IPSEC_AF_V6)); + case IPV6_SRC_PREFERENCES: + return (ip6_get_src_preferences(ixa, (uint32_t *)ptr)); + case IPV6_DONTFRAG: + *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0; + return (sizeof (int)); + case IPV6_USE_MIN_MTU: + if (ixa->ixa_flags & IXAF_USE_MIN_MTU) + *i1 = 
ixa->ixa_use_min_mtu; + else + *i1 = IPV6_USE_MIN_MTU_MULTICAST; + break; + case IPV6_V6ONLY: + *i1 = connp->conn_ipv6_v6only; + return (sizeof (int)); + default: + return (-1); + } + break; + case IPPROTO_UDP: + switch (name) { + case UDP_ANONPRIVBIND: + *i1 = connp->conn_anon_priv_bind; + break; + case UDP_EXCLBIND: + *i1 = connp->conn_exclbind ? UDP_EXCLBIND : 0; + break; + default: + return (-1); + } + break; + case IPPROTO_TCP: + switch (name) { + case TCP_RECVDSTADDR: + *i1 = connp->conn_recv_ancillary.crb_recvdstaddr; + break; + case TCP_ANONPRIVBIND: + *i1 = connp->conn_anon_priv_bind; + break; + case TCP_EXCLBIND: + *i1 = connp->conn_exclbind ? TCP_EXCLBIND : 0; + break; + default: + return (-1); + } + break; + default: + return (-1); + } + return (sizeof (int)); +} + +static int conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); + +/* + * This routine sets the most common socket options including some + * that are transport/ULP specific. + * It returns errno or zero. + * + * For fixed length options, there is no sanity check + * of passed in length is done. It is assumed *_optcom_req() + * routines do the right thing. 
+ */ +int +conn_opt_set(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); + + /* We have different functions for different levels */ + switch (level) { + case SOL_SOCKET: + return (conn_opt_set_socket(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_IP: + return (conn_opt_set_ip(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_IPV6: + return (conn_opt_set_ipv6(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_UDP: + return (conn_opt_set_udp(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_TCP: + return (conn_opt_set_tcp(coa, name, inlen, invalp, + checkonly, cr)); + default: + return (0); + } +} + +/* + * Handle SOL_SOCKET + * Note that we do not handle SO_PROTOTYPE here. The ULPs that support + * it implement their own checks and setting of conn_proto. + */ +/* ARGSUSED1 */ +static int +conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + + switch (name) { + case SO_ALLZONES: + if (IPCL_IS_BOUND(connp)) + return (EINVAL); + break; +#ifdef SO_VRRP + case SO_VRRP: + if (secpolicy_ip_config(cr, checkonly) != 0) + return (EACCES); + break; +#endif + case SO_MAC_EXEMPT: + if (secpolicy_net_mac_aware(cr) != 0) + return (EACCES); + if (IPCL_IS_BOUND(connp)) + return (EINVAL); + break; + case SO_MAC_IMPLICIT: + if (secpolicy_net_mac_implicit(cr) != 0) + return (EACCES); + break; + } + if (checkonly) + return (0); + + mutex_enter(&connp->conn_lock); + /* Here we set the actual option value */ + switch (name) { + case SO_DEBUG: + connp->conn_debug = onoff; + break; + case SO_KEEPALIVE: + connp->conn_keepalive = onoff; + break; + case SO_LINGER: { + struct linger *lgr = (struct linger *)invalp; + + if (lgr->l_onoff) { + connp->conn_linger = 1; + connp->conn_lingertime = lgr->l_linger; + } else { + connp->conn_linger = 0; + connp->conn_lingertime = 0; + } + break; + } + case SO_OOBINLINE: + connp->conn_oobinline = onoff; + coa->coa_changed |= COA_OOBINLINE_CHANGED; + break; + case SO_REUSEADDR: + connp->conn_reuseaddr = onoff; + break; + case SO_DONTROUTE: + if (onoff) + ixa->ixa_flags |= IXAF_DONTROUTE; + else + ixa->ixa_flags &= ~IXAF_DONTROUTE; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case SO_USELOOPBACK: + connp->conn_useloopback = onoff; + break; + case SO_BROADCAST: + connp->conn_broadcast = onoff; + break; + case SO_SNDBUF: + /* ULP has range checked the value */ + connp->conn_sndbuf = *i1; + coa->coa_changed |= COA_SNDBUF_CHANGED; + break; + case SO_RCVBUF: + /* ULP has range checked the value */ + connp->conn_rcvbuf = *i1; + coa->coa_changed |= COA_RCVBUF_CHANGED; + break; + case SO_RCVTIMEO: + case SO_SNDTIMEO: + /* + * Pass these two options in order for third part + * protocol usage. 
+ */ + break; + case SO_DGRAM_ERRIND: + connp->conn_dgram_errind = onoff; + break; + case SO_RECVUCRED: + connp->conn_recv_ancillary.crb_recvucred = onoff; + break; + case SO_ALLZONES: + connp->conn_allzones = onoff; + coa->coa_changed |= COA_ROUTE_CHANGED; + if (onoff) + ixa->ixa_zoneid = ALL_ZONES; + else + ixa->ixa_zoneid = connp->conn_zoneid; + break; + case SO_TIMESTAMP: + connp->conn_recv_ancillary.crb_timestamp = onoff; + break; +#ifdef SO_VRRP + case SO_VRRP: + connp->conn_isvrrp = onoff; + break; +#endif + case SO_ANON_MLP: + connp->conn_anon_mlp = onoff; + break; + case SO_MAC_EXEMPT: + connp->conn_mac_mode = onoff ? + CONN_MAC_AWARE : CONN_MAC_DEFAULT; + break; + case SO_MAC_IMPLICIT: + connp->conn_mac_mode = onoff ? + CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT; + break; + case SO_EXCLBIND: + connp->conn_exclbind = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +/* Handle IPPROTO_IP */ +static int +conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + ipaddr_t addr = (ipaddr_t)*i1; + uint_t ifindex; + zoneid_t zoneid = IPCL_ZONEID(connp); + ipif_t *ipif; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error; + + if (connp->conn_family != AF_INET) + return (EINVAL); + + switch (name) { + case IP_TTL: + /* Don't allow zero */ + if (*i1 < 1 || *i1 > 255) + return (EINVAL); + break; + case IP_MULTICAST_IF: + if (addr == INADDR_ANY) { + /* Clear */ + ifindex = 0; + break; + } + ipif = ipif_lookup_addr(addr, NULL, zoneid, ipst); + if (ipif == NULL) + return (EHOSTUNREACH); + /* not supported by the virtual network iface */ + if (IS_VNI(ipif->ipif_ill)) { + ipif_refrele(ipif); + return (EINVAL); + } + ifindex = ipif->ipif_ill->ill_phyint->phyint_ifindex; + ipif_refrele(ipif); + break; + case IP_NEXTHOP: { + ire_t *ire; + + if (addr == INADDR_ANY) { + /* Clear */ + break; + } + /* Verify that the next-hop is on-link */ + ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_ONLINK, NULL, zoneid, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL) + return (EHOSTUNREACH); + ire_refrele(ire); + break; + } + case IP_OPTIONS: + case T_IP_OPTIONS: { + uint_t newlen; + + if (ipp->ipp_fields & IPPF_LABEL_V4) + newlen = inlen + (ipp->ipp_label_len_v4 + 3) & ~3; + else + newlen = inlen; + if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) { + return (EINVAL); + } + break; + } + case IP_PKTINFO: { + struct in_pktinfo *pktinfo; + + /* Two different valid lengths */ + if (inlen != sizeof (int) && + inlen != sizeof (struct in_pktinfo)) + return (EINVAL); + if (inlen == sizeof (int)) + break; + + pktinfo = (struct in_pktinfo *)invalp; + if (pktinfo->ipi_spec_dst.s_addr != INADDR_ANY) { + switch (ip_laddr_verify_v4(pktinfo->ipi_spec_dst.s_addr, + zoneid, ipst, B_FALSE)) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + break; + default: + return (EADDRNOTAVAIL); + } + } + if (!ip_ifindex_valid(pktinfo->ipi_ifindex, B_FALSE, ipst)) + return (ENXIO); + break; + } + case IP_BOUND_IF: + ifindex = *(uint_t *)i1; + + /* 
Just check it is ok. */ + if (!ip_ifindex_valid(ifindex, B_FALSE, ipst)) + return (ENXIO); + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + /* + * conn_lock protects the bitfields, and is used to + * set the fields atomically. Not needed for ixa settings since + * the caller has an exclusive copy of the ixa. + * We can not hold conn_lock across the multicast options though. + */ + switch (name) { + case IP_OPTIONS: + case T_IP_OPTIONS: + /* Save options for use by IP. */ + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_ipv4_options, + &ipp->ipp_ipv4_options_len); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_ipv4_options_len == 0) { + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; + } else { + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + + case IP_TTL: + mutex_enter(&connp->conn_lock); + ipp->ipp_unicast_hops = *i1; + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IP_TOS: + case T_IP_TOS: + mutex_enter(&connp->conn_lock); + if (*i1 == -1) { + ipp->ipp_type_of_service = 0; + } else { + ipp->ipp_type_of_service = *i1; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IP_MULTICAST_IF: + ixa->ixa_multicast_ifindex = ifindex; + ixa->ixa_multicast_ifaddr = addr; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_MULTICAST_TTL: + ixa->ixa_multicast_ttl = *invalp; + /* Handled automatically by ip_output */ + break; + case IP_MULTICAST_LOOP: + if (*invalp != 0) + ixa->ixa_flags |= IXAF_MULTICAST_LOOP; + else + ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP; + /* Handled automatically by ip_output */ + break; + case IP_RECVOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvopts = onoff; + 
mutex_exit(&connp->conn_lock); + break; + case IP_RECVDSTADDR: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvdstaddr = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVIF: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvif = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVSLLA: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvslla = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVTTL: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvttl = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_PKTINFO: { + /* + * This also handles IP_RECVPKTINFO. + * IP_PKTINFO and IP_RECVPKTINFO have same value. + * Differentiation is based on the size of the + * argument passed in. + */ + struct in_pktinfo *pktinfo; + + if (inlen == sizeof (int)) { + /* This is IP_RECVPKTINFO option. */ + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ip_recvpktinfo = + onoff; + mutex_exit(&connp->conn_lock); + break; + } + + /* This is IP_PKTINFO option. 
*/ + mutex_enter(&connp->conn_lock); + pktinfo = (struct in_pktinfo *)invalp; + if (ipp->ipp_addr_v4 != INADDR_ANY) { + ipp->ipp_fields |= IPPF_ADDR; + IN6_INADDR_TO_V4MAPPED(&pktinfo->ipi_spec_dst, + &ipp->ipp_addr); + } else { + ipp->ipp_fields &= ~IPPF_ADDR; + ipp->ipp_addr = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + ixa->ixa_ifindex = pktinfo->ipi_ifindex; + coa->coa_changed |= COA_ROUTE_CHANGED; + coa->coa_changed |= COA_HEADER_CHANGED; + break; + } + case IP_DONTFRAG: + if (onoff) { + ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; + } else { + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + } + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + return (ip_opt_set_multicast_group(connp, name, + invalp, B_FALSE, checkonly)); + + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + return (ip_opt_set_multicast_sources(connp, name, + invalp, B_FALSE, checkonly)); + + case IP_SEC_OPT: + mutex_enter(&connp->conn_lock); + error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + /* This is an IPsec policy change - redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_NEXTHOP: + ixa->ixa_nexthop_v4 = addr; + if (addr != INADDR_ANY) + ixa->ixa_flags |= IXAF_NEXTHOP_SET; + else + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IP_BOUND_IF: + ixa->ixa_ifindex = ifindex; /* Send */ + mutex_enter(&connp->conn_lock); + connp->conn_incoming_ifindex = ifindex; /* Receive */ + 
connp->conn_bound_if = ifindex; /* getsockopt */ + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_UNSPEC_SRC: + mutex_enter(&connp->conn_lock); + connp->conn_unspec_src = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + else + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + + mutex_exit(&connp->conn_lock); + break; + case IP_BROADCAST_TTL: + ixa->ixa_broadcast_ttl = *invalp; + ixa->ixa_flags |= IXAF_BROADCAST_TTL_SET; + /* Handled automatically by ip_output */ + break; + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_ASSERT: + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + error = ip_mrouter_set((int)name, connp, checkonly, + (uchar_t *)invalp, inlen); + if (error) { + return (error); + } + return (0); + + } + return (0); +} + +/* Handle IPPROTO_IPV6 */ +static int +conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + uint_t ifindex; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error; + + if (connp->conn_family != AF_INET6) + return (EINVAL); + + switch (name) { + case IPV6_MULTICAST_IF: + /* + * The only possible error is EINVAL. + * We call this option on both V4 and V6 + * If both fail, then this call returns + * EINVAL. If at least one of them succeeds we + * return success. + */ + ifindex = *(uint_t *)i1; + + if (!ip_ifindex_valid(ifindex, B_TRUE, ipst) && + !ip_ifindex_valid(ifindex, B_FALSE, ipst)) + return (EINVAL); + break; + case IPV6_UNICAST_HOPS: + /* Don't allow zero. 
-1 means to use default */ + if (*i1 < -1 || *i1 == 0 || *i1 > IPV6_MAX_HOPS) + return (EINVAL); + break; + case IPV6_MULTICAST_HOPS: + /* -1 means use default */ + if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) + return (EINVAL); + break; + case IPV6_MULTICAST_LOOP: + if (*i1 != 0 && *i1 != 1) + return (EINVAL); + break; + case IPV6_BOUND_IF: + ifindex = *(uint_t *)i1; + + if (!ip_ifindex_valid(ifindex, B_TRUE, ipst)) + return (ENXIO); + break; + case IPV6_PKTINFO: { + struct in6_pktinfo *pkti; + boolean_t isv6; + + if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) + return (EINVAL); + if (inlen == 0) + break; /* Clear values below */ + + /* + * Verify the source address and ifindex. Privileged users + * can use any source address. + */ + pkti = (struct in6_pktinfo *)invalp; + + /* + * For link-local addresses we use the ipi6_ifindex when + * we verify the local address. + * If net_rawaccess then any source address can be used. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) && + secpolicy_net_rawaccess(cr) != 0) { + uint_t scopeid = 0; + in6_addr_t *v6src = &pkti->ipi6_addr; + ipaddr_t v4src; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; + + if (IN6_IS_ADDR_V4MAPPED(v6src)) { + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, + zoneid, ipst, B_FALSE); + } + } else { + if (IN6_IS_ADDR_LINKSCOPE(v6src)) + scopeid = pkti->ipi6_ifindex; + + laddr_type = ip_laddr_verify_v6(v6src, zoneid, + ipst, B_FALSE, scopeid); + } + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + break; + default: + return (EADDRNOTAVAIL); + } + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + } else if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) { + /* Allow any source */ + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + } + isv6 = !(IN6_IS_ADDR_V4MAPPED(&pkti->ipi6_addr)); + if (!ip_ifindex_valid(pkti->ipi6_ifindex, isv6, ipst)) + return (ENXIO); + break; + } + case IPV6_HOPLIMIT: + /* It is only allowed as ancilary data */ + 
if (!coa->coa_ancillary) + return (EINVAL); + + if (inlen != 0 && inlen != sizeof (int)) + return (EINVAL); + if (inlen == sizeof (int)) { + if (*i1 > 255 || *i1 < -1 || *i1 == 0) + return (EINVAL); + } + break; + case IPV6_TCLASS: + if (inlen != 0 && inlen != sizeof (int)) + return (EINVAL); + if (inlen == sizeof (int)) { + if (*i1 > 255 || *i1 < -1) + return (EINVAL); + } + break; + case IPV6_NEXTHOP: + if (inlen != 0 && inlen != sizeof (sin6_t)) + return (EINVAL); + if (inlen == sizeof (sin6_t)) { + sin6_t *sin6 = (sin6_t *)invalp; + ire_t *ire; + + if (sin6->sin6_family != AF_INET6) + return (EAFNOSUPPORT); + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + + /* Verify that the next-hop is on-link */ + ire = ire_ftable_lookup_v6(&sin6->sin6_addr, + 0, 0, IRE_ONLINK, NULL, zoneid, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL) + return (EHOSTUNREACH); + ire_refrele(ire); + break; + } + break; + case IPV6_RTHDR: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_HOPOPTS: { + /* All have the length field in the same place */ + ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; + /* + * Sanity checks - minimum size, size a multiple of + * eight bytes, and matching size passed in. + */ + if (inlen != 0 && + inlen != (8 * (hopts->ip6h_len + 1))) + return (EINVAL); + break; + } + case IPV6_PATHMTU: + /* Can't be set */ + return (EINVAL); + + case IPV6_USE_MIN_MTU: + if (inlen != sizeof (int)) + return (EINVAL); + if (*i1 < -1 || *i1 > 1) + return (EINVAL); + break; + case IPV6_SRC_PREFERENCES: + if (inlen != sizeof (uint32_t)) + return (EINVAL); + break; + case IPV6_V6ONLY: + if (*i1 < 0 || *i1 > 1) { + return (EINVAL); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + /* + * conn_lock protects the bitfields, and is used to + * set the fields atomically. Not needed for ixa settings since + * the caller has an exclusive copy of the ixa. 
+ * We can not hold conn_lock across the multicast options though. + */ + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); + switch (name) { + case IPV6_MULTICAST_IF: + ixa->ixa_multicast_ifindex = ifindex; + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_UNICAST_HOPS: + /* -1 means use default */ + mutex_enter(&connp->conn_lock); + if (*i1 == -1) { + ipp->ipp_unicast_hops = connp->conn_default_ttl; + } else { + ipp->ipp_unicast_hops = (uint8_t)*i1; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_MULTICAST_HOPS: + /* -1 means use default */ + if (*i1 == -1) { + ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + } else { + ixa->ixa_multicast_ttl = (uint8_t)*i1; + } + /* Handled automatically by ip_output */ + break; + case IPV6_MULTICAST_LOOP: + if (*i1 != 0) + ixa->ixa_flags |= IXAF_MULTICAST_LOOP; + else + ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP; + /* Handled automatically by ip_output */ + break; + case IPV6_JOIN_GROUP: + case IPV6_LEAVE_GROUP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + return (ip_opt_set_multicast_group(connp, name, + invalp, B_TRUE, checkonly)); + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + return (ip_opt_set_multicast_sources(connp, name, + invalp, B_TRUE, checkonly)); + + case IPV6_BOUND_IF: + ixa->ixa_ifindex = ifindex; /* Send */ + mutex_enter(&connp->conn_lock); + connp->conn_incoming_ifindex = ifindex; /* Receive */ + connp->conn_bound_if = ifindex; /* getsockopt */ + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_UNSPEC_SRC: + mutex_enter(&connp->conn_lock); + connp->conn_unspec_src = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + else + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVPKTINFO: + mutex_enter(&connp->conn_lock); + 
connp->conn_recv_ancillary.crb_ip_recvpktinfo = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVTCLASS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvtclass = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVPATHMTU: + mutex_enter(&connp->conn_lock); + connp->conn_ipv6_recvpathmtu = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVHOPLIMIT: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvhoplimit = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVHOPOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvhopopts = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvdstopts = onoff; + mutex_exit(&connp->conn_lock); + break; + case _OLD_IPV6_RECVDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVRTHDRDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVRTHDR: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvrthdr = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_PKTINFO: + mutex_enter(&connp->conn_lock); + if (inlen == 0) { + ipp->ipp_fields &= ~IPPF_ADDR; + ipp->ipp_addr = ipv6_all_zeros; + ixa->ixa_ifindex = 0; + } else { + struct in6_pktinfo *pkti; + + pkti = (struct in6_pktinfo *)invalp; + ipp->ipp_addr = pkti->ipi6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) + ipp->ipp_fields |= IPPF_ADDR; + else + ipp->ipp_fields &= ~IPPF_ADDR; + ixa->ixa_ifindex = pkti->ipi6_ifindex; + } + mutex_exit(&connp->conn_lock); + /* Source and ifindex might have changed */ + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + 
case IPV6_HOPLIMIT: + mutex_enter(&connp->conn_lock); + if (inlen == 0 || *i1 == -1) { + /* Revert to default */ + ipp->ipp_fields &= ~IPPF_HOPLIMIT; + ixa->ixa_flags &= ~IXAF_NO_TTL_CHANGE; + } else { + ipp->ipp_hoplimit = *i1; + ipp->ipp_fields |= IPPF_HOPLIMIT; + /* Ensure that it sticks for multicast packets */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_TCLASS: + /* + * IPV6_TCLASS accepts -1 as use kernel default + * and [0, 255] as the actualy traffic class. + */ + mutex_enter(&connp->conn_lock); + if (inlen == 0 || *i1 == -1) { + ipp->ipp_tclass = 0; + ipp->ipp_fields &= ~IPPF_TCLASS; + } else { + ipp->ipp_tclass = *i1; + ipp->ipp_fields |= IPPF_TCLASS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_NEXTHOP: + if (inlen == 0) { + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + } else { + sin6_t *sin6 = (sin6_t *)invalp; + + ixa->ixa_nexthop_v6 = sin6->sin6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&ixa->ixa_nexthop_v6)) + ixa->ixa_flags |= IXAF_NEXTHOP_SET; + else + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + } + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_HOPOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_hopopts, &ipp->ipp_hopoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_hopoptslen == 0) { + ipp->ipp_fields &= ~IPPF_HOPOPTS; + } else { + ipp->ipp_fields |= IPPF_HOPOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_RTHDRDSTOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_rthdrdstopts, + &ipp->ipp_rthdrdstoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_rthdrdstoptslen == 0) { + ipp->ipp_fields &= 
~IPPF_RTHDRDSTOPTS; + } else { + ipp->ipp_fields |= IPPF_RTHDRDSTOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_DSTOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_dstopts, &ipp->ipp_dstoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_dstoptslen == 0) { + ipp->ipp_fields &= ~IPPF_DSTOPTS; + } else { + ipp->ipp_fields |= IPPF_DSTOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_RTHDR: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_rthdr, &ipp->ipp_rthdrlen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_rthdrlen == 0) { + ipp->ipp_fields &= ~IPPF_RTHDR; + } else { + ipp->ipp_fields |= IPPF_RTHDR; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + + case IPV6_DONTFRAG: + if (onoff) { + ixa->ixa_flags |= IXAF_DONTFRAG; + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; + } else { + ixa->ixa_flags &= ~IXAF_DONTFRAG; + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + } + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IPV6_USE_MIN_MTU: + ixa->ixa_flags |= IXAF_USE_MIN_MTU; + ixa->ixa_use_min_mtu = *i1; + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IPV6_SEC_OPT: + mutex_enter(&connp->conn_lock); + error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + /* This is an IPsec policy change - redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_SRC_PREFERENCES: + /* + * This socket option only affects connected + * 
sockets that haven't already bound to a specific + * IPv6 address. In other words, sockets that + * don't call bind() with an address other than the + * unspecified address and that call connect(). + * ip_set_destination_v6() passes these preferences + * to the ipif_select_source_v6() function. + */ + mutex_enter(&connp->conn_lock); + error = ip6_set_src_preferences(ixa, *(uint32_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + break; + case IPV6_V6ONLY: + mutex_enter(&connp->conn_lock); + connp->conn_ipv6_v6only = onoff; + mutex_exit(&connp->conn_lock); + break; + } + return (0); +} + +/* Handle IPPROTO_UDP */ +/* ARGSUSED1 */ +static int +conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; + + switch (name) { + case UDP_ANONPRIVBIND: + if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_UDP)) != 0) { + return (error); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + mutex_enter(&connp->conn_lock); + switch (name) { + case UDP_ANONPRIVBIND: + connp->conn_anon_priv_bind = onoff; + break; + case UDP_EXCLBIND: + connp->conn_exclbind = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +/* Handle IPPROTO_TCP */ +/* ARGSUSED1 */ +static int +conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + int error; + + switch (name) { + case TCP_ANONPRIVBIND: + if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_TCP)) != 0) { + return (error); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + mutex_enter(&connp->conn_lock); + switch (name) { + case TCP_ANONPRIVBIND: + connp->conn_anon_priv_bind = onoff; + break; + case TCP_EXCLBIND: + connp->conn_exclbind = onoff; + break; + case TCP_RECVDSTADDR: + connp->conn_recv_ancillary.crb_recvdstaddr = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +int +conn_getsockname(conn_t *connp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin; + sin6_t *sin6; + + if (connp->conn_family == AF_INET) { + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + /* Fill zeroes and then initialize non-zero fields */ + sin = (sin_t *)sa; + *sin = sin_null; + sin->sin_family = AF_INET; + if (!IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + sin->sin_addr.s_addr = connp->conn_saddr_v4; + } else { + /* + * INADDR_ANY + * conn_saddr is not set, we might be bound to + * broadcast/multicast. Use conn_bound_addr as + * local address instead (that could + * also still be INADDR_ANY) + */ + sin->sin_addr.s_addr = connp->conn_bound_addr_v4; + } + sin->sin_port = connp->conn_lport; + } else { + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + /* Fill zeroes and then initialize non-zero fields */ + sin6 = (sin6_t *)sa; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + sin6->sin6_addr = connp->conn_saddr_v6; + } else { + /* + * conn_saddr is not set, we might be bound to + * broadcast/multicast. 
Use conn_bound_addr as + * local address instead (which could + * also still be unspecified) + */ + sin6->sin6_addr = connp->conn_bound_addr_v6; + } + sin6->sin6_port = connp->conn_lport; + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } + return (0); +} + +int +conn_getpeername(conn_t *connp, struct sockaddr *sa, uint_t *salenp) +{ + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + + if (connp->conn_family == AF_INET) { + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + /* initialize */ + sin = (sin_t *)sa; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = connp->conn_faddr_v4; + sin->sin_port = connp->conn_fport; + } else { + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + /* initialize */ + sin6 = (sin6_t *)sa; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = connp->conn_faddr_v6; + sin6->sin6_port = connp->conn_fport; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } + return (0); +} + +static uint32_t cksum_massage_options_v4(ipha_t *, netstack_t *); +static uint32_t cksum_massage_options_v6(ip6_t *, uint_t, netstack_t *); + +/* + * Allocate and fill in conn_ht_iphc based on the current information + * in the conn. + * Normally used when we bind() and connect(). + * Returns failure if can't allocate memory, or if there is a problem + * with a routing header/option. + * + * We allocate space for the transport header (ulp_hdr_len + extra) and + * indicate the offset of the ulp header by setting ixa_ip_hdr_length. + * The extra is there for transports that want some spare room for future + * options. 
conn_ht_iphc_allocated is what was allocated; conn_ht_iphc_len + * excludes the extra part. + * + * We massage an routing option/header and store the ckecksum difference + * in conn_sum. + * + * Caller needs to update conn_wroff if desired. + */ +int +conn_build_hdr_template(conn_t *connp, uint_t ulp_hdr_length, uint_t extra, + const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo) +{ + ip_xmit_attr_t *ixa = connp->conn_ixa; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ip_hdr_length = ip_total_hdrs_len_v4(ipp); + /* In case of TX label and IP options it can be too much */ + if (ip_hdr_length > IP_MAX_HDR_LENGTH) { + /* Preserves existing TX errno for this */ + return (EHOSTUNREACH); + } + } else { + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + } + ixa->ixa_ip_hdr_length = ip_hdr_length; + hdrs_len = ip_hdr_length + ulp_hdr_length + extra; + ASSERT(hdrs_len != 0); + + if (hdrs_len != connp->conn_ht_iphc_allocated) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); + if (hdrs == NULL) + return (ENOMEM); + + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, + connp->conn_ht_iphc_allocated); + } + connp->conn_ht_iphc = hdrs; + connp->conn_ht_iphc_allocated = hdrs_len; + } else { + hdrs = connp->conn_ht_iphc; + } + hdrs_len -= extra; + connp->conn_ht_iphc_len = hdrs_len; + + connp->conn_ht_ulp = hdrs + ip_hdr_length; + connp->conn_ht_ulp_len = ulp_hdr_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)hdrs; + + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst); + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto); + ipha->ipha_length = htons(hdrs_len); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + 
ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + connp->conn_sum = cksum_massage_options_v4(ipha, + connp->conn_netstack); + } else { + connp->conn_sum = 0; + } + } else { + ip6_t *ip6h = (ip6_t *)hdrs; + + ip6h->ip6_src = *v6src; + ip6h->ip6_dst = *v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto, + flowinfo); + ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + connp->conn_sum = cksum_massage_options_v6(ip6h, + ip_hdr_length, connp->conn_netstack); + + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) + return (EADDRNOTAVAIL); + + } else { + connp->conn_sum = 0; + } + } + return (0); +} + +/* + * Prepend a header template to data_mp based on the ip_pkt_t + * and the passed in source, destination and protocol. + * + * Returns failure if can't allocate memory, in which case data_mp is freed. + * We allocate space for the transport header (ulp_hdr_len) and + * indicate the offset of the ulp header by setting ixa_ip_hdr_length. + * + * We massage an routing option/header and return the ckecksum difference + * in *sump. This is in host byte order. + * + * Caller needs to update conn_wroff if desired. 
+ */ +mblk_t * +conn_prepend_hdr(ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, + uint8_t protocol, uint32_t flowinfo, uint_t ulp_hdr_length, mblk_t *data_mp, + uint_t data_length, uint_t wroff_extra, uint32_t *sump, int *errorp) +{ + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + mblk_t *mp; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ip_hdr_length = ip_total_hdrs_len_v4(ipp); + ASSERT(ip_hdr_length <= IP_MAX_HDR_LENGTH); + } else { + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + } + hdrs_len = ip_hdr_length + ulp_hdr_length; + ASSERT(hdrs_len != 0); + + ixa->ixa_ip_hdr_length = ip_hdr_length; + + /* Can we prepend to data_mp? */ + if (data_mp != NULL && + data_mp->b_rptr - data_mp->b_datap->db_base >= hdrs_len && + data_mp->b_datap->db_ref == 1) { + hdrs = data_mp->b_rptr - hdrs_len; + data_mp->b_rptr = hdrs; + mp = data_mp; + } else { + mp = allocb(hdrs_len + wroff_extra, BPRI_MED); + if (mp == NULL) { + freemsg(data_mp); + *errorp = ENOMEM; + return (NULL); + } + mp->b_wptr = mp->b_datap->db_lim; + hdrs = mp->b_rptr = mp->b_wptr - hdrs_len; + mp->b_cont = data_mp; + } + + /* + * Set the source in the header. ip_build_hdrs_v4/v6 will overwrite it + * if PKTINFO (aka IPPF_ADDR) was set. 
+ */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)hdrs; + + ASSERT(IN6_IS_ADDR_V4MAPPED(v6dst)); + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst); + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, protocol); + ipha->ipha_length = htons(hdrs_len + data_length); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + *sump = cksum_massage_options_v4(ipha, + ixa->ixa_ipst->ips_netstack); + } else { + *sump = 0; + } + } else { + ip6_t *ip6h = (ip6_t *)hdrs; + + ip6h->ip6_src = *v6src; + ip6h->ip6_dst = *v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, protocol, flowinfo); + ip6h->ip6_plen = htons(hdrs_len + data_length - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + *sump = cksum_massage_options_v6(ip6h, + ip_hdr_length, ixa->ixa_ipst->ips_netstack); + + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { + *errorp = EADDRNOTAVAIL; + freemsg(mp); + return (NULL); + } + } else { + *sump = 0; + } + } + return (mp); +} + +/* + * Massage a source route if any putting the first hop + * in ipha_dst. Compute a starting value for the checksum which + * takes into account that the original ipha_dst should be + * included in the checksum but that IP will include the + * first hop from the source route in the tcp checksum. 
+ */ +static uint32_t +cksum_massage_options_v4(ipha_t *ipha, netstack_t *ns) +{ + in_addr_t dst; + uint32_t cksum; + + /* Get last hop then diff against first hop */ + cksum = ip_massage_options(ipha, ns); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + dst = ipha->ipha_dst; + cksum -= ((dst >> 16) + (dst & 0xffff)); + if ((int)cksum < 0) + cksum--; + cksum = (cksum & 0xFFFF) + (cksum >> 16); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + ASSERT(cksum < 0x10000); + return (ntohs(cksum)); +} + +static uint32_t +cksum_massage_options_v6(ip6_t *ip6h, uint_t ip_hdr_len, netstack_t *ns) +{ + uint8_t *end; + ip6_rthdr_t *rth; + uint32_t cksum; + + end = (uint8_t *)ip6h + ip_hdr_len; + rth = ip_find_rthdr_v6(ip6h, end); + if (rth == NULL) + return (0); + + cksum = ip_massage_options_v6(ip6h, rth, ns); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + ASSERT(cksum < 0x10000); + return (ntohs(cksum)); +} + +/* + * ULPs that change the destination address need to call this for each + * change to discard any state about a previous destination that might + * have been multicast or multirt. + */ +void +ip_attr_newdst(ip_xmit_attr_t *ixa) +{ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM | + IXAF_NO_TTL_CHANGE | IXAF_IPV6_ADD_FRAGHDR | + IXAF_NO_LOOP_ZONEID_SET); +} + +/* + * Determine the nexthop which will be used. + * Normally this is just the destination, but if a IPv4 source route, or + * IPv6 routing header, is in the ip_pkt_t then we extract the nexthop from + * there. 
+ */ +void +ip_attr_nexthop(const ip_pkt_t *ipp, const ip_xmit_attr_t *ixa, + const in6_addr_t *dst, in6_addr_t *nexthop) +{ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipaddr_t v4dst; + ipaddr_t v4nexthop; + + IN6_V4MAPPED_TO_IPADDR(dst, v4dst); + v4nexthop = ip_pkt_source_route_v4(ipp); + if (v4nexthop == INADDR_ANY) + v4nexthop = v4dst; + + IN6_IPADDR_TO_V4MAPPED(v4nexthop, nexthop); + } else { + const in6_addr_t *v6nexthop; + + v6nexthop = ip_pkt_source_route_v6(ipp); + if (v6nexthop == NULL) + v6nexthop = dst; + + *nexthop = *v6nexthop; + } +} + +/* + * Update the ip_xmit_attr_t based the addresses, conn_xmit_ipp and conn_ixa. + * If IPDF_IPSEC is set we cache the IPsec policy to handle the unconnected + * case (connected latching is done in conn_connect). + * Note that IPsec policy lookup requires conn_proto and conn_laddr to be + * set, but doesn't otherwise use the conn_t. + * + * Caller must set/clear IXAF_IS_IPV4 as appropriately. + * Caller must use ip_attr_nexthop() to determine the nexthop argument. + * + * The caller must NOT hold conn_lock (to avoid problems with ill_refrele + * causing the squeue to run doing ipcl_walk grabbing conn_lock.) + * + * Updates laddrp and uinfo if they are non-NULL. + * + * TSOL notes: The callers if ip_attr_connect must check if the destination + * is different than before and in that case redo conn_update_label. + * The callers of conn_connect do not need that since conn_connect + * performs the conn_update_label. + */ +int +ip_attr_connect(const conn_t *connp, ip_xmit_attr_t *ixa, + const in6_addr_t *v6src, const in6_addr_t *v6dst, + const in6_addr_t *v6nexthop, in_port_t dstport, in6_addr_t *laddrp, + iulp_t *uinfo, uint32_t flags) +{ + in6_addr_t laddr = *v6src; + int error; + + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + + if (connp->conn_zone_is_global) + flags |= IPDF_ZONE_IS_GLOBAL; + else + flags &= ~IPDF_ZONE_IS_GLOBAL; + + /* + * Lookup the route to determine a source address and the uinfo. 
+ * If the ULP has a source route option then the caller will + * have set v6nexthop to be the first hop. + */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipaddr_t v4dst; + ipaddr_t v4src, v4nexthop; + + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + IN6_V4MAPPED_TO_IPADDR(v6nexthop, v4nexthop); + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); + + if (connp->conn_unspec_src || v4src != INADDR_ANY) + flags &= ~IPDF_SELECT_SRC; + else + flags |= IPDF_SELECT_SRC; + + error = ip_set_destination_v4(&v4src, v4dst, v4nexthop, ixa, + uinfo, flags, connp->conn_mac_mode); + IN6_IPADDR_TO_V4MAPPED(v4src, &laddr); + } else { + if (connp->conn_unspec_src || !IN6_IS_ADDR_UNSPECIFIED(v6src)) + flags &= ~IPDF_SELECT_SRC; + else + flags |= IPDF_SELECT_SRC; + + error = ip_set_destination_v6(&laddr, v6dst, v6nexthop, ixa, + uinfo, flags, connp->conn_mac_mode); + } + /* Pass out some address even if we hit a RTF_REJECT etc */ + if (laddrp != NULL) + *laddrp = laddr; + + if (error != 0) + return (error); + + if (flags & IPDF_IPSEC) { + /* + * Set any IPsec policy in ixa. Routine also looks at ULP + * ports. + */ + ipsec_cache_outbound_policy(connp, v6src, v6dst, dstport, ixa); + } + return (0); +} + +/* + * Connect the conn based on the addresses, conn_xmit_ipp and conn_ixa. + * Assumes that conn_faddr and conn_fport are already set. As such it is not + * usable for SCTP, since SCTP has multiple faddrs. + * + * Caller must hold conn_lock to provide atomic constency between the + * conn_t's addresses and the ixa. + * NOTE: this function drops and reaquires conn_lock since it can't be + * held across ip_attr_connect/ip_set_destination. + * + * The caller needs to handle inserting in the receive-side fanout when + * appropriate after conn_connect returns. 
+ */ +int +conn_connect(conn_t *connp, iulp_t *uinfo, uint32_t flags) +{ + ip_xmit_attr_t *ixa = connp->conn_ixa; + in6_addr_t nexthop; + in6_addr_t saddr, faddr; + in_port_t fport; + int error; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + if (connp->conn_ipversion == IPV4_VERSION) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + /* We do IPsec latching below - hence no caching in ip_attr_connect */ + flags &= ~IPDF_IPSEC; + + /* In case we had previously done an ip_attr_connect */ + ip_attr_newdst(ixa); + + /* + * Determine the nexthop and copy the addresses before dropping + * conn_lock. + */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + + mutex_exit(&connp->conn_lock); + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, fport, + &saddr, uinfo, flags | IPDF_VERIFY_DST); + mutex_enter(&connp->conn_lock); + + /* Could have changed even if an error */ + connp->conn_saddr_v6 = saddr; + if (error != 0) + return (error); + + /* + * Check whether Trusted Solaris policy allows communication with this + * host, and pretend that the destination is unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template() takes ipp_label_v4/v6 to form + * the packet. + * + * TSOL Note: Any concurrent threads would pick a different ixa + * (and ipp if they are to change the ipp) so we + * don't have to worry about concurrent threads. + */ + if (is_system_labeled()) { + if (connp->conn_mlp_type != mlptSingle) + return (ECONNREFUSED); + + /* + * conn_update_label will set ipp_label* which will later + * be used by conn_build_hdr_template. + */ + error = conn_update_label(connp, ixa, + &connp->conn_faddr_v6, &connp->conn_xmit_ipp); + if (error != 0) + return (error); + } + + /* + * Ensure that we match on the selected local address. 
+ * This overrides conn_laddr in the case we had earlier bound to a + * multicast or broadcast address. + */ + connp->conn_laddr_v6 = connp->conn_saddr_v6; + + /* + * Allow setting new policies. + * The addresses/ports are already set, thus the IPsec policy calls + * can handle their passed-in conn's. + */ + connp->conn_policy_cached = B_FALSE; + + /* + * Cache IPsec policy in this conn. If we have per-socket policy, + * we'll cache that. If we don't, we'll inherit global policy. + * + * This is done before the caller inserts in the receive-side fanout. + * Note that conn_policy_cached is set by ipsec_conn_cache_policy() even + * for connections where we don't have a policy. This is to prevent + * global policy lookups in the inbound path. + * + * If we insert before we set conn_policy_cached, + * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true + * because global policy cound be non-empty. We normally call + * ipsec_check_policy() for conn_policy_cached connections only if + * conn_in_enforce_policy is set. But in this case, + * conn_policy_cached can get set anytime since we made the + * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is + * called, which will make the above assumption false. Thus, we + * need to insert after we set conn_policy_cached. + */ + error = ipsec_conn_cache_policy(connp, + connp->conn_ipversion == IPV4_VERSION); + if (error != 0) + return (error); + + /* + * We defer to do LSO check until here since now we have better idea + * whether IPsec is present. If the underlying ill is LSO capable, + * copy its capability in so the ULP can decide whether to enable LSO + * on this connection. So far, only TCP/IPv4 is implemented, so won't + * claim LSO for IPv6. + * + * Currently, won't enable LSO for IRE_LOOPBACK or IRE_LOCAL, because + * the receiver can not handle it. Also not to enable LSO for MULTIRT. 
+ */ + ixa->ixa_flags &= ~IXAF_LSO_CAPAB; + + ASSERT(ixa->ixa_ire != NULL); + if (ixa->ixa_ipst->ips_ip_lso_outbound && (flags & IPDF_LSO) && + !(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + (ixa->ixa_nce != NULL) && + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + ILL_LSO_TCP_IPV4_USABLE(ixa->ixa_nce->nce_ill) : + ILL_LSO_TCP_IPV6_USABLE(ixa->ixa_nce->nce_ill))) { + ixa->ixa_lso_capab = *ixa->ixa_nce->nce_ill->ill_lso_capab; + ixa->ixa_flags |= IXAF_LSO_CAPAB; + } + + /* Check whether ZEROCOPY capability is usable for this connection. */ + ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; + + if ((flags & IPDF_ZCOPY) && + !(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + (ixa->ixa_nce != NULL) && + ILL_ZCOPY_USABLE(ixa->ixa_nce->nce_ill)) { + ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; + } + return (0); +} + +/* + * Predicates to check if the addresses match conn_last* + */ + +/* + * Compare the conn against an address. + * If using mapped addresses on AF_INET6 sockets, use the _v6 function + */ +boolean_t +conn_same_as_last_v4(conn_t *connp, sin_t *sin) +{ + ASSERT(connp->conn_family == AF_INET); + return (sin->sin_addr.s_addr == connp->conn_v4lastdst && + sin->sin_port == connp->conn_lastdstport); +} + +/* + * Compare, including for mapped addresses + */ +boolean_t +conn_same_as_last_v6(conn_t *connp, sin6_t *sin6) +{ + return (IN6_ARE_ADDR_EQUAL(&connp->conn_v6lastdst, &sin6->sin6_addr) && + sin6->sin6_port == connp->conn_lastdstport && + sin6->sin6_flowinfo == connp->conn_lastflowinfo && + sin6->sin6_scope_id == connp->conn_lastscopeid); +} + +/* + * Compute a label and place it in the ip_packet_t. + * Handles IPv4 and IPv6. + * The caller should have a correct ixa_tsl and ixa_zoneid and have + * already called conn_connect or ip_attr_connect to ensure that tsol_check_dest + * has been called. 
+ */ +int +conn_update_label(const conn_t *connp, const ip_xmit_attr_t *ixa, + const in6_addr_t *v6dst, ip_pkt_t *ipp) +{ + int err; + ipaddr_t v4dst; + + if (IN6_IS_ADDR_V4MAPPED(v6dst)) { + uchar_t opt_storage[IP_MAX_OPT_LENGTH]; + + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + + err = tsol_compute_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, + v4dst, opt_storage, ixa->ixa_ipst); + if (err == 0) { + /* Length contained in opt_storage[IPOPT_OLEN] */ + err = optcom_pkt_set(opt_storage, + opt_storage[IPOPT_OLEN], + (uchar_t **)&ipp->ipp_label_v4, + &ipp->ipp_label_len_v4); + } + if (err != 0) { + DTRACE_PROBE4(tx__ip__log__info__updatelabel, + char *, "conn(1) failed to update options(2) " + "on ixa(3)", + conn_t *, connp, char *, opt_storage, + ip_xmit_attr_t *, ixa); + } + if (ipp->ipp_label_len_v4 != 0) + ipp->ipp_fields |= IPPF_LABEL_V4; + else + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } else { + uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; + uint_t optlen; + + err = tsol_compute_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, + v6dst, opt_storage, ixa->ixa_ipst); + if (err == 0) { + /* + * Note that ipp_label_v6 is just the option - not + * the hopopts extension header. + * + * Length contained in opt_storage[IPOPT_OLEN], but + * that doesn't include the two byte options header. + */ + optlen = opt_storage[IPOPT_OLEN]; + if (optlen != 0) + optlen += 2; + + err = optcom_pkt_set(opt_storage, optlen, + (uchar_t **)&ipp->ipp_label_v6, + &ipp->ipp_label_len_v6); + } + if (err != 0) { + DTRACE_PROBE4(tx__ip__log__info__updatelabel, + char *, "conn(1) failed to update options(2) " + "on ixa(3)", + conn_t *, connp, char *, opt_storage, + ip_xmit_attr_t *, ixa); + } + if (ipp->ipp_label_len_v6 != 0) + ipp->ipp_fields |= IPPF_LABEL_V6; + else + ipp->ipp_fields &= ~IPPF_LABEL_V6; + } + return (err); +} + +/* + * Inherit all options settings from the parent/listener to the eager. + * Returns zero on success; ENOMEM if memory allocation failed. 
+ * + * We assume that the eager has not had any work done i.e., the conn_ixa + * and conn_xmit_ipp are all zero. + * Furthermore we assume that no other thread can access the eager (because + * it isn't inserted in any fanout list). + */ +int +conn_inherit_parent(conn_t *lconnp, conn_t *econnp) +{ + cred_t *credp; + int err; + void *notify_cookie; + + econnp->conn_family = lconnp->conn_family; + econnp->conn_ipv6_v6only = lconnp->conn_ipv6_v6only; + econnp->conn_wq = lconnp->conn_wq; + econnp->conn_rq = lconnp->conn_rq; + + /* + * Make a safe copy of the transmit attributes. + * conn_connect will later be used by the caller to setup the ire etc. + */ + ASSERT(econnp->conn_ixa->ixa_refcnt == 1); + ASSERT(econnp->conn_ixa->ixa_ire == NULL); + ASSERT(econnp->conn_ixa->ixa_dce == NULL); + ASSERT(econnp->conn_ixa->ixa_nce == NULL); + + /* Preserve ixa_notify_cookie */ + notify_cookie = econnp->conn_ixa->ixa_notify_cookie; + ixa_safe_copy(lconnp->conn_ixa, econnp->conn_ixa); + econnp->conn_ixa->ixa_notify_cookie = notify_cookie; + + econnp->conn_bound_if = lconnp->conn_bound_if; + econnp->conn_incoming_ifindex = lconnp->conn_incoming_ifindex; + + /* Inherit all RECV options */ + econnp->conn_recv_ancillary = lconnp->conn_recv_ancillary; + + err = ip_pkt_copy(&lconnp->conn_xmit_ipp, &econnp->conn_xmit_ipp, + KM_NOSLEEP); + if (err != 0) + return (err); + + econnp->conn_zoneid = lconnp->conn_zoneid; + econnp->conn_allzones = lconnp->conn_allzones; + + /* This is odd. Pick a flowlabel for each connection instead? */ + econnp->conn_flowinfo = lconnp->conn_flowinfo; + + econnp->conn_default_ttl = lconnp->conn_default_ttl; + + /* + * TSOL: tsol_input_proc() needs the eager's cred before the + * eager is accepted + */ + ASSERT(lconnp->conn_cred != NULL); + econnp->conn_cred = credp = lconnp->conn_cred; + crhold(credp); + econnp->conn_cpid = lconnp->conn_cpid; + econnp->conn_open_time = lbolt64; + + /* + * Cache things in the ixa without any refhold. 
+ * Listener might not have set up ixa_cred + */ + econnp->conn_ixa->ixa_cred = econnp->conn_cred; + econnp->conn_ixa->ixa_cpid = econnp->conn_cpid; + if (is_system_labeled()) + econnp->conn_ixa->ixa_tsl = crgetlabel(econnp->conn_cred); + + /* + * If the caller has the process-wide flag set, then default to MAC + * exempt mode. This allows read-down to unlabeled hosts. + */ + if (getpflags(NET_MAC_AWARE, credp) != 0) + econnp->conn_mac_mode = CONN_MAC_AWARE; + + econnp->conn_zone_is_global = lconnp->conn_zone_is_global; + + /* + * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ + * via soaccept()->soinheritoptions() which essentially applies + * all the listener options to the new connection. The options that we + * need to take care of are: + * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, + * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, + * SO_SNDBUF, SO_RCVBUF. + * + * SO_RCVBUF: conn_rcvbuf is set. + * SO_SNDBUF: conn_sndbuf is set. 
+ */ + + econnp->conn_sndbuf = lconnp->conn_sndbuf; + econnp->conn_rcvbuf = lconnp->conn_rcvbuf; + econnp->conn_sndlowat = lconnp->conn_sndlowat; + econnp->conn_rcvlowat = lconnp->conn_rcvlowat; + econnp->conn_dgram_errind = lconnp->conn_dgram_errind; + econnp->conn_oobinline = lconnp->conn_oobinline; + econnp->conn_debug = lconnp->conn_debug; + econnp->conn_keepalive = lconnp->conn_keepalive; + econnp->conn_linger = lconnp->conn_linger; + econnp->conn_lingertime = lconnp->conn_lingertime; + + /* Set the IP options */ + econnp->conn_broadcast = lconnp->conn_broadcast; + econnp->conn_useloopback = lconnp->conn_useloopback; + econnp->conn_reuseaddr = lconnp->conn_reuseaddr; + return (0); +} diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 7f6d4b621f..8222c866d0 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -35,65 +35,58 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/strsubr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> #include <sys/cmn_err.h> -#include <sys/debug.h> #include <sys/kmem.h> +#include <sys/cred_impl.h> #include <sys/policy.h> #include <sys/priv.h> +#include <sys/ucred.h> #include <sys/zone.h> -#include <sys/time.h> #include <sys/sockio.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/vtrace.h> +#include <sys/sdt.h> +#include <sys/debug.h> #include <sys/isa_defs.h> -#include <sys/suntpi.h> -#include <sys/xti_inet.h> -#include <sys/netstack.h> - -#include <net/route.h> -#include <net/if.h> - +#include <sys/random.h> #include <netinet/in.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <netinet/udp.h> + #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ipsec_impl.h> #include <inet/ip6.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_multi.h> +#include <inet/ip_ndp.h> #include <inet/proto_set.h> +#include <inet/mib2.h> #include <inet/nd.h> #include 
<inet/optcom.h> #include <inet/snmpcom.h> #include <inet/kstatcom.h> -#include <inet/rawip_impl.h> - -#include <netinet/ip_mroute.h> -#include <inet/tcp.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/ipclassifier.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> -#include <inet/ip_ire.h> -#include <inet/ip_if.h> +#include <inet/rawip_impl.h> -#include <inet/ip_impl.h> #include <sys/disp.h> /* * Synchronization notes: * - * RAWIP is MT and uses the usual kernel synchronization primitives. There is - * locks, which is icmp_rwlock. We also use conn_lock when updating things - * which affect the IP classifier lookup. - * The lock order is icmp_rwlock -> conn_lock. - * - * The icmp_rwlock: - * This protects most of the other fields in the icmp_t. The exact list of - * fields which are protected by each of the above locks is documented in - * the icmp_t structure definition. + * RAWIP is MT and uses the usual kernel synchronization primitives. We use + * conn_lock to protect the icmp_t. * * Plumbing notes: * ICMP is always a device driver. 
For compatibility with mibopen() code @@ -103,27 +96,29 @@ static void icmp_addr_req(queue_t *q, mblk_t *mp); static void icmp_tpi_bind(queue_t *q, mblk_t *mp); -static int icmp_bind_proto(conn_t *connp); -static int icmp_build_hdrs(icmp_t *icmp); +static void icmp_bind_proto(icmp_t *icmp); +static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, + const in6_addr_t *, uint32_t); static void icmp_capability_req(queue_t *q, mblk_t *mp); static int icmp_close(queue_t *q, int flags); +static void icmp_close_free(conn_t *); static void icmp_tpi_connect(queue_t *q, mblk_t *mp); static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, - int sys_error); + int sys_error); static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, - t_scalar_t t_error, int sys_error); -static void icmp_icmp_error(conn_t *connp, mblk_t *mp); -static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp); + t_scalar_t tlierr, int sys_error); +static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, + ip_recv_attr_t *); static void icmp_info_req(queue_t *q, mblk_t *mp); -static void icmp_input(void *, mblk_t *, void *); +static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); -static int icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, - int *errorp, void *thisdg_attrs); static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); int icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, @@ -131,25 +126,26 @@ int icmp_opt_set(conn_t *connp, uint_t optset_context, void *thisdg_attrs, cred_t *cr); int 
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr); +static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, + sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt); static int icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); +static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); +static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, + mblk_t *, const in6_addr_t *, uint32_t, int *); static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr, int len); static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); -static int icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst); static void icmp_wput(queue_t *q, mblk_t *mp); static void icmp_wput_fallback(queue_t *q, mblk_t *mp); -static int raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, - sin6_t *sin6, ip6_pkt_t *ipp); -static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, - ipaddr_t v4dst, ip4_pkt_t *pktinfop); static void icmp_wput_other(queue_t *q, mblk_t *mp); static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); static void icmp_wput_restricted(queue_t *q, mblk_t *mp); -static void icmp_ulp_recv(conn_t *, mblk_t *); +static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); static void rawip_stack_fini(netstackid_t stackid, void *arg); @@ -158,10 +154,14 @@ static void *rawip_kstat_init(netstackid_t stackid); static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); static int rawip_kstat_update(kstat_t *kp, int rw); static void rawip_stack_shutdown(netstackid_t stackid, void *arg); -static int 
rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, - uint_t *salenp); -static int rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, - uint_t *salenp); + +/* Common routines for TPI and socket module */ +static conn_t *rawip_do_open(int, cred_t *, int *, int); +static void rawip_do_close(conn_t *); +static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); +static int rawip_do_unbind(conn_t *); +static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, + cred_t *, pid_t); int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); @@ -185,7 +185,7 @@ static struct qinit icmprinitv6 = { }; static struct qinit icmpwinit = { - (pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info + (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info }; /* ICMP entry point during fallback */ @@ -236,6 +236,8 @@ static icmpparam_t icmp_param_arr[] = { { 0, 65536, 1024, "icmp_xmit_lowat"}, { 4096, 65536, 8192, "icmp_recv_hiwat"}, { 65536, 1024*1024*1024, 256*1024, "icmp_max_buf"}, + { 0, 1, 0, "icmp_pmtu_discovery" }, + { 0, 1, 0, "icmp_sendto_ignerr" }, }; #define is_wroff_extra is_param_arr[0].icmp_param_value #define is_ipv4_ttl is_param_arr[1].icmp_param_value @@ -245,18 +247,17 @@ static icmpparam_t icmp_param_arr[] = { #define is_xmit_lowat is_param_arr[5].icmp_param_value #define is_recv_hiwat is_param_arr[6].icmp_param_value #define is_max_buf is_param_arr[7].icmp_param_value +#define is_pmtu_discovery is_param_arr[8].icmp_param_value +#define is_sendto_ignerr is_param_arr[9].icmp_param_value -static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len); -static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa, - socklen_t len, cred_t *cr); -static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error); +typedef union T_primitives *t_primp_t; /* * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message * passed to icmp_wput. 
- * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP - * protocol type placed in the message following the address. A T_BIND_ACK - * message is returned by ip_bind_v4/v6. + * It calls IP to verify the local IP address, and calls IP to insert + * the conn_t in the fanout table. + * If everything is ok it then sends the T_BIND_ACK back up. */ static void icmp_tpi_bind(queue_t *q, mblk_t *mp) @@ -297,17 +298,17 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) if (icmp->icmp_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_bind: bad state, %d", icmp->icmp_state); + "icmp_bind: bad state, %u", icmp->icmp_state); icmp_err_ack(q, mp, TOUTSTATE, 0); return; } /* * Reallocate the message to make sure we have enough room for an - * address and the protocol type. + * address. */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); - if (!mp1) { + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); + if (mp1 == NULL) { icmp_err_ack(q, mp, TSYSERR, ENOMEM); return; } @@ -320,7 +321,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if (icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -329,7 +330,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) sa = (struct sockaddr *)sin; len = sizeof (sin_t); } else { - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -352,14 +353,12 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) default: (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_bind: bad ADDR_length %d", tbr->ADDR_length); + "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); icmp_err_ack(q, mp, TBADADDR, 0); return; } error = rawip_do_bind(connp, sa, len); -done: - ASSERT(mp->b_cont == NULL); if (error != 0) { if 
(error > 0) { icmp_err_ack(q, mp, TSYSERR, error); @@ -377,225 +376,208 @@ rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) { sin_t *sin; sin6_t *sin6; - icmp_t *icmp; + icmp_t *icmp = connp->conn_icmp; int error = 0; - mblk_t *ire_mp; - - - icmp = connp->conn_icmp; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + in_port_t lport; /* Network byte order */ + ipaddr_t v4src; /* Set if AF_INET */ + in6_addr_t v6src; + uint_t scopeid = 0; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; if (sa == NULL || !OK_32PTR((char *)sa)) { return (EINVAL); } - /* - * The state must be TS_UNBND. TPI mandates that users must send - * TPI primitives only 1 at a time and wait for the response before - * sending the next primitive. - */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) { - error = -TOUTSTATE; - goto done; - } - - ASSERT(len != 0); switch (len) { case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; if (sin->sin_family != AF_INET || - icmp->icmp_family != AF_INET) { + connp->conn_family != AF_INET) { /* TSYSERR, EAFNOSUPPORT */ - error = EAFNOSUPPORT; - goto done; + return (EAFNOSUPPORT); } + v4src = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, + B_TRUE); + } + lport = sin->sin_port; break; case sizeof (sin6_t): /* Complete IPv6 address */ sin6 = (sin6_t *)sa; if (sin6->sin6_family != AF_INET6 || - icmp->icmp_family != AF_INET6) { + connp->conn_family != AF_INET6) { /* TSYSERR, EAFNOSUPPORT */ - error = EAFNOSUPPORT; - goto done; + return (EAFNOSUPPORT); } /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* TSYSERR, EADDRNOTAVAIL */ - error = EADDRNOTAVAIL; - goto done; + return (EADDRNOTAVAIL); } + v6src = sin6->sin6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, + B_TRUE, scopeid); + } + lport = sin6->sin6_port; break; default: /* TBADADDR */ - error = EADDRNOTAVAIL; - goto done; + return (EADDRNOTAVAIL); } - icmp->icmp_pending_op = T_BIND_REQ; - icmp->icmp_state = TS_IDLE; + /* Is the local address a valid unicast, multicast, or broadcast? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + + /* + * The state must be TS_UNBND. + */ + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_UNBND) { + mutex_exit(&connp->conn_lock); + return (-TOUTSTATE); + } /* * Copy the source address into our icmp structure. This address * may still be zero; if so, ip will fill in the correct address * each time an outbound packet is passed to it. * If we are binding to a broadcast or multicast address then - * rawip_post_ip_bind_connect will clear the source address. + * we just set the conn_bound_addr since we don't want to use + * that as the source address when sending. 
*/ - - if (icmp->icmp_family == AF_INET) { - ASSERT(sin != NULL); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, - &icmp->icmp_v6src); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - icmp->icmp_bound_v6src = icmp->icmp_v6src; + connp->conn_bound_addr_v6 = v6src; + connp->conn_laddr_v6 = v6src; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - int error; - - ASSERT(sin6 != NULL); - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - icmp->icmp_v6src = sin6->sin6_addr; - icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; - icmp->icmp_bound_v6src = icmp->icmp_v6src; - - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - icmp->icmp_pending_op = -1; - /* - * TSYSERR - */ - goto done; - } + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - ire_mp = NULL; - if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) { - /* - * request an IRE if src not 0 (INADDR_ANY) - */ - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - icmp->icmp_pending_op = -1; - error = ENOMEM; - goto done; - } - DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + connp->conn_saddr_v6 = v6src; + connp->conn_mcbc_bind = B_FALSE; + break; + case IPVL_MCAST: + case IPVL_BCAST: + /* ip_set_destination will pick a source address later */ + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_TRUE; + break; } -done: - rw_exit(&icmp->icmp_rwlock); - if (error != 0) - return (error); - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, - &sin6->sin6_addr, sin6->sin6_port, B_TRUE); + /* Any errors after this point should use late_error */ + + /* + * Use sin_port/sin6_port since 
applications like psh use SOCK_RAW + * with IPPROTO_TCP. + */ + connp->conn_lport = lport; + connp->conn_fport = 0; + + if (connp->conn_family == AF_INET) { + ASSERT(connp->conn_ipversion == IPV4_VERSION); } else { - error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, - sin->sin_addr.s_addr, sin->sin_port, B_TRUE); + ASSERT(connp->conn_ipversion == IPV6_VERSION); } - rawip_post_ip_bind_connect(icmp, ire_mp, error); - return (error); -} -static void -rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error) -{ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state == TS_UNBND) { - /* - * not yet bound - bind sent by icmp_bind_proto. - */ - rw_exit(&icmp->icmp_rwlock); - return; - } - ASSERT(icmp->icmp_pending_op != -1); - icmp->icmp_pending_op = -1; + icmp->icmp_state = TS_IDLE; + /* + * We create an initial header template here to make a subsequent + * sendto have a starting point. Since conn_last_dst is zero the + * first sendto will always follow the 'dst changed' code path. + * Note that we defer massaging options and the related checksum + * adjustment until we have a destination address. + */ + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); if (error != 0) { - if (icmp->icmp_state == TS_DATA_XFER) { - /* Connect failed */ - /* Revert back to the bound source */ - icmp->icmp_v6src = icmp->icmp_bound_v6src; - icmp->icmp_state = TS_IDLE; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } else { - V6_SET_ZERO(icmp->icmp_v6src); - V6_SET_ZERO(icmp->icmp_bound_v6src); - icmp->icmp_state = TS_UNBND; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } - } else { - if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { - ire_t *ire; - - ire = (ire_t *)ire_mp->b_rptr; - /* - * If a broadcast/multicast address was bound set - * the source address to 0. 
- * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * Note: we get IRE_BROADCAST for IPv6 - * to "mark" a multicast local address. - */ + mutex_exit(&connp->conn_lock); + goto late_error; + } + /* Just in case */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + error = ip_laddr_fanout_insert(connp); + if (error != 0) + goto late_error; - if (ire->ire_type == IRE_BROADCAST && - icmp->icmp_state != TS_DATA_XFER) { - /* - * This was just a local bind to a - * MC/broadcast addr - */ - V6_SET_ZERO(icmp->icmp_v6src); - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } - } + /* Bind succeeded */ + return (0); +late_error: + mutex_enter(&connp->conn_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - rw_exit(&icmp->icmp_rwlock); - if (ire_mp != NULL) - freeb(ire_mp); + icmp->icmp_state = TS_UNBND; + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_lport = 0; + + /* Restore the header that was built above - different source address */ + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } /* - * Send message to IP to just bind to the protocol. + * Tell IP to just bind to the protocol. 
*/ -static int -icmp_bind_proto(conn_t *connp) +static void +icmp_bind_proto(icmp_t *icmp) { - icmp_t *icmp; - int error; - - icmp = connp->conn_icmp; + conn_t *connp = icmp->icmp_connp; - if (icmp->icmp_family == AF_INET6) - error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto, - &sin6_null.sin6_addr, 0, B_TRUE); - else - error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto, - sin_null.sin_addr.s_addr, 0, B_TRUE); + mutex_enter(&connp->conn_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); - rawip_post_ip_bind_connect(icmp, NULL, error); - return (error); + (void) ip_laddr_fanout_insert(connp); } +/* + * This routine handles each T_CONN_REQ message passed to icmp. It + * associates a default destination address with the stream. + * + * After various error checks are completed, icmp_connect() lays + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we send up the T_OK_ACK reply message. + */ static void icmp_tpi_connect(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); struct T_conn_req *tcr; - icmp_t *icmp; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; - + pid_t pid; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. @@ -603,14 +585,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) * like a TPI message sent by some other kernel * component, we check and return an error. 
*/ - cr = msg_getcred(mp, NULL); + cr = msg_getcred(mp, &pid); ASSERT(cr != NULL); if (cr == NULL) { icmp_err_ack(q, mp, TSYSERR, EINVAL); return; } - icmp = connp->conn_icmp; tcr = (struct T_conn_req *)mp->b_rptr; /* Sanity checks */ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { @@ -639,13 +620,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) break; } - error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { icmp_err_ack(q, mp, TSYSERR, error); return; } - error = rawip_do_connect(connp, sa, len, cr); + error = rawip_do_connect(connp, sa, len, cr, pid); if (error != 0) { if (error < 0) { icmp_err_ack(q, mp, -error, 0); @@ -659,11 +640,11 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) * We have to send a connection confirmation to * keep TLI happy. */ - if (icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin_t), NULL, 0); } else { - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin6_t), NULL, 0); } @@ -688,15 +669,20 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, - cred_t *cr) + cred_t *cr, pid_t pid) { - icmp_t *icmp; - sin_t *sin; - sin6_t *sin6; - mblk_t *ire_mp; - int error; + icmp_t *icmp; + sin_t *sin; + sin6_t *sin6; + int error; + uint16_t dstport; ipaddr_t v4dst; in6_addr_t v6dst; + uint32_t flowinfo; + ip_xmit_attr_t *ixa; + uint_t scopeid = 0; + uint_t srcid = 0; + in6_addr_t v6src = connp->conn_saddr_v6; icmp = connp->conn_icmp; @@ -704,170 +690,199 @@ rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, return (EINVAL); } - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) - return (ENOMEM); - DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; - - ASSERT(sa != NULL && len != 0); - rw_enter(&icmp->icmp_rwlock, 
RW_WRITER); - if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); - freeb(ire_mp); - return (-TOUTSTATE); - } - + /* + * Determine packet type based on type of address passed in + * the request should contain an IPv4 or IPv6 address. + * Make sure that address family matches the type of + * family of the address passed down. + */ switch (len) { case sizeof (sin_t): sin = (sin_t *)sa; - ASSERT(icmp->icmp_family == AF_INET); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - v4dst = sin->sin_addr.s_addr; - /* - * Interpret a zero destination to mean loopback. - * Update the T_CONN_REQ (sin/sin6) since it is used to - * generate the T_CONN_CON. - */ - if (v4dst == INADDR_ANY) { - v4dst = htonl(INADDR_LOOPBACK); - } - + dstport = sin->sin_port; IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - icmp->icmp_v6dst.sin6_addr = v6dst; - icmp->icmp_v6dst.sin6_family = AF_INET6; - icmp->icmp_v6dst.sin6_flowinfo = 0; - icmp->icmp_v6dst.sin6_port = 0; - - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * use the address of that interface as our - * source address if no source address has been set. 
- */ - if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY && - CLASSD(v4dst) && - icmp->icmp_multicast_if_addr != INADDR_ANY) { - IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr, - &icmp->icmp_v6src); - } + ASSERT(connp->conn_ipversion == IPV4_VERSION); break; + case sizeof (sin6_t): sin6 = (sin6_t *)sa; /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); - freeb(ire_mp); return (EADDRNOTAVAIL); } + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + ASSERT(connp->conn_ipversion == IPV6_VERSION); + flowinfo = sin6->sin6_flowinfo; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) + scopeid = sin6->sin6_scope_id; + srcid = sin6->__sin6_src_id; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + break; + } + + /* + * If there is a different thread using conn_ixa then we get a new + * copy and cut the old one loose from conn_ixa. Otherwise we use + * conn_ixa and prevent any other thread from using/changing it. + * Once connect() is done other threads can use conn_ixa since the + * refcnt will be back at one. + */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); - icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; + mutex_enter(&connp->conn_lock); + /* + * This icmp_t must have bound already before doing a connect. + * Reject if a connect is in progress (we drop conn_lock during + * rawip_do_connect). 
+ */ + if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + return (-TOUTSTATE); + } - icmp->icmp_v6dst = *sin6; - icmp->icmp_v6dst.sin6_port = 0; + if (icmp->icmp_state == TS_DATA_XFER) { + /* Already connected - clear out state */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + icmp->icmp_state = TS_IDLE; + } + /* + * Use sin_port/sin6_port since applications like psh use SOCK_RAW + * with IPPROTO_TCP. + */ + connp->conn_fport = dstport; + if (connp->conn_ipversion == IPV4_VERSION) { /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. */ - if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) { - icmp->icmp_v6dst.sin6_addr = ipv6_loopback; + if (v4dst == INADDR_ANY) { + v4dst = htonl(INADDR_LOOPBACK); + IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); + ASSERT(connp->conn_family == AF_INET); + sin->sin_addr.s_addr = v4dst; } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = 0; + } else { + ASSERT(connp->conn_ipversion == IPV6_VERSION); /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * then the ip bind logic will pick the correct source - * address (i.e. matching the outgoing multicast interface). + * Interpret a zero destination to mean loopback. + * Update the T_CONN_REQ (sin/sin6) since it is used to + * generate the T_CONN_CON. 
*/ - break; + if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { + v6dst = ipv6_loopback; + sin6->sin6_addr = v6dst; + } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = flowinfo; } - icmp->icmp_pending_op = T_CONN_REQ; - - if (icmp->icmp_state == TS_DATA_XFER) { - /* Already connected - clear out state */ - icmp->icmp_v6src = icmp->icmp_bound_v6src; - icmp->icmp_state = TS_IDLE; + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - icmp->icmp_state = TS_DATA_XFER; - rw_exit(&icmp->icmp_rwlock); - - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_connected_v6(connp, &ire_mp, - icmp->icmp_proto, &icmp->icmp_v6src, 0, - &icmp->icmp_v6dst.sin6_addr, - NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr); + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - error = ip_proto_bind_connected_v4(connp, &ire_mp, - icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0, - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port, - B_TRUE, B_TRUE, cr); + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - rawip_post_ip_bind_connect(icmp, ire_mp, error); - return (error); -} -static void -icmp_close_free(conn_t *connp) -{ - icmp_t *icmp = connp->conn_icmp; - - /* If there are any options associated with the stream, free them. */ - if (icmp->icmp_ip_snd_options != NULL) { - mi_free((char *)icmp->icmp_ip_snd_options); - icmp->icmp_ip_snd_options = NULL; - icmp->icmp_ip_snd_options_len = 0; - } + /* + * conn_connect will drop conn_lock and reacquire it. + * To prevent a send* from messing with this icmp_t while the lock + * is dropped we set icmp_state and clear conn_v6lastdst. + * That will make all send* fail with EISCONN. 
+ */ + connp->conn_v6lastdst = ipv6_all_zeros; + icmp->icmp_state = TS_WCON_CREQ; - if (icmp->icmp_filter != NULL) { - kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); - icmp->icmp_filter = NULL; - } + error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); + mutex_exit(&connp->conn_lock); + if (error != 0) + goto connect_failed; - /* Free memory associated with sticky options */ - if (icmp->icmp_sticky_hdrs_len != 0) { - kmem_free(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len); - icmp->icmp_sticky_hdrs = NULL; - icmp->icmp_sticky_hdrs_len = 0; - } + /* + * The addresses have been verified. Time to insert in + * the correct fanout list. + */ + error = ipcl_conn_insert(connp); + if (error != 0) + goto connect_failed; - if (icmp->icmp_last_cred != NULL) { - crfree(icmp->icmp_last_cred); - icmp->icmp_last_cred = NULL; + mutex_enter(&connp->conn_lock); + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto connect_failed; } - if (icmp->icmp_effective_cred != NULL) { - crfree(icmp->icmp_effective_cred); - icmp->icmp_effective_cred = NULL; - } + icmp->icmp_state = TS_DATA_XFER; + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + connp->conn_lastipversion = connp->conn_ipversion; + connp->conn_lastdstport = connp->conn_fport; + connp->conn_lastflowinfo = connp->conn_flowinfo; + connp->conn_lastscopeid = scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + mutex_exit(&connp->conn_lock); - ip6_pkt_free(&icmp->icmp_sticky_ipp); + ixa_refrele(ixa); + return (0); - /* - * Clear any fields which the kmem_cache constructor clears. - * Only icmp_connp needs to be preserved. - * TBD: We should make this more efficient to avoid clearing - * everything. 
- */ - ASSERT(icmp->icmp_connp == connp); - bzero(icmp, sizeof (icmp_t)); - icmp->icmp_connp = connp; +connect_failed: + if (ixa != NULL) + ixa_refrele(ixa); + mutex_enter(&connp->conn_lock); + icmp->icmp_state = TS_IDLE; + /* In case the source address was set above */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_flowinfo = 0; + + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } -static int +static void rawip_do_close(conn_t *connp) { ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); @@ -878,8 +893,6 @@ rawip_do_close(conn_t *connp) qprocsoff(connp->conn_rq); } - ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL && - connp->conn_icmp->icmp_fallback_queue_tail == NULL); icmp_close_free(connp); /* @@ -902,8 +915,6 @@ rawip_do_close(conn_t *connp) connp->conn_ref--; ipcl_conn_destroy(connp); - - return (0); } static int @@ -928,60 +939,63 @@ done: return (0); } +static void +icmp_close_free(conn_t *connp) +{ + icmp_t *icmp = connp->conn_icmp; + + if (icmp->icmp_filter != NULL) { + kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); + icmp->icmp_filter = NULL; + } + + /* + * Clear any fields which the kmem_cache constructor clears. + * Only icmp_connp needs to be preserved. + * TBD: We should make this more efficient to avoid clearing + * everything. + */ + ASSERT(icmp->icmp_connp == connp); + bzero(icmp, sizeof (icmp_t)); + icmp->icmp_connp = connp; +} + /* * This routine handles each T_DISCON_REQ message passed to icmp * as an indicating that ICMP is no longer connected. This results - * in sending a T_BIND_REQ to IP to restore the binding to just - * the local address. 
- * - * The disconnect completes in rawip_post_ip_bind_connect. + * in telling IP to restore the binding to just the local address. */ static int icmp_do_disconnect(conn_t *connp) { - icmp_t *icmp; - mblk_t *ire_mp; - int error; + icmp_t *icmp = connp->conn_icmp; + int error; - icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - icmp->icmp_pending_op = T_DISCON_REQ; - icmp->icmp_v6src = icmp->icmp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; icmp->icmp_state = TS_IDLE; + connp->conn_v6lastdst = ipv6_all_zeros; + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - if (icmp->icmp_family == AF_INET6) { - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } - - rw_exit(&icmp->icmp_rwlock); - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - return (ENOMEM); - } - - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, - &icmp->icmp_bound_v6src, 0, B_TRUE); - } else { - - error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, - V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE); - } - - rawip_post_ip_bind_connect(icmp, ire_mp, error); - - return (error); + /* + * Tell IP to remove the full binding and revert + * to the local address binding. 
+ */ + return (ip_laddr_fanout_insert(connp)); } static void @@ -1014,16 +1028,14 @@ icmp_tpi_disconnect(queue_t *q, mblk_t *mp) ASSERT(mp != NULL); qreply(q, mp); } - } static int icmp_disconnect(conn_t *connp) { int error; - icmp_t *icmp = connp->conn_icmp; - icmp->icmp_dgram_errind = B_FALSE; + connp->conn_dgram_errind = B_FALSE; error = icmp_do_disconnect(connp); @@ -1058,22 +1070,22 @@ icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, } /* - * icmp_icmp_error is called by icmp_input to process ICMP - * messages passed up by IP. - * Generates the appropriate permanent (non-transient) errors. - * Assumes that IP has pulled up everything up to and including - * the ICMP header. + * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. + * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. + * Assumes that IP has pulled up everything up to and including the ICMP header. */ +/* ARGSUSED2 */ static void -icmp_icmp_error(conn_t *connp, mblk_t *mp) +icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - sin_t sin; - mblk_t *mp1; - int error = 0; - icmp_t *icmp = connp->conn_icmp; + conn_t *connp = (conn_t *)arg1; + icmp_t *icmp = connp->conn_icmp; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + sin_t sin; + mblk_t *mp1; + int error = 0; ipha = (ipha_t *)mp->b_rptr; @@ -1081,34 +1093,57 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - icmp_icmp_error_ipv6(connp, mp); + icmp_icmp_error_ipv6(connp, mp, ira); return; } - - /* - * icmp does not support v4 mapped addresses - * so we can never be here for a V6 socket - * i.e. 
icmp_family == AF_INET6 - */ - ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) && - (icmp->icmp_family == AF_INET)); - - ASSERT(icmp->icmp_family == AF_INET); + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); - icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]); - ipha = (ipha_t *)&icmph[1]; + ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); + iph_hdr_length = ira->ira_ip_hdr_length; + icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; + ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ + iph_hdr_length = IPH_HDR_LENGTH(ipha); switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { - case ICMP_FRAGMENTATION_NEEDED: + case ICMP_FRAGMENTATION_NEEDED: { + ipha_t *ipha; + ip_xmit_attr_t *ixa; /* * IP has already adjusted the path MTU. + * But we need to adjust DF for IPv4. */ + if (connp->conn_ipversion != IPV4_VERSION) + break; + + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL || ixa->ixa_ire == NULL) { + /* + * Some other thread holds conn_ixa. We will + * redo this on the next ICMP too big. + */ + if (ixa != NULL) + ixa_refrele(ixa); + break; + } + (void) ip_get_pmtu(ixa); + + mutex_enter(&connp->conn_lock); + ipha = (ipha_t *)connp->conn_ht_iphc; + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); break; + } case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: error = ECONNREFUSED; @@ -1131,7 +1166,7 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. 
*/ - if (!icmp->icmp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1141,11 +1176,10 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) sin.sin_addr.s_addr = ipha->ipha_dst; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (icmp->icmp_state == TS_DATA_XFER) { - if (sin.sin_addr.s_addr == - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); + if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1154,27 +1188,25 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) icmp->icmp_delayed_error = error; *((sin_t *)&icmp->icmp_delayed_addr) = sin; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); } else { - mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, - 0, error); + mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, + error); if (mp1 != NULL) putnext(connp->conn_rq, mp1); } done: - ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } /* - * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6 - * for IPv6 packets. - * Send permanent (non-transient) errors upstream. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. + * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. + * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. + * Assumes that IP has pulled up all the extension headers as well as the + * ICMPv6 header. 
*/ static void -icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) +icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1186,13 +1218,18 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) icmp_t *icmp = connp->conn_icmp; outer_ip6h = (ip6_t *)mp->b_rptr; +#ifdef DEBUG if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); else iph_hdr_length = IPV6_HDR_LEN; - + ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); +#endif + /* Skip past the outer IP and ICMP headers */ + iph_hdr_length = ira->ira_ip_hdr_length; icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; - ip6h = (ip6_t *)&icmp6[1]; + + ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { freemsg(mp); return; @@ -1229,7 +1266,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * information, send up an empty message containing an * IPV6_PATHMTU ancillary data item. */ - if (!icmp->icmp_ipv6_recvpathmtu) + if (!connp->conn_ipv6_recvpathmtu) break; udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + @@ -1255,7 +1292,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6 = (sin6_t *)&tudi[1]; bzero(sin6, sizeof (sin6_t)); sin6->sin6_family = AF_INET6; - sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr; + sin6->sin6_addr = connp->conn_faddr_v6; toh = (struct T_opthdr *)&sin6[1]; toh->level = IPPROTO_IPV6; @@ -1273,8 +1310,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - icmp_ulp_recv(connp, newmp); - + icmp_ulp_recv(connp, newmp, msgdsize(newmp)); return; } case ICMP6_TIME_EXCEEDED: @@ -1299,7 +1335,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. 
*/ - if (!icmp->icmp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1308,13 +1344,12 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6.sin6_family = AF_INET6; sin6.sin6_addr = ip6h->ip6_dst; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - if (IPCL_IS_NONSTR(connp)) { - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (icmp->icmp_state == TS_DATA_XFER) { if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &icmp->icmp_v6dst.sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1323,7 +1358,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) icmp->icmp_delayed_error = error; *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); @@ -1331,7 +1366,6 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) putnext(connp->conn_rq, mp1); } done: - ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } @@ -1345,9 +1379,12 @@ done: static void icmp_addr_req(queue_t *q, mblk_t *mp) { - icmp_t *icmp = Q_TO_ICMP(q); + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; + icmp_t *icmp = Q_TO_ICMP(q); + conn_t *connp = icmp->icmp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -1363,65 +1400,39 @@ icmp_addr_req(queue_t *q, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; - rw_enter(&icmp->icmp_rwlock, RW_READER); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + mutex_enter(&connp->conn_lock); /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
*/ if (icmp->icmp_state != TS_UNBND) { /* - * Fill in local address + * Fill in local address first */ taa->LOCADDR_offset = sizeof (*taa); - if (icmp->icmp_family == AF_INET) { - sin_t *sin; - - taa->LOCADDR_length = sizeof (sin_t); - sin = (sin_t *)&taa[1]; - /* Fill zeroes and then intialize non-zero fields */ - *sin = sin_null; - sin->sin_family = AF_INET; - if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, - sin->sin_addr.s_addr); - } else { - /* - * INADDR_ANY - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. Use icmp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src, - sin->sin_addr.s_addr); - } - ackmp->b_wptr = (uchar_t *)&sin[1]; - } else { - sin6_t *sin6; - - ASSERT(icmp->icmp_family == AF_INET6); - taa->LOCADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&taa[1]; - /* Fill zeroes and then intialize non-zero fields */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin6->sin6_addr = icmp->icmp_v6src; - } else { - /* - * UNSPECIFIED - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use icmp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = icmp->icmp_bound_v6src; - } - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + if (icmp->icmp_state == TS_DATA_XFER) { + /* + * connected, fill remote address too + */ + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); qreply(q, ackmp); } @@ -1429,9 +1440,11 @@ icmp_addr_req(queue_t *q, mblk_t *mp) static void icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) { + conn_t *connp = icmp->icmp_connp; + *tap = icmp_g_t_info_ack; - if (icmp->icmp_family == AF_INET6) + if (connp->conn_family == AF_INET6) tap->ADDR_size = sizeof (sin6_t); else tap->ADDR_size = sizeof (sin_t); @@ -1488,6 +1501,7 @@ icmp_info_req(queue_t *q, mblk_t *mp) { icmp_t *icmp = Q_TO_ICMP(q); + /* Create a T_INFO_ACK message. */ mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, T_INFO_ACK); if (!mp) @@ -1496,18 +1510,14 @@ icmp_info_req(queue_t *q, mblk_t *mp) qreply(q, mp); } -/* For /dev/icmp aka AF_INET open */ static int icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, int family) { conn_t *connp; dev_t conn_dev; - icmp_stack_t *is; int error; - conn_dev = NULL; - /* If the stream is already open, return immediately. 
*/ if (q->q_ptr != NULL) return (0); @@ -1534,9 +1544,9 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (0); } - connp = icmp_open(family, credp, &error, KM_SLEEP); + connp = rawip_do_open(family, credp, &error, KM_SLEEP); if (connp == NULL) { - ASSERT(error != NULL); + ASSERT(error != 0); inet_minor_free(ip_minor_arena_sa, connp->conn_dev); return (error); } @@ -1545,8 +1555,6 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_dev = conn_dev; connp->conn_minor_arena = ip_minor_arena_sa; - is = connp->conn_icmp->icmp_is; - /* * Initialize the icmp_t structure for this stream. */ @@ -1555,38 +1563,25 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_rq = q; connp->conn_wq = WR(q); - if (connp->conn_icmp->icmp_family == AF_INET6) { - /* Build initial header template for transmit */ - rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); - if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) { - rw_exit(&connp->conn_icmp->icmp_rwlock); - inet_minor_free(ip_minor_arena_sa, connp->conn_dev); - ipcl_conn_destroy(connp); - return (error); - } - rw_exit(&connp->conn_icmp->icmp_rwlock); - } - - - q->q_hiwat = is->is_recv_hiwat; - WR(q)->q_hiwat = is->is_xmit_hiwat; - WR(q)->q_lowat = is->is_xmit_lowat; + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; qprocson(q); /* Set the Stream head write offset. 
*/ - (void) proto_set_tx_wroff(q, connp, - connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat); + (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); + icmp_bind_proto(connp->conn_icmp); + return (0); } -/* For /dev/icmp4 aka AF_INET open */ +/* For /dev/icmp aka AF_INET open */ static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { @@ -1604,15 +1599,15 @@ icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * This is the open routine for icmp. It allocates a icmp_t structure for * the stream and, on the first open of the module, creates an ND table. */ -/* ARGSUSED */ static conn_t * -icmp_open(int family, cred_t *credp, int *err, int flags) +rawip_do_open(int family, cred_t *credp, int *err, int flags) { icmp_t *icmp; conn_t *connp; zoneid_t zoneid; netstack_t *ns; icmp_stack_t *is; + int len; boolean_t isv6 = B_FALSE; *err = secpolicy_net_icmpaccess(credp); @@ -1621,6 +1616,7 @@ icmp_open(int family, cred_t *credp, int *err, int flags) if (family == AF_INET6) isv6 = B_TRUE; + ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); is = ns->netstack_icmp; @@ -1639,7 +1635,6 @@ icmp_open(int family, cred_t *credp, int *err, int flags) connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); icmp = connp->conn_icmp; - icmp->icmp_v6dst = sin6_null; /* * ipcl_conn_create did a netstack_hold. Undo the hold that was @@ -1647,35 +1642,52 @@ icmp_open(int family, cred_t *credp, int *err, int flags) */ netstack_rele(ns); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_ICMP); + /* + * Since this conn_t/icmp_t is not yet visible to anybody else we don't + * need to lock anything. 
+ */ + ASSERT(connp->conn_proto == IPPROTO_ICMP); ASSERT(connp->conn_icmp == icmp); ASSERT(icmp->icmp_connp == connp); /* Set the initial state of the stream and the privilege status. */ icmp->icmp_state = TS_UNBND; + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; if (isv6) { - icmp->icmp_ipversion = IPV6_VERSION; - icmp->icmp_family = AF_INET6; - connp->conn_ulp = IPPROTO_ICMPV6; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_proto = IPPROTO_ICMPV6; /* May be changed by a SO_PROTOTYPE socket option. */ - icmp->icmp_proto = IPPROTO_ICMPV6; - icmp->icmp_checksum_off = 2; /* Offset for icmp6_cksum */ - icmp->icmp_max_hdr_len = IPV6_HDR_LEN; - icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit; - connp->conn_af_isv6 = B_TRUE; + connp->conn_proto = IPPROTO_ICMPV6; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_raw_cksum_offset = 2; + connp->conn_default_ttl = is->is_ipv6_hoplimit; + len = sizeof (ip6_t); } else { - icmp->icmp_ipversion = IPV4_VERSION; - icmp->icmp_family = AF_INET; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; /* May be changed by a SO_PROTOTYPE socket option. 
*/ - icmp->icmp_proto = IPPROTO_ICMP; - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH; - icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl; - connp->conn_af_isv6 = B_FALSE; - } - icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - icmp->icmp_pending_op = -1; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_proto = IPPROTO_ICMP; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_default_ttl = is->is_ipv4_ttl; + len = sizeof (ipha_t); + } + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + /* + * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, + * the checksum is provided in the pre-built packet. We clear + * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a + * complete IP header and not to compute the transport checksum. + */ + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; + connp->conn_zoneid = zoneid; /* @@ -1685,17 +1697,35 @@ icmp_open(int family, cred_t *credp, int *err, int flags) if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_ulp_labeled = is_system_labeled(); + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); icmp->icmp_is = is; + connp->conn_rcvbuf = is->is_recv_hiwat; + connp->conn_sndbuf = is->is_xmit_hiwat; + connp->conn_sndlowat = is->is_xmit_lowat; + connp->conn_rcvlowat = icmp_mod_info.mi_lowat; + + connp->conn_wroff = len + is->is_wroff_extra; + connp->conn_so_type = SOCK_RAW; + connp->conn_recv = icmp_input; + connp->conn_recvicmp = icmp_icmp_input; crhold(credp); connp->conn_cred = credp; - - rw_exit(&icmp->icmp_rwlock); + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + 
connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); connp->conn_flow_cntrld = B_FALSE; + + if (is->is_pmtu_discovery) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + return (connp); } @@ -1713,9 +1743,8 @@ icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) * This routine gets default values of certain options whose default * values are maintained by protcol specific code */ -/* ARGSUSED */ int -icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) +icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { icmp_t *icmp = Q_TO_ICMP(q); icmp_stack_t *is = icmp->icmp_is; @@ -1759,366 +1788,88 @@ icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ int icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; int *i1 = (int *)ptr; - ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp; - int ret = 0; + conn_opt_arg_t coas; + int retval; - ASSERT(RW_READ_HELD(&icmp->icmp_rwlock)); - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = icmp->icmp_debug; - break; - case SO_TYPE: - *i1 = SOCK_RAW; - break; - case SO_PROTOTYPE: - *i1 = icmp->icmp_proto; - break; - case SO_REUSEADDR: - *i1 = icmp->icmp_reuseaddr; - break; - - /* - * The following three items are available here, - * but are only meaningful to IP. 
- */ - case SO_DONTROUTE: - *i1 = icmp->icmp_dontroute; - break; - case SO_USELOOPBACK: - *i1 = icmp->icmp_useloopback; - break; - case SO_BROADCAST: - *i1 = icmp->icmp_broadcast; - break; - - case SO_SNDBUF: - ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX); - *i1 = icmp->icmp_xmit_hiwat; - break; - case SO_RCVBUF: - ASSERT(icmp->icmp_recv_hiwat <= INT_MAX); - *i1 = icmp->icmp_recv_hiwat; - break; - case SO_DGRAM_ERRIND: - *i1 = icmp->icmp_dgram_errind; - break; - case SO_TIMESTAMP: - *i1 = icmp->icmp_timestamp; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_DOMAIN: - *i1 = icmp->icmp_family; - break; + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; - /* - * Following four not meaningful for icmp - * Action is same as "default" to which we fallthrough - * so we keep them in comments. - * case SO_LINGER: - * case SO_KEEPALIVE: - * case SO_OOBINLINE: - * case SO_ALLZONES: - */ - default: - ret = -1; - goto done; - } - break; + /* + * We assume that the optcom framework has checked for the set + * of levels and names that are supported, hence we don't worry + * about rejecting based on that. + * First check for ICMP specific handling, then pass to common routine. + */ + switch (level) { case IPPROTO_IP: /* * Only allow IPv4 option processing on IPv4 sockets. 
*/ - if (icmp->icmp_family != AF_INET) { - ret = -1; - goto done; - } + if (connp->conn_family != AF_INET) + return (-1); switch (name) { case IP_OPTIONS: case T_IP_OPTIONS: /* Options are passed up with each packet */ - ret = 0; - goto done; + return (0); case IP_HDRINCL: + mutex_enter(&connp->conn_lock); *i1 = (int)icmp->icmp_hdrincl; - break; - case IP_TOS: - case T_IP_TOS: - *i1 = (int)icmp->icmp_type_of_service; - break; - case IP_TTL: - *i1 = (int)icmp->icmp_ttl; - break; - case IP_MULTICAST_IF: - /* 0 address if not set */ - *(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr; - ret = sizeof (ipaddr_t); - goto done; - case IP_MULTICAST_TTL: - *(uchar_t *)ptr = icmp->icmp_multicast_ttl; - ret = sizeof (uchar_t); - goto done; - case IP_MULTICAST_LOOP: - *ptr = connp->conn_multicast_loop; - ret = sizeof (uint8_t); - goto done; - case IP_BOUND_IF: - /* Zero if not set */ - *i1 = icmp->icmp_bound_if; - break; /* goto sizeof (int) option return */ - case IP_UNSPEC_SRC: - *ptr = icmp->icmp_unspec_source; - break; /* goto sizeof (int) option return */ - case IP_RECVIF: - *ptr = icmp->icmp_recvif; - break; /* goto sizeof (int) option return */ - case IP_BROADCAST_TTL: - *(uchar_t *)ptr = connp->conn_broadcast_ttl; - return (sizeof (uchar_t)); - case IP_RECVPKTINFO: - /* - * This also handles IP_PKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - * This option is handled in IP which will return an - * error for IP_PKTINFO as it's not supported as a - * sticky option. - */ - ret = -EINVAL; - goto done; - /* - * Cannot "get" the value of following options - * at this level. Action is same as "default" to - * which we fallthrough so we keep them in comments. 
- * - * case IP_ADD_MEMBERSHIP: - * case IP_DROP_MEMBERSHIP: - * case IP_BLOCK_SOURCE: - * case IP_UNBLOCK_SOURCE: - * case IP_ADD_SOURCE_MEMBERSHIP: - * case IP_DROP_SOURCE_MEMBERSHIP: - * case MCAST_JOIN_GROUP: - * case MCAST_LEAVE_GROUP: - * case MCAST_BLOCK_SOURCE: - * case MCAST_UNBLOCK_SOURCE: - * case MCAST_JOIN_SOURCE_GROUP: - * case MCAST_LEAVE_SOURCE_GROUP: - * case MRT_INIT: - * case MRT_DONE: - * case MRT_ADD_VIF: - * case MRT_DEL_VIF: - * case MRT_ADD_MFC: - * case MRT_DEL_MFC: - * case MRT_VERSION: - * case MRT_ASSERT: - * case IP_SEC_OPT: - * case IP_NEXTHOP: - */ - default: - ret = -1; - goto done; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } break; + case IPPROTO_IPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. */ - if (icmp->icmp_family != AF_INET6) { - ret = -1; - goto done; - } + if (connp->conn_family != AF_INET6) + return (-1); + switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int)icmp->icmp_ttl; - break; - case IPV6_MULTICAST_IF: - /* 0 index if not set */ - *i1 = icmp->icmp_multicast_if_index; - break; - case IPV6_MULTICAST_HOPS: - *i1 = icmp->icmp_multicast_ttl; - break; - case IPV6_MULTICAST_LOOP: - *i1 = connp->conn_multicast_loop; - break; - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = icmp->icmp_bound_if; - break; - case IPV6_UNSPEC_SRC: - *i1 = icmp->icmp_unspec_source; - break; case IPV6_CHECKSUM: /* * Return offset or -1 if no checksum offset. 
* Does not apply to IPPROTO_ICMPV6 */ - if (icmp->icmp_proto == IPPROTO_ICMPV6) { - ret = -1; - goto done; - } + if (connp->conn_proto == IPPROTO_ICMPV6) + return (-1); - if (icmp->icmp_raw_checksum) { - *i1 = icmp->icmp_checksum_off; - } else { - *i1 = -1; - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - ret = -1; - goto done; - case IPV6_RECVPKTINFO: - *i1 = icmp->icmp_ip_recvpktinfo; - break; - case IPV6_RECVTCLASS: - *i1 = icmp->icmp_ipv6_recvtclass; - break; - case IPV6_RECVPATHMTU: - *i1 = icmp->icmp_ipv6_recvpathmtu; - break; - case IPV6_V6ONLY: - *i1 = 1; - break; - case IPV6_RECVHOPLIMIT: - *i1 = icmp->icmp_ipv6_recvhoplimit; - break; - case IPV6_RECVHOPOPTS: - *i1 = icmp->icmp_ipv6_recvhopopts; - break; - case IPV6_RECVDSTOPTS: - *i1 = icmp->icmp_ipv6_recvdstopts; - break; - case _OLD_IPV6_RECVDSTOPTS: - *i1 = icmp->icmp_old_ipv6_recvdstopts; - break; - case IPV6_RECVRTHDRDSTOPTS: - *i1 = icmp->icmp_ipv6_recvrtdstopts; - break; - case IPV6_RECVRTHDR: - *i1 = icmp->icmp_ipv6_recvrthdr; - break; - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; + mutex_enter(&connp->conn_lock); + if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) + *i1 = connp->conn_ixa->ixa_raw_cksum_offset; else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - ret = sizeof (struct in6_pktinfo); - goto done; - } - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - ret = (sizeof (sin6_t)); - goto done; - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6) - return (0); - bcopy((char *)ipp->ipp_hopopts + - icmp->icmp_label_len_v6, ptr, - ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); - if (icmp->icmp_label_len_v6 > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - icmp->icmp_label_len_v6 + 7) / 8 - 1; - } - ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); - goto done; - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - ret = ipp->ipp_rtdstoptslen; - goto done; - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - ret = ipp->ipp_rthdrlen; - goto done; - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) { - ret = 0; - goto done; - } - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - ret = ipp->ipp_dstoptslen; - goto done; - case IPV6_PATHMTU: - if (!(ipp->ipp_fields & IPPF_PATHMTU)) { - ret = 0; - } else { - ret = ip_fill_mtuinfo( - &icmp->icmp_v6dst.sin6_addr, 0, - (struct ip6_mtuinfo *)ptr, - is->is_netstack); - } - goto done; - case IPV6_TCLASS: - if 
(ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; - default: - ret = -1; - goto done; + *i1 = -1; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } break; + case IPPROTO_ICMPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. */ - if (icmp->icmp_family != AF_INET6) { - ret = -1; - } + if (connp->conn_family != AF_INET6) + return (-1); - if (icmp->icmp_proto != IPPROTO_ICMPV6) { - ret = -1; - } + if (connp->conn_proto != IPPROTO_ICMPV6) + return (-1); switch (name) { case ICMP6_FILTER: + mutex_enter(&connp->conn_lock); if (icmp->icmp_filter == NULL) { /* Make it look like "pass all" */ ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); @@ -2126,501 +1877,149 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) (void) bcopy(icmp->icmp_filter, ptr, sizeof (icmp6_filter_t)); } - ret = sizeof (icmp6_filter_t); - goto done; - default: - ret = -1; - goto done; + mutex_exit(&connp->conn_lock); + return (sizeof (icmp6_filter_t)); } - default: - ret = -1; - goto done; } - ret = sizeof (int); -done: - return (ret); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ int icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - int err; + conn_t *connp = Q_TO_CONN(q); + int err; - rw_enter(&icmp->icmp_rwlock, RW_READER); err = icmp_opt_get(connp, level, name, ptr); - rw_exit(&icmp->icmp_rwlock); return (err); } +/* + * This routine sets socket options. 
+ */ int -icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, - uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, - void *thisdg_attrs, boolean_t checkonly) +icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, + uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) { + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; - int *i1 = (int *)invalp; - boolean_t onoff = (*i1 == 0) ? 0 : 1; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int error; + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); - ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock)); /* * For fixed length options, no sanity check * of passed in length is done. It is assumed *_optcom_req() * routines do the right thing. */ + switch (level) { case SOL_SOCKET: switch (name) { - case SO_DEBUG: - if (!checkonly) - icmp->icmp_debug = onoff; - break; case SO_PROTOTYPE: if ((*i1 & 0xFF) != IPPROTO_ICMP && (*i1 & 0xFF) != IPPROTO_ICMPV6 && secpolicy_net_rawaccess(cr) != 0) { - *outlenp = 0; return (EACCES); } - /* Can't use IPPROTO_RAW with IPv6 */ - if ((*i1 & 0xFF) == IPPROTO_RAW && - icmp->icmp_family == AF_INET6) { - *outlenp = 0; - return (EPROTONOSUPPORT); - } - if (checkonly) { - /* T_CHECK case */ - *(int *)outvalp = (*i1 & 0xFF); + if (checkonly) break; - } - icmp->icmp_proto = *i1 & 0xFF; - if ((icmp->icmp_proto == IPPROTO_RAW || - icmp->icmp_proto == IPPROTO_IGMP) && - icmp->icmp_family == AF_INET) + + mutex_enter(&connp->conn_lock); + connp->conn_proto = *i1 & 0xFF; + ixa->ixa_protocol = connp->conn_proto; + if ((connp->conn_proto == IPPROTO_RAW || + connp->conn_proto == IPPROTO_IGMP) && + connp->conn_family == AF_INET) { icmp->icmp_hdrincl = 1; - else + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; + } else if (connp->conn_proto == IPPROTO_UDP || + connp->conn_proto == 
IPPROTO_TCP || + connp->conn_proto == IPPROTO_SCTP) { + /* Used by test applications like psh */ icmp->icmp_hdrincl = 0; - - if (icmp->icmp_family == AF_INET6 && - icmp->icmp_proto == IPPROTO_ICMPV6) { - /* Set offset for icmp6_cksum */ - icmp->icmp_raw_checksum = 0; - icmp->icmp_checksum_off = 2; - } - if (icmp->icmp_proto == IPPROTO_UDP || - icmp->icmp_proto == IPPROTO_TCP || - icmp->icmp_proto == IPPROTO_SCTP) { - icmp->icmp_no_tp_cksum = 1; - icmp->icmp_sticky_ipp.ipp_fields |= - IPPF_NO_CKSUM; + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; } else { - icmp->icmp_no_tp_cksum = 0; - icmp->icmp_sticky_ipp.ipp_fields &= - ~IPPF_NO_CKSUM; + icmp->icmp_hdrincl = 0; + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; } + if (connp->conn_family == AF_INET6 && + connp->conn_proto == IPPROTO_ICMPV6) { + /* Set offset for icmp6_cksum */ + ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = 2; + } if (icmp->icmp_filter != NULL && - icmp->icmp_proto != IPPROTO_ICMPV6) { + connp->conn_proto != IPPROTO_ICMPV6) { kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); icmp->icmp_filter = NULL; } + mutex_exit(&connp->conn_lock); - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - + coa->coa_changed |= COA_HEADER_CHANGED; /* * For SCTP, we don't use icmp_bind_proto() for - * raw socket binding. Note that we do not need - * to set *outlenp. - * FIXME: how does SCTP work? + * raw socket binding. 
*/ - if (icmp->icmp_proto == IPPROTO_SCTP) + if (connp->conn_proto == IPPROTO_SCTP) return (0); - *outlenp = sizeof (int); - *(int *)outvalp = *i1 & 0xFF; - - /* Drop lock across the bind operation */ - rw_exit(&icmp->icmp_rwlock); - (void) icmp_bind_proto(connp); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + coa->coa_changed |= COA_ICMP_BIND_NEEDED; return (0); - case SO_REUSEADDR: - if (!checkonly) { - icmp->icmp_reuseaddr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - if (!checkonly) { - icmp->icmp_dontroute = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_USELOOPBACK: - if (!checkonly) { - icmp->icmp_useloopback = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_BROADCAST: - if (!checkonly) { - icmp->icmp_broadcast = onoff; - PASS_OPT_TO_IP(connp); - } - break; case SO_SNDBUF: if (*i1 > is->is_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - if (!IPCL_IS_NONSTR(connp)) { - connp->conn_wq->q_hiwat = *i1; - } - icmp->icmp_xmit_hiwat = *i1; - } break; case SO_RCVBUF: if (*i1 > is->is_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - icmp->icmp_recv_hiwat = *i1; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - *i1); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - icmp->icmp_dgram_errind = onoff; break; - case SO_ALLZONES: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case SO_TIMESTAMP: - if (!checkonly) { - icmp->icmp_timestamp = onoff; - } - break; - case SO_MAC_EXEMPT: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. 
Here just return directly. - */ - return (0); - /* - * Following three not meaningful for icmp - * Action is same as "default" so we keep them - * in comments. - * case SO_LINGER: - * case SO_KEEPALIVE: - * case SO_OOBINLINE: - */ - default: - *outlenp = 0; - return (EINVAL); } break; + case IPPROTO_IP: /* * Only allow IPv4 option processing on IPv4 sockets. */ - if (icmp->icmp_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - /* Save options for use by IP. */ - if ((inlen & 0x3) || - inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) { - *outlenp = 0; - return (EINVAL); - } - if (checkonly) - break; - - if (!tsol_option_set(&icmp->icmp_ip_snd_options, - &icmp->icmp_ip_snd_options_len, - icmp->icmp_label_len, invalp, inlen)) { - *outlenp = 0; - return (ENOMEM); - } + if (connp->conn_family != AF_INET) + return (EINVAL); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL: - RD(connp->conn_rq), connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - break; + switch (name) { case IP_HDRINCL: - if (!checkonly) - icmp->icmp_hdrincl = onoff; - break; - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - icmp->icmp_type_of_service = (uint8_t)*i1; - } - break; - case IP_TTL: if (!checkonly) { - icmp->icmp_ttl = (uint8_t)*i1; - } - break; - case IP_MULTICAST_IF: - /* - * TODO should check OPTMGMT reply and undo this if - * there is an error. - */ - if (!checkonly) { - icmp->icmp_multicast_if_addr = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_MULTICAST_TTL: - if (!checkonly) - icmp->icmp_multicast_ttl = *invalp; - break; - case IP_MULTICAST_LOOP: - if (!checkonly) { - connp->conn_multicast_loop = - (*invalp == 0) ? 
0 : 1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BOUND_IF: - if (!checkonly) { - icmp->icmp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_UNSPEC_SRC: - if (!checkonly) { - icmp->icmp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BROADCAST_TTL: - if (!checkonly) - connp->conn_broadcast_ttl = *invalp; - break; - case IP_RECVIF: - if (!checkonly) { - icmp->icmp_recvif = onoff; - } - /* - * pass to ip - */ - return (-EINVAL); - case IP_PKTINFO: { - /* - * This also handles IP_RECVPKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - */ - struct in_pktinfo *pktinfop; - ip4_pkt_t *attr_pktinfop; - - if (checkonly) - break; - - if (inlen == sizeof (int)) { - /* - * This is IP_RECVPKTINFO option. - * Keep a local copy of wether this option is - * set or not and pass it down to IP for - * processing. - */ - icmp->icmp_ip_recvpktinfo = onoff; - return (-EINVAL); - } - - - if (inlen != sizeof (struct in_pktinfo)) { - return (EINVAL); - } - - if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs) - == NULL) { - /* - * sticky option is not supported - */ - return (EINVAL); - } - - pktinfop = (struct in_pktinfo *)invalp; - - /* - * Atleast one of the values should be specified - */ - if (pktinfop->ipi_ifindex == 0 && - pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) { - return (EINVAL); + mutex_enter(&connp->conn_lock); + icmp->icmp_hdrincl = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; + else + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; + mutex_exit(&connp->conn_lock); } - - attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr; - attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex; - } break; - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case 
MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MRT_INIT: - case MRT_DONE: - case MRT_ADD_VIF: - case MRT_DEL_VIF: - case MRT_ADD_MFC: - case MRT_DEL_MFC: - case MRT_VERSION: - case MRT_ASSERT: - case IP_SEC_OPT: - case IP_NEXTHOP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); } break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - boolean_t sticky; - if (icmp->icmp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - /* - * Deal with both sticky options and ancillary data - */ - if (thisdg_attrs == NULL) { - /* sticky options, or none */ - ipp = &icmp->icmp_sticky_ipp; - sticky = B_TRUE; - } else { - /* ancillary data */ - ipp = (ip6_pkt_t *)thisdg_attrs; - sticky = B_FALSE; - } + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) + return (EINVAL); switch (name) { - case IPV6_MULTICAST_IF: - if (!checkonly) { - icmp->icmp_multicast_if_index = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - icmp->icmp_ttl = ipp->ipp_unicast_hops = - is->is_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. 
*/ - *i1 = ipp->ipp_hoplimit; - } else { - icmp->icmp_ttl = ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - } - break; - case IPV6_MULTICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - icmp->icmp_multicast_ttl = - ipp->ipp_multicast_hops = - IP_DEFAULT_MULTICAST_TTL; - ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = icmp->icmp_multicast_ttl; - } else { - icmp->icmp_multicast_ttl = - ipp->ipp_multicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_MULTICAST_HOPS; - } - } - break; - case IPV6_MULTICAST_LOOP: - if (*i1 != 0 && *i1 != 1) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - connp->conn_multicast_loop = *i1; - PASS_OPT_TO_IP(connp); - } - break; case IPV6_CHECKSUM: /* * Integer offset into the user data of where the @@ -2628,517 +2027,93 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, * Offset of -1 disables option. * Does not apply to IPPROTO_ICMPV6. 
*/ - if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) { - *outlenp = 0; + if (connp->conn_proto == IPPROTO_ICMPV6 || + coa->coa_ancillary) { return (EINVAL); } if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { /* Negative or not 16 bit aligned offset */ - *outlenp = 0; return (EINVAL); } if (checkonly) break; + mutex_enter(&connp->conn_lock); if (*i1 == -1) { - icmp->icmp_raw_checksum = 0; - ipp->ipp_fields &= ~IPPF_RAW_CKSUM; + ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = 0; + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; } else { - icmp->icmp_raw_checksum = 1; - icmp->icmp_checksum_off = *i1; - ipp->ipp_fields |= IPPF_RAW_CKSUM; - } - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case IPV6_BOUND_IF: - if (!checkonly) { - icmp->icmp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNSPEC_SRC: - if (!checkonly) { - icmp->icmp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVTCLASS: - if (!checkonly) { - icmp->icmp_ipv6_recvtclass = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - icmp->icmp_ip_recvpktinfo = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - icmp->icmp_ipv6_recvpathmtu = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - icmp->icmp_ipv6_recvhoplimit = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - 
icmp->icmp_ipv6_recvhopopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - icmp->icmp_ipv6_recvdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) - icmp->icmp_old_ipv6_recvdstopts = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - icmp->icmp_ipv6_recvrtdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - icmp->icmp_ipv6_recvrthdr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set sticky options or ancillary data. - * If sticky options, (re)build any extension headers - * that might be needed as a result. - */ - case IPV6_PKTINFO: - /* - * The source address and ifindex are verified - * in ip_opt_set(). For ancillary data the - * source address is checked in ip_wput_v6. - */ - if (inlen != 0 && inlen != - sizeof (struct in6_pktinfo)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - ipp->ipp_sticky_ignored |= - (IPPF_IFINDEX|IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPLIMIT: - /* This option can only be used as ancillary data. 
*/ - if (sticky) - return (EINVAL); - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_HOPLIMIT; - ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_hoplimit = - is->is_ipv6_hoplimit; - else - ipp->ipp_hoplimit = *i1; - ipp->ipp_fields |= IPPF_HOPLIMIT; - } - break; - case IPV6_TCLASS: - /* - * IPV6_RECVTCLASS accepts -1 as use kernel default - * and [0, 255] as the actualy traffic class. - */ - if (inlen != 0 && inlen != sizeof (int)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - ipp->ipp_sticky_ignored |= IPPF_TCLASS; - } else { - if (*i1 >= 256 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) { - ipp->ipp_tclass = - IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - } else { - ipp->ipp_tclass = *i1; - } - ipp->ipp_fields |= IPPF_TCLASS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. 
- */ - if (inlen != 0 && inlen != sizeof (sin6_t)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - ipp->ipp_sticky_ignored |= IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) { - return (EAFNOSUPPORT); - } - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - return (EADDRNOTAVAIL); - } - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) { - return (EINVAL); - } - - if (checkonly) - break; - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, - sticky ? icmp->icmp_label_len_v6 : 0); - if (error != 0) - return (error); - if (ipp->ipp_hopoptslen == 0) { - ipp->ipp_fields &= ~IPPF_HOPOPTS; - ipp->ipp_sticky_ignored |= IPPF_HOPOPTS; - } else { - ipp->ipp_fields |= IPPF_HOPOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); + ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = *i1; + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; } + mutex_exit(&connp->conn_lock); break; } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) { - kmem_free(ipp->ipp_rtdstopts, - ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTDSTOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; + break; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); + case IPPROTO_ICMPV6: + /* + * Only allow IPv6 option processing on IPv6 sockets. + */ + if (connp->conn_family != AF_INET6) + return (EINVAL); + if (connp->conn_proto != IPPROTO_ICMPV6) + return (EINVAL); + switch (name) { + case ICMP6_FILTER: if (checkonly) break; - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_DSTOPTS) != 0) { - kmem_free(ipp->ipp_dstopts, - ipp->ipp_dstoptslen); - ipp->ipp_dstopts = NULL; - ipp->ipp_dstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_DSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_DSTOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) + if ((inlen != 0) && + (inlen != sizeof (icmp6_filter_t))) return (EINVAL); - if (checkonly) - break; - + mutex_enter(&connp->conn_lock); if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTHDR) != 0) { - kmem_free(ipp->ipp_rthdr, - ipp->ipp_rthdrlen); - ipp->ipp_rthdr = NULL; - ipp->ipp_rthdrlen = 0; + if (icmp->icmp_filter != NULL) { + kmem_free(icmp->icmp_filter, + sizeof (icmp6_filter_t)); + icmp->icmp_filter = NULL; } - ipp->ipp_fields &= ~IPPF_RTHDR; - ipp->ipp_sticky_ignored |= IPPF_RTHDR; } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTHDR; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - - case IPV6_DONTFRAG: - if (checkonly) - break; - - if (onoff) { - ipp->ipp_fields |= IPPF_DONTFRAG; - } else { - ipp->ipp_fields &= ~IPPF_DONTFRAG; - } - break; - - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; - - /* - * This option can't be set. Its only returned via - * getsockopt() or ancillary data. - */ - case IPV6_PATHMTU: - return (EINVAL); - - case IPV6_SEC_OPT: - case IPV6_SRC_PREFERENCES: - case IPV6_V6ONLY: - /* Handled at IP level */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); - } - break; - } /* end IPPROTO_IPV6 */ - - case IPPROTO_ICMPV6: - /* - * Only allow IPv6 option processing on IPv6 sockets. 
- */ - if (icmp->icmp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - if (icmp->icmp_proto != IPPROTO_ICMPV6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case ICMP6_FILTER: - if (!checkonly) { - if ((inlen != 0) && - (inlen != sizeof (icmp6_filter_t))) - return (EINVAL); - - if (inlen == 0) { - if (icmp->icmp_filter != NULL) { - kmem_free(icmp->icmp_filter, - sizeof (icmp6_filter_t)); - icmp->icmp_filter = NULL; - } - } else { + if (icmp->icmp_filter == NULL) { + icmp->icmp_filter = kmem_alloc( + sizeof (icmp6_filter_t), + KM_NOSLEEP); if (icmp->icmp_filter == NULL) { - icmp->icmp_filter = kmem_alloc( - sizeof (icmp6_filter_t), - KM_NOSLEEP); - if (icmp->icmp_filter == NULL) { - *outlenp = 0; - return (ENOBUFS); - } + mutex_exit(&connp->conn_lock); + return (ENOBUFS); } - (void) bcopy(invalp, icmp->icmp_filter, - inlen); } + (void) bcopy(invalp, icmp->icmp_filter, inlen); } + mutex_exit(&connp->conn_lock); break; - - default: - *outlenp = 0; - return (EINVAL); } break; - default: - *outlenp = 0; - return (EINVAL); - } - /* - * Common case of OK return with outval same as inval. - */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - (void) bcopy(invalp, outvalp, inlen); } - *outlenp = inlen; - return (0); + error = conn_opt_set(coa, level, name, inlen, invalp, + checkonly, cr); + return (error); } -/* This routine sets socket options. */ -/* ARGSUSED */ +/* + * This routine sets socket options. 
+ */ int icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { - boolean_t checkonly; - int error; + icmp_t *icmp = connp->conn_icmp; + int err; + conn_opt_arg_t coas, *coa; + boolean_t checkonly; + icmp_stack_t *is = icmp->icmp_is; - error = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: checkonly = B_TRUE; @@ -3152,8 +2127,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (inlen == 0) { *outlenp = 0; - error = 0; - goto done; + return (0); } break; case SETFN_OPTCOM_NEGOTIATE: @@ -3171,8 +2145,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!icmp_opt_allow_udr_set(level, name)) { *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } break; default: @@ -3180,105 +2153,265 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * We should never get here */ *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp, - outvalp, cr, thisdg_attrs, checkonly); -done: - return (error); + if (thisdg_attrs != NULL) { + /* Options from T_UNITDATA_REQ */ + coa = (conn_opt_arg_t *)thisdg_attrs; + ASSERT(coa->coa_connp == connp); + ASSERT(coa->coa_ixa != NULL); + ASSERT(coa->coa_ipp != NULL); + ASSERT(coa->coa_ancillary); + } else { + coa = &coas; + coas.coa_connp = connp; + /* Get a reference on conn_ixa to prevent concurrent mods */ + coas.coa_ixa = conn_get_ixa(connp, B_TRUE); + if (coas.coa_ixa == NULL) { + *outlenp = 0; + return (ENOMEM); + } + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + } + + err = icmp_do_opt_set(coa, level, name, inlen, invalp, + cr, checkonly); + if (err != 0) { +errout: + if 
(!coa->coa_ancillary) + ixa_refrele(coa->coa_ixa); + *outlenp = 0; + return (err); + } + + /* + * Common case of OK return with outval same as inval. + */ + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } + *outlenp = inlen; + + /* + * If this was not ancillary data, then we rebuild the headers, + * update the IRE/NCE, and IPsec as needed. + * Since the label depends on the destination we go through + * ip_set_destination first. + */ + if (coa->coa_ancillary) { + return (0); + } + + if (coa->coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t saddr, faddr, nexthop; + in_port_t fport; + + /* + * We clear lastdst to make sure we pick up the change + * next time sending. + * If we are connected we re-cache the information. + * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + mutex_enter(&connp->conn_lock); + connp->conn_v6lastdst = ipv6_all_zeros; + + ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + mutex_exit(&connp->conn_lock); + + if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && + !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { + (void) ip_attr_connect(connp, coa->coa_ixa, + &saddr, &faddr, &nexthop, fport, NULL, NULL, + IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); + } + } + + ixa_refrele(coa->coa_ixa); + + if (coa->coa_changed & COA_HEADER_CHANGED) { + /* + * Rebuild the header template if we are connected. + * Otherwise clear conn_v6lastdst so we rebuild the header + * in the data path. 
+ */ + mutex_enter(&connp->conn_lock); + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + err = icmp_build_hdr_template(connp, + &connp->conn_saddr_v6, &connp->conn_faddr_v6, + connp->conn_flowinfo); + if (err != 0) { + mutex_exit(&connp->conn_lock); + return (err); + } + } else { + connp->conn_v6lastdst = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + } + if (coa->coa_changed & COA_RCVBUF_CHANGED) { + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + connp->conn_rcvbuf); + } + if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coa->coa_changed & COA_WROFF_CHANGED) { + /* Increase wroff if needed */ + uint_t wroff; + + mutex_enter(&connp->conn_lock); + wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; + if (wroff > connp->conn_wroff) { + connp->conn_wroff = wroff; + mutex_exit(&connp->conn_lock); + (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); + } else { + mutex_exit(&connp->conn_lock); + } + } + if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { + icmp_bind_proto(icmp); + } + return (err); } /* This routine sets socket options. */ -/* ARGSUSED */ int icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp; + conn_t *connp = Q_TO_CONN(q); int error; - icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, outlenp, outvalp, thisdg_attrs, cr); - rw_exit(&icmp->icmp_rwlock); return (error); } /* - * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl, - * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum. 
- * The headers include ip6i_t (if needed), ip6_t, and any sticky extension - * headers. - * Returns failure if can't allocate memory. + * Setup IP headers. + * + * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, + * but icmp_output_hdrincl restores ipha_protocol once we return. */ -static int -icmp_build_hdrs(icmp_t *icmp) +mblk_t * +icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, + mblk_t *data_mp, int *errorp) { - icmp_stack_t *is = icmp->icmp_is; - uchar_t *hdrs; - uint_t hdrs_len; - ip6_t *ip6h; - ip6i_t *ip6i; - ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp; - - ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock)); - hdrs_len = ip_total_hdrs_len_v6(ipp); - ASSERT(hdrs_len != 0); - if (hdrs_len != icmp->icmp_sticky_hdrs_len) { - /* Need to reallocate */ - if (hdrs_len != 0) { - hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); - } else { - hdrs = NULL; - } - if (icmp->icmp_sticky_hdrs_len != 0) { - kmem_free(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len); - } - icmp->icmp_sticky_hdrs = hdrs; - icmp->icmp_sticky_hdrs_len = hdrs_len; + mblk_t *mp; + icmp_stack_t *is = connp->conn_netstack->netstack_icmp; + uint_t data_len; + uint32_t cksum; + + data_len = msgdsize(data_mp); + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, + flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); + if (mp == NULL) { + ASSERT(*errorp != 0); + return (NULL); } - ip_build_hdrs_v6(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs; - ip6h = (ip6_t *)&ip6i[1]; + ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; - if (ipp->ipp_fields & IPPF_RAW_CKSUM) { - ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM; - ip6i->ip6i_checksum_off = icmp->icmp_checksum_off; + /* + * If there was 
a routing option/header then conn_prepend_hdr + * has massaged it and placed the pseudo-header checksum difference + * in the cksum argument. + * + * Prepare for ICMPv6 checksum done in IP. + * + * We make it easy for IP to include our pseudo header + * by putting our length (and any routing header adjustment) + * in the ICMPv6 checksum field. + * The IP source, destination, and length have already been set by + * conn_prepend_hdr. + */ + cksum += data_len; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint_t cksum_offset = 0; + + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); + + if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { + if (connp->conn_proto == IPPROTO_ICMPV6) { + cksum_offset = ixa->ixa_ip_hdr_length + + offsetof(icmp6_t, icmp6_cksum); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + cksum_offset = ixa->ixa_ip_hdr_length + + ixa->ixa_raw_cksum_offset; + } } - if (ipp->ipp_fields & IPPF_NO_CKSUM) { - ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM; + if (cksum_offset != 0) { + uint16_t *ptr; + + /* Make sure the checksum fits in the first mblk */ + if (cksum_offset + sizeof (short) > MBLKL(mp)) { + mblk_t *mp1; + + mp1 = msgpullup(mp, + cksum_offset + sizeof (short)); + freemsg(mp); + if (mp1 == NULL) { + *errorp = ENOMEM; + return (NULL); + } + mp = mp1; + ip6h = (ip6_t *)mp->b_rptr; + } + ptr = (uint16_t *)(mp->b_rptr + cksum_offset); + *ptr = htons(cksum); } - } else { - ip6h = (ip6_t *)icmp->icmp_sticky_hdrs; } - if (!(ipp->ipp_fields & IPPF_ADDR)) - ip6h->ip6_src = icmp->icmp_v6src; + /* Note that we don't try to update wroff due to ancillary data */ + return (mp); +} + +static int +icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, uint32_t flowinfo) +{ + int error; - /* Try to get everything 
in a single mblk */ - if (hdrs_len > icmp->icmp_max_hdr_len) { - icmp->icmp_max_hdr_len = hdrs_len; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq, - icmp->icmp_connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - } + ASSERT(MUTEX_HELD(&connp->conn_lock)); + /* + * We clear lastdst to make sure we don't use the lastdst path + * next time sending since we might not have set v6dst yet. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + + error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); + if (error != 0) + return (error); + + /* + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum. + */ return (0); } @@ -3370,16 +2503,15 @@ icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) * TPI, then we'll queue the mp for later processing. */ static void -icmp_ulp_recv(conn_t *connp, mblk_t *mp) +icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) { - if (IPCL_IS_NONSTR(connp)) { icmp_t *icmp = connp->conn_icmp; int error; + ASSERT(len == msgdsize(mp)); if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { + (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { mutex_enter(&icmp->icmp_recv_lock); if (error == ENOSPC) { /* @@ -3409,115 +2541,74 @@ icmp_ulp_recv(conn_t *connp, mblk_t *mp) } } -/*ARGSUSED2*/ +/* + * This is the inbound data path. + * IP has already pulled up the IP headers and verified alignment + * etc. 
+ */ +/* ARGSUSED2 */ static void -icmp_input(void *arg1, mblk_t *mp, void *arg2) +icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - conn_t *connp = (conn_t *)arg1; + conn_t *connp = (conn_t *)arg1; struct T_unitdata_ind *tudi; - uchar_t *rptr; + uchar_t *rptr; /* Pointer to IP header */ + int ip_hdr_length; + int udi_size; /* Size of T_unitdata_ind */ + int pkt_len; icmp_t *icmp; + ip_pkt_t ipps; + ip6_t *ip6h; + mblk_t *mp1; + crb_t recv_ancillary; icmp_stack_t *is; sin_t *sin; sin6_t *sin6; - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *mp1; - int hdr_len; ipha_t *ipha; - int udi_size; /* Size of T_unitdata_ind */ - uint_t ipvers; - ip6_pkt_t ipp; - uint8_t nexthdr; - ip_pktinfo_t *pinfo = NULL; - mblk_t *options_mp = NULL; - uint_t icmp_opt = 0; - boolean_t icmp_ipv6_recvhoplimit = B_FALSE; - uint_t hopstrip; ASSERT(connp->conn_flags & IPCL_RAWIPCONN); icmp = connp->conn_icmp; is = icmp->icmp_is; rptr = mp->b_rptr; - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); + + ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(OK_32PTR(rptr)); + ASSERT(ira->ira_pktlen == msgdsize(mp)); + pkt_len = ira->ira_pktlen; /* - * IP should have prepended the options data in an M_CTL - * Check M_CTL "type" to make sure are not here bcos of - * a valid ICMP message + * Get a snapshot of these and allow other threads to change + * them after that. We need the same recv_ancillary when determining + * the size as when adding the ancillary data items. */ - if (DB_TYPE(mp) == M_CTL) { - /* - * FIXME: does IP still do this? - * IP sends up the IPSEC_IN message for handling IPSEC - * policy at the TCP level. We don't need it here. - */ - if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } else if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - /* - * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information - * has been prepended to the packet by IP. 
We need to - * extract the mblk and adjust the rptr - */ - pinfo = (ip_pktinfo_t *)mp->b_rptr; - options_mp = mp; - mp = mp->b_cont; - rptr = mp->b_rptr; - } else { - /* - * ICMP messages. - */ - icmp_icmp_error(connp, mp); - return; - } - } + mutex_enter(&connp->conn_lock); + recv_ancillary = connp->conn_recv_ancillary; + mutex_exit(&connp->conn_lock); - /* - * Discard message if it is misaligned or smaller than the IP header. - */ - if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) { - freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); - BUMP_MIB(&is->is_rawip_mib, rawipInErrors); - return; - } - ipvers = IPH_HDR_VERSION((ipha_t *)rptr); + ip_hdr_length = ira->ira_ip_hdr_length; + ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ + + /* Initialize regardless of IP version */ + ipps.ipp_fields = 0; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); + ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); + + ipha = (ipha_t *)mp->b_rptr; + if (recv_ancillary.crb_all != 0) + (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); - /* Handle M_DATA messages containing IP packets messages */ - if (ipvers == IPV4_VERSION) { /* - * Special case where IP attaches - * the IRE needs to be handled so that we don't send up - * IRE to the user land. + * BSD for some reason adjusts ipha_length to exclude the + * IP header length. We do the same. 
*/ - ipha = (ipha_t *)rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - - if (ipha->ipha_protocol == IPPROTO_TCP) { - tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - - if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == - TH_SYN) && mp->b_cont != NULL) { - mp1 = mp->b_cont; - if (mp1->b_datap->db_type == IRE_DB_TYPE) { - freeb(mp1); - mp->b_cont = NULL; - } - } - } if (is->is_bsd_compat) { ushort_t len; - len = ntohs(ipha->ipha_length); + len = ntohs(ipha->ipha_length); if (mp->b_datap->db_ref > 1) { /* * Allocate a new IP header so that we can @@ -3525,70 +2616,58 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) */ mblk_t *mp1; - mp1 = allocb(hdr_len, BPRI_MED); - if (!mp1) { + mp1 = allocb(ip_hdr_length, BPRI_MED); + if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&is->is_rawip_mib, rawipInErrors); return; } - bcopy(rptr, mp1->b_rptr, hdr_len); - mp->b_rptr = rptr + hdr_len; + bcopy(rptr, mp1->b_rptr, ip_hdr_length); + mp->b_rptr = rptr + ip_hdr_length; rptr = mp1->b_rptr; ipha = (ipha_t *)rptr; mp1->b_cont = mp; - mp1->b_wptr = rptr + hdr_len; + mp1->b_wptr = rptr + ip_hdr_length; mp = mp1; } - len -= hdr_len; + len -= ip_hdr_length; ipha->ipha_length = htons(len); } - } - /* - * This is the inbound data path. Packets are passed upstream as - * T_UNITDATA_IND messages with full IP headers still attached. - */ - if (icmp->icmp_family == AF_INET) { - ASSERT(ipvers == IPV4_VERSION); - udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (icmp->icmp_recvif && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (uint_t); - } + /* + * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 + * sockets. This is ensured by icmp_bind and the IP fanout code. 
+ */ + ASSERT(connp->conn_family == AF_INET); - if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_pktinfo); - } + /* + * This is the inbound data path. Packets are passed upstream + * as T_UNITDATA_IND messages with full IPv4 headers still + * attached. + */ /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). + * Normally only send up the source address. + * If any ancillary data items are wanted we add those. */ - if (icmp->icmp_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; + udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } + + /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&is->is_rawip_mib, rawipInErrors); return; } mp1->b_cont = mp; - mp = mp1; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_datap->db_type = M_PROTO; - mp->b_wptr = (uchar_t *)tudi + udi_size; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_datap->db_type = M_PROTO; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3596,316 +2675,110 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) *sin = sin_null; sin->sin_family = AF_INET; sin->sin_addr.s_addr = ipha->ipha_src; + *(uint32_t *)&sin->sin_zero[0] = 0; + *(uint32_t *)&sin->sin_zero[4] = 0; tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t); udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); tudi->OPT_length = udi_size; /* - * Add options if IP_RECVIF is set + * Add options if IP_RECVIF etc is set */ if (udi_size != 0) { - char *dstopt; - - dstopt = (char *)&sin[1]; - if (icmp->icmp_recvif && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - - struct T_opthdr *toh; - uint_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVIF; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint_t *)dstopt; - *dstptr = pinfo->ip_pkt_ifindex; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (char *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - 
dstopt = (char *)toh + toh->len; - udi_size -= toh->len; - } - if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - struct T_opthdr *toh; - struct in_pktinfo *pktinfop; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (in_pktinfo_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pktinfop = (struct in_pktinfo *)dstopt; - pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex; - pktinfop->ipi_spec_dst = - pinfo->ip_pkt_match_addr; - - pktinfop->ipi_addr.s_addr = ipha->ipha_dst; - - dstopt += sizeof (struct in_pktinfo); - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin[1], udi_size); } - - if (options_mp != NULL) - freeb(options_mp); - - BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); goto deliver; } + ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); /* - * We don't need options_mp in the IPv6 path. + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. */ - if (options_mp != NULL) { - freeb(options_mp); - options_mp = NULL; - } + ASSERT(connp->conn_family == AF_INET6); /* - * Discard message if it is smaller than the IPv6 header - * or if the header is malformed. + * Handle IPv6 packets. We don't pass up the IP headers with the + * payload for IPv6. */ - if ((mp->b_wptr - rptr) < sizeof (ip6_t) || - IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION || - icmp->icmp_family != AF_INET6) { - freemsg(mp); - BUMP_MIB(&is->is_rawip_mib, rawipInErrors); - return; - } - - /* Initialize */ - ipp.ipp_fields = 0; - hopstrip = 0; ip6h = (ip6_t *)rptr; - /* - * Call on ip_find_hdr_v6 which gets the total hdr len - * as well as individual lenghts of ext hdrs (and ptrs to - * them). 
- */ - if (ip6h->ip6_nxt != icmp->icmp_proto) { - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i = (ip6i_t *)ip6h; - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN); - ip6h = (ip6_t *)rptr; - } - hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr); + if (recv_ancillary.crb_all != 0) { + /* + * Call on ip_find_hdr_v6 which gets individual lenghts of + * extension headers (and pointers to them). + */ + uint8_t nexthdr; + + /* We don't care about the length or nextheader. */ + (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); /* - * We need to lie a bit to the user because users inside - * labeled compartments should not see their own labels. We - * assume that in all other respects IP has checked the label, - * and that the label is always first among the options. (If - * it's not first, then this code won't see it, and the option - * will be passed along to the user.) + * We do not pass up hop-by-hop options or any other + * extension header as part of the packet. Applications + * that want to see them have to specify IPV6_RECV* socket + * options. And conn_recvancillary_size/add explicitly + * drops the TX option from IPV6_HOPOPTS as it does for UDP. * - * If we had multilevel ICMP sockets, then the following code - * should be skipped for them to allow the user to see the - * label. - * - * Alignment restrictions in the definition of IP options - * (namely, the requirement that the 4-octet DOI goes on a - * 4-octet boundary) mean that we know exactly where the option - * should start, but we're lenient for other hosts. 
- * - * Note that there are no multilevel ICMP or raw IP sockets - * yet, thus nobody ever sees the IP6OPT_LS option. + * If we had multilevel ICMP sockets, then we'd want to + * modify conn_recvancillary_size/add to + * allow the user to see the label. */ - if ((ipp.ipp_fields & IPPF_HOPOPTS) && - ipp.ipp_hopoptslen > 5 && is_system_labeled()) { - const uchar_t *ucp = - (const uchar_t *)ipp.ipp_hopopts + 2; - int remlen = ipp.ipp_hopoptslen - 2; - - while (remlen > 0) { - if (*ucp == IP6OPT_PAD1) { - remlen--; - ucp++; - } else if (*ucp == IP6OPT_PADN) { - remlen -= ucp[1] + 2; - ucp += ucp[1] + 2; - } else if (*ucp == ip6opt_ls) { - hopstrip = (ucp - - (const uchar_t *)ipp.ipp_hopopts) + - ucp[1] + 2; - hopstrip = (hopstrip + 7) & ~7; - break; - } else { - /* label option must be first */ - break; - } - } - } - } else { - hdr_len = IPV6_HDR_LEN; - ip6i = NULL; - nexthdr = ip6h->ip6_nxt; - } - /* - * One special case where IP attaches the IRE needs to - * be handled so that we don't send up IRE to the user land. - */ - if (nexthdr == IPPROTO_TCP) { - tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - - if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) && - mp->b_cont != NULL) { - mp1 = mp->b_cont; - if (mp1->b_datap->db_type == IRE_DB_TYPE) { - freeb(mp1); - mp->b_cont = NULL; - } - } } + /* * Check a filter for ICMPv6 types if needed. * Verify raw checksums if needed. 
*/ - if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) { - if (icmp->icmp_filter != NULL) { - int type; + mutex_enter(&connp->conn_lock); + if (icmp->icmp_filter != NULL) { + int type; - /* Assumes that IP has done the pullupmsg */ - type = mp->b_rptr[hdr_len]; + /* Assumes that IP has done the pullupmsg */ + type = mp->b_rptr[ip_hdr_length]; - ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr); - if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { - freemsg(mp); - return; - } - } else { - /* Checksum */ - uint16_t *up; - uint32_t sum; - int remlen; - - up = (uint16_t *)&ip6h->ip6_src; - - remlen = msgdsize(mp) - hdr_len; - sum = htons(icmp->icmp_proto + remlen) - + up[0] + up[1] + up[2] + up[3] - + up[4] + up[5] + up[6] + up[7] - + up[8] + up[9] + up[10] + up[11] - + up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - sum = IP_CSUM(mp, hdr_len, sum); - if (sum != 0) { - /* IPv6 RAW checksum failed */ - ip0dbg(("icmp_rput: RAW checksum " - "failed %x\n", sum)); - freemsg(mp); - BUMP_MIB(&is->is_rawip_mib, - rawipInCksumErrs); - return; - } + ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); + if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { + mutex_exit(&connp->conn_lock); + freemsg(mp); + return; } } - /* Skip all the IPv6 headers per API */ - mp->b_rptr += hdr_len; - - udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - - /* - * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to - * maintain state information, instead of relying on icmp_t - * structure, since there arent any locks protecting these members - * and there is a window where there might be a race between a - * thread setting options on the write side and a thread reading - * these options on the read size. 
- */ - if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS| - IPPF_RTHDR|IPPF_IFINDEX)) { - if (icmp->icmp_ipv6_recvhopopts && - (ipp.ipp_fields & IPPF_HOPOPTS) && - ipp.ipp_hopoptslen > hopstrip) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_hopoptslen - hopstrip; - icmp_opt |= IPPF_HOPOPTS; - } - if ((icmp->icmp_ipv6_recvdstopts || - icmp->icmp_old_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - icmp_opt |= IPPF_DSTOPTS; - } - if (((icmp->icmp_ipv6_recvdstopts && - icmp->icmp_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) || - icmp->icmp_ipv6_recvrtdstopts) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - icmp_opt |= IPPF_RTDSTOPTS; - } - if (icmp->icmp_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - icmp_opt |= IPPF_RTHDR; - } - if (icmp->icmp_ip_recvpktinfo && - (ipp.ipp_fields & IPPF_IFINDEX)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in6_pktinfo); - icmp_opt |= IPPF_IFINDEX; + if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* Checksum */ + uint16_t *up; + uint32_t sum; + int remlen; + + up = (uint16_t *)&ip6h->ip6_src; + + remlen = msgdsize(mp) - ip_hdr_length; + sum = htons(connp->conn_proto + remlen) + + up[0] + up[1] + up[2] + up[3] + + up[4] + up[5] + up[6] + up[7] + + up[8] + up[9] + up[10] + up[11] + + up[12] + up[13] + up[14] + up[15]; + sum = (sum & 0xffff) + (sum >> 16); + sum = IP_CSUM(mp, ip_hdr_length, sum); + if (sum != 0) { + /* IPv6 RAW checksum failed */ + ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); + mutex_exit(&connp->conn_lock); + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); + return; } } - if (icmp->icmp_ipv6_recvhoplimit) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - icmp_ipv6_recvhoplimit = B_TRUE; - } + mutex_exit(&connp->conn_lock); - if 
(icmp->icmp_ipv6_recvtclass) - udi_size += sizeof (struct T_opthdr) + sizeof (int); + udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (icmp->icmp_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } mp1 = allocb(udi_size, BPRI_MED); @@ -3915,10 +2788,9 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin6_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3926,166 +2798,38 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); tudi->OPT_length = udi_size; sin6 = (sin6_t *)&tudi[1]; + *sin6 = sin6_null; sin6->sin6_port = 0; sin6->sin6_family = AF_INET6; sin6->sin6_addr = ip6h->ip6_src; /* No sin6_flowinfo per API */ sin6->sin6_flowinfo = 0; - /* For link-scope source pass up scope id */ - if ((ipp.ipp_fields & IPPF_IFINDEX) && - IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - sin6->sin6_scope_id = ipp.ipp_ifindex; + /* For link-scope pass up scope id */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) + sin6->sin6_scope_id = ira->ira_ruifindex; else sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - icmp->icmp_zoneid, is->is_netstack); + IPCL_ZONEID(connp), is->is_netstack); if (udi_size != 0) { - uchar_t *dstopt; - - dstopt = 
(uchar_t *)&sin6[1]; - if (icmp_opt & IPPF_IFINDEX) { - struct T_opthdr *toh; - struct in6_pktinfo *pkti; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pkti); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pkti = (struct in6_pktinfo *)dstopt; - pkti->ipi6_addr = ip6h->ip6_dst; - pkti->ipi6_ifindex = ipp.ipp_ifindex; - dstopt += sizeof (*pkti); - udi_size -= toh->len; - } - if (icmp_ipv6_recvhoplimit) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPLIMIT; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - *(uint_t *)dstopt = ip6h->ip6_hops; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_ipv6_recvtclass) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_TCLASS; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - *(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow); - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (uchar_t *)toh + toh->len; - udi_size -= toh->len; - } - - if (icmp_opt & IPPF_HOPOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_hopoptslen - hopstrip; - toh->status = 0; - dstopt += sizeof (struct 
T_opthdr); - bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt, - ipp.ipp_hopoptslen - hopstrip); - if (hopstrip > 0) { - /* copy next header value and fake length */ - dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0]; - dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] - - hopstrip / 8; - } - dstopt += ipp.ipp_hopoptslen - hopstrip; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_RTDSTOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rtdstopts, dstopt, - ipp.ipp_rtdstoptslen); - dstopt += ipp.ipp_rtdstoptslen; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_RTHDR) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDR; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen); - dstopt += ipp.ipp_rthdrlen; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_DSTOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_dstopts, dstopt, - ipp.ipp_dstoptslen); - dstopt += ipp.ipp_dstoptslen; - udi_size -= toh->len; - } - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin6[1], udi_size); } - BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); -deliver: - icmp_ulp_recv(connp, mp); + /* Skip all the IPv6 headers per API */ + mp->b_rptr += ip_hdr_length; + pkt_len -= ip_hdr_length; +deliver: + BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); + icmp_ulp_recv(connp, mp1, pkt_len); } /* - * return SNMP stuff in buffer in mpdata 
+ * return SNMP stuff in buffer in mpdata. We don't hold any lock and report + * information that can be changing beneath us. */ mblk_t * icmp_snmp_get(queue_t *q, mblk_t *mpctl) @@ -4146,51 +2890,70 @@ icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) { + struct T_unitdata_req *tudr; mblk_t *mp1; - uchar_t *rptr = mp->b_rptr; - struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr; + uchar_t *destaddr; + t_scalar_t destlen; + uchar_t *optaddr; + t_scalar_t optlen; + + if ((mp->b_wptr < mp->b_rptr) || + (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { + goto done; + } + tudr = (struct T_unitdata_req *)mp->b_rptr; + destaddr = mp->b_rptr + tudr->DEST_offset; + if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || + destaddr + tudr->DEST_length < mp->b_rptr || + destaddr + tudr->DEST_length > mp->b_wptr) { + goto done; + } + optaddr = mp->b_rptr + tudr->OPT_offset; + if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || + optaddr + tudr->OPT_length < mp->b_rptr || + optaddr + tudr->OPT_length > mp->b_wptr) { + goto done; + } + destlen = tudr->DEST_length; + optlen = tudr->OPT_length; - mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset], - tudr->DEST_length, (char *)&rptr[tudr->OPT_offset], - tudr->OPT_length, err); - if (mp1) + mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, + (char *)optaddr, optlen, err); + if (mp1 != NULL) qreply(q, mp1); + +done: freemsg(mp); } - static int rawip_do_unbind(conn_t *connp) { - icmp_t *icmp = connp->conn_icmp; + icmp_t *icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); /* If a bind has not been done, we can't unbind. 
*/ - if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); + if (icmp->icmp_state == TS_UNBND) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - icmp->icmp_pending_op = T_UNBIND_REQ; - rw_exit(&icmp->icmp_rwlock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_FALSE; + connp->conn_lport = 0; + connp->conn_fport = 0; + /* In case we were also connected */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; - /* - * Call ip to unbind - */ + icmp->icmp_state = TS_UNBND; - ip_unbind(connp); + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); - /* - * Once we're unbound from IP, the pending operation may be cleared - * here. - */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - V6_SET_ZERO(icmp->icmp_v6src); - V6_SET_ZERO(icmp->icmp_bound_v6src); - icmp->icmp_pending_op = -1; - icmp->icmp_state = TS_UNBND; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - rw_exit(&icmp->icmp_rwlock); + ip_unbind(connp); return (0); } @@ -4230,42 +2993,86 @@ icmp_tpi_unbind(queue_t *q, mblk_t *mp) qreply(q, mp); } - /* * Process IPv4 packets that already include an IP header. * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and * IPPROTO_IGMP). + * In this case we ignore the address and any options in the T_UNITDATA_REQ. + * + * The packet is assumed to have a base (20 byte) IP header followed + * by the upper-layer protocol. We include any IP_OPTIONS including a + * CIPSO label but otherwise preserve the base IP header. 
*/ static int -icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp, - ip4_pkt_t *pktinfop) +icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) { - icmp_stack_t *is = icmp->icmp_is; - ipha_t *ipha; - int ip_hdr_length; - int tp_hdr_len; - int error; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - mblk_t *mp1; - uint_t pkt_len; - ip_opt_info_t optinfo; - pid_t cpid; - cred_t *cr; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + ipha_t iphas; + ipha_t *ipha; + int ip_hdr_length; + int tp_hdr_len; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + int error; + boolean_t do_ipsec; - rw_enter(&icmp->icmp_rwlock, RW_READER); + /* + * We need an exclusive copy of conn_ixa since the included IP + * header could have any destination. + * That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. + */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } + ASSERT(cr != NULL); + /* + * Caller has a reference on cr; from db_credp or because we + * are running in process context. 
+ */ + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + /* Get a copy of conn_xmit_ipp since the TX label might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + goto done; + } + + /* Sanity check length of packet */ ipha = (ipha_t *)mp->b_rptr; - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len; + + ip_hdr_length = IP_SIMPLE_HDR_LENGTH; if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { - ASSERT(icmp != NULL); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); + goto done; } ipha = (ipha_t *)mp->b_rptr; } @@ -4273,1285 +3080,1541 @@ icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp, (IP_VERSION<<4) | (ip_hdr_length>>2); /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. 
- */ - mutex_enter(&connp->conn_lock); - if (is_system_labeled()) { - /* - * Recompute the Trusted Extensions security label if - * we're not going to the same destination as last - * time or the cred attached to the received mblk - * changed. - */ - cr = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || - V4_PART_OF_V6(icmp->icmp_v6lastdst) != ipha->ipha_dst || - cr != icmp->icmp_last_cred) { - error = icmp_update_label(icmp, mp, ipha->ipha_dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } - /* - * Apply credentials with modified security label if they - * exist. icmp_update_label() may have generated these - * credentials for packets to unlabeled remote nodes. - */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); - } - - if (icmp->icmp_ip_snd_options_len > 0) { - ip_snd_opt_len = icmp->icmp_ip_snd_options_len; - bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); - } - mutex_exit(&connp->conn_lock); - - /* - * For the socket of SOCK_RAW type, the checksum is provided in the - * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to - * tell IP that the application has sent a complete IP header and not - * to compute the transport checksum nor change the DF flag. + * We set IXAF_DONTFRAG if the application set DF which makes + * IP not fragment. */ - ipha->ipha_ident = IP_HDR_INCLUDED; - ipha->ipha_hdr_checksum = 0; ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); - /* Insert options if any */ - if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) { - /* - * Put the IP header plus any transport header that is - * checksumed by ip_wput into the first mblk. (ip_wput assumes - * that at least the checksum field is in the first mblk.) 
- */ - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - tp_hdr_len = 8; - break; - case IPPROTO_TCP: - tp_hdr_len = 20; - break; - default: - tp_hdr_len = 0; - break; - } - /* - * The code below assumes that IP_SIMPLE_HDR_LENGTH plus - * tp_hdr_len bytes will be in a single mblk. - */ - if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH + - tp_hdr_len)) { - if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH + - tp_hdr_len)) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * if the length is larger then the max allowed IP packet, - * then send an error and abort the processing. - */ - pkt_len = ntohs(ipha->ipha_length) - + ip_snd_opt_len; - if (pkt_len > IP_MAXPACKET) { - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); - } - if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra + - tp_hdr_len, BPRI_LO))) { - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - mp1->b_rptr += is->is_wroff_extra; - mp1->b_wptr = mp1->b_rptr + ip_hdr_length; + if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) + ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + else + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); - ipha->ipha_length = htons((uint16_t)pkt_len); - bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH); + /* Even for multicast and broadcast we honor the apps ttl */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; - /* Copy transport header if any */ - bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len); - mp1->b_wptr += tp_hdr_len; + if (ipha->ipha_dst == INADDR_ANY) + ipha->ipha_dst = htonl(INADDR_LOOPBACK); - /* Add options */ - ipha = (ipha_t *)mp1->b_rptr; - bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); - /* Drop IP header and transport header from original */ - (void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len); + /* Defer IPsec if it might need to look at ICMP 
type/code */ + do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; + ixa->ixa_flags |= IXAF_IS_IPV4; - mp1->b_cont = mp; - mp = mp1; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, + connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * Massage source route putting first source - * route in ipha_dst. + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - (void) ip_massage_options(ipha, is->is_netstack); - } - - if (pktinfop != NULL) { + error = ENETUNREACH; + goto failed; + case ENETDOWN: /* - * Over write the source address provided in the header + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. */ - if (pktinfop->ip4_addr != INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; } + /* FALLTHRU */ + default: + failed: + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + goto done; } - - rw_exit(&icmp->icmp_rwlock); - - ip_output_options(connp, mp, q, IP_WPUT, &optinfo); - return (0); -} - -static int -icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst) -{ - int err; - uchar_t opt_storage[IP_MAX_OPT_LENGTH]; - icmp_stack_t *is = icmp->icmp_is; - conn_t *connp = icmp->icmp_connp; - cred_t *cred; - cred_t *msg_cred; - cred_t *effective_cred; + if (ipha->ipha_src == INADDR_ANY) + IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ - cred = msg_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + if (is_system_labeled()) { + /* + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. + */ + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + goto done; + } + } /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred for this message - * with a modified label or label flags. 
+ * Save away a copy of the IPv4 header the application passed down + * and then prepend an IPv4 header complete with any IP options + * including label. + * We need a struct copy since icmp_prepend_hdr will reuse the available + * space in the mblk. */ - if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + iphas = *ipha; + mp->b_rptr += IP_SIMPLE_HDR_LENGTH; - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label(cred, dst, opt_storage, - is->is_netstack->netstack_ip)) != 0) + mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ASSERT(error != 0); goto done; - - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_options(&icmp->icmp_ip_snd_options, - &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len, - opt_storage)) != 0) + } + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); goto done; + } + /* Restore key parts of the header that the application passed down */ + ipha = (ipha_t *)mp->b_rptr; + ipha->ipha_type_of_service = iphas.ipha_type_of_service; + ipha->ipha_ident = iphas.ipha_ident; + ipha->ipha_fragment_offset_and_flags = + iphas.ipha_fragment_offset_and_flags; + ipha->ipha_ttl = iphas.ipha_ttl; + ipha->ipha_protocol = iphas.ipha_protocol; + ipha->ipha_src = iphas.ipha_src; + ipha->ipha_dst = iphas.ipha_dst; + + ixa->ixa_protocol = ipha->ipha_protocol; /* - * Save the destination address and cred we used to generate - * the security label text. + * Make sure that the IP header plus any transport header that is + * checksumed by ip_output is in the first mblk. 
(ip_output assumes + * that at least the checksum field is in the first mblk.) */ - IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst); - if (cred != icmp->icmp_effective_cred) { - if (icmp->icmp_effective_cred != NULL) - crfree(icmp->icmp_effective_cred); - crhold(cred); - icmp->icmp_effective_cred = cred; + switch (ipha->ipha_protocol) { + case IPPROTO_UDP: + tp_hdr_len = 8; + break; + case IPPROTO_TCP: + tp_hdr_len = 20; + break; + default: + tp_hdr_len = 0; + break; + } + ip_hdr_length = IPH_HDR_LENGTH(ipha); + if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { + if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + if (mp->b_cont == NULL) + error = EINVAL; + else + error = ENOMEM; + freemsg(mp); + goto done; + } } - if (msg_cred != icmp->icmp_last_cred) { - if (icmp->icmp_last_cred != NULL) - crfree(icmp->icmp_last_cred); - crhold(msg_cred); - icmp->icmp_last_cred = msg_cred; + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } + mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; + } } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno + */ + error = ENETUNREACH; + break; + } done: - if (effective_cred != NULL) - crfree(effective_cred); + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); +} - if (err != 0) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__icmp, - char *, "icmp(1) failed to update options(2) on mp(3)", - icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); - return (err); +static mblk_t * +icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) +{ + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + + if (ixa->ixa_flags & IXAF_IS_IPV4) + ipha = (ipha_t *)mp->b_rptr; + else + ip6h = (ip6_t *)mp->b_rptr; + + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } - return (0); + return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); } /* - * This routine handles all messages passed downstream. It either - * consumes the message or passes it downstream; it never queues a - * a message. + * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 + * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from + * the TPI options, otherwise we take them from msg_control. + * If both sin and sin6 is set it is a connected socket and we use conn_faddr. + * Always consumes mp; never consumes tudr_mp. 
*/ -static void -icmp_wput(queue_t *q, mblk_t *mp) +static int +icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, + mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) { - uchar_t *rptr = mp->b_rptr; - mblk_t *mp1; -#define tudr ((struct T_unitdata_req *)rptr) - size_t ip_len; - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - sin6_t *sin6; - sin_t *sin; - ipaddr_t v4dst; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - int error; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + uint32_t flowinfo; + uint_t srcid; + int is_absreq_failure = 0; + conn_opt_arg_t coas, *coa; - ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; + ASSERT(tudr_mp != NULL || msg != NULL); - switch (mp->b_datap->db_type) { - case M_DATA: - if (icmp->icmp_hdrincl) { - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL); - if (error != 0) - icmp_ud_err(q, mp, error); - return; - } + /* + * Get ixa before checking state to handle a disconnect race. + * + * We need an exclusive copy of conn_ixa since the ancillary data + * options might modify it. That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. 
+ */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - return; - case M_PROTO: - case M_PCPROTO: - ip_len = mp->b_wptr - rptr; - if (ip_len >= sizeof (struct T_unitdata_req)) { - /* Expedite valid T_UNITDATA_REQ to below the switch */ - if (((union T_primitives *)rptr)->type - == T_UNITDATA_REQ) - break; - } - /* FALLTHRU */ - default: - icmp_wput_other(q, mp); - return; + return (ENOMEM); + } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - /* Handle T_UNITDATA_REQ messages here. */ + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - mp1 = mp->b_cont; - if (mp1 == NULL) { + /* Get a copy of conn_xmit_ipp since the options might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EPROTO); - return; + freemsg(mp); + return (ENOMEM); } - - if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) { + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; + freemsg(mp); + goto done; } - switch (icmp->icmp_family) { - case AF_INET6: - sin6 = (sin6_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin6) || - tudr->DEST_length != sizeof (sin6_t) || - sin6->sin6_family != AF_INET6) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } + /* + * Parse the options and update ixa and ipp as a result. 
+ */ - /* No support for mapped addresses on raw sockets */ - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } + coa = &coas; + coa->coa_connp = connp; + coa->coa_ixa = ixa; + coa->coa_ipp = ipp; + coa->coa_ancillary = B_TRUE; + coa->coa_changed = 0; + if (msg != NULL) { + error = process_auxiliary_options(connp, msg->msg_control, + msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); + } else { + struct T_unitdata_req *tudr; + + tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; + ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); + error = tpi_optcom_buf(connp->conn_wq, tudr_mp, + &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, + coa, &is_absreq_failure); + } + if (error != 0) { /* - * Destination is a native IPv6 address. - * Send out an IPv6 format packet. + * Note: No special action needed in this + * module for "is_absreq_failure" */ - if (tudr->OPT_length != 0) { - int error; - - error = 0; - if (icmp_unitdata_opt_process(q, mp, &error, - (void *)ipp) < 0) { - /* failure */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, error); - return; - } - ASSERT(error == 0); - } - - error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp); + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); goto done; - - case AF_INET: - sin = (sin_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin) || - tudr->DEST_length != sizeof (sin_t) || - sin->sin_family != AF_INET) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } - /* Extract and ipaddr */ - v4dst = sin->sin_addr.s_addr; - break; - - default: - ASSERT(0); } + ASSERT(is_absreq_failure == 0); - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; - + mutex_enter(&connp->conn_lock); /* - * If options passed in, feed it for verification and handling + * If laddr is unspecified then we look at sin6_src_id. 
+ * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (tudr->OPT_length != 0) { - int error; - - error = 0; - if (icmp_unitdata_opt_process(q, mp, &error, - (void *)pktinfop) < 0) { - /* failure */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, error); - return; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else if (sin6 != NULL) { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - ASSERT(error == 0); - /* - * Note: Success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - } - - error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop); -done: - if (error != 0) { - icmp_ud_err(q, mp, error); - return; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; } else { - mp->b_cont = NULL; - freeb(mp); + /* Connected case */ + v6dst = connp->conn_faddr_v6; + flowinfo = connp->conn_flowinfo; + } + mutex_exit(&connp->conn_lock); + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (ipp->ipp_fields & IPPF_ADDR)) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } } -} - - -/* ARGSUSED */ -static void -icmp_wput_fallback(queue_t *q, mblk_t *mp) -{ -#ifdef DEBUG - cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); -#endif - freemsg(mp); -} - -static int -raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst, - ip4_pkt_t *pktinfop) -{ - ipha_t *ipha; - size_t ip_len; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int ip_hdr_length; - ip_opt_info_t optinfo; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - pid_t cpid; - cred_t *cr; - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); - if (icmp->icmp_state == TS_UNBND) { - /* If a port has not been bound to the stream, fail. */ + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + freemsg(mp); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - return (EPROTO); + goto done; } - if (v4dst == INADDR_ANY) - v4dst = htonl(INADDR_LOOPBACK); - - /* Protocol 255 contains full IP headers */ - if (icmp->icmp_hdrincl) - return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop)); - - rw_enter(&icmp->icmp_rwlock, RW_READER); - /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ - mutex_enter(&connp->conn_lock); if (is_system_labeled()) { - - /* - * Recompute the Trusted Extensions security label if we're not - * going to the same destination as last time or the cred - * attached to the received mblk changed. - */ - cr = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || - V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst || - cr != icmp->icmp_last_cred) { - int error = icmp_update_label(icmp, mp, v4dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } /* - * Apply credentials with modified security label if they - * exist. icmp_update_label() may have generated these - * credentials for packets to unlabeled remote nodes. 
+ * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); - } - - if (icmp->icmp_ip_snd_options_len > 0) { - ip_snd_opt_len = icmp->icmp_ip_snd_options_len; - bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); - } - mutex_exit(&connp->conn_lock); - - /* Add an IP header */ - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + ip_snd_opt_len; - ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length]; - if ((uchar_t *)ipha < mp->b_datap->db_base || - mp->b_datap->db_ref != 1 || - !OK_32PTR(ipha)) { - mblk_t *mp1; - if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra, - BPRI_LO))) { + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); + goto done; } - mp1->b_cont = mp; - ipha = (ipha_t *)mp1->b_datap->db_lim; - mp1->b_wptr = (uchar_t *)ipha; - ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length); - mp = mp1; } -#ifdef _BIG_ENDIAN - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) | - icmp->icmp_type_of_service); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto; -#else - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((icmp->icmp_type_of_service << 8) | - ((IP_VERSION << 4) | (ip_hdr_length>>2))); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl; -#endif - if (pktinfop->ip4_addr 
!= INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } else { - - /* - * Copy our address into the packet. If this is zero, - * ip will fill in the real source address. - */ - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src); + mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, + &error); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ASSERT(error != 0); + goto done; } - - ipha->ipha_fragment_offset_and_flags = 0; - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + goto done; } - - /* - * For the socket of SOCK_RAW type, the checksum is provided in the - * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to - * tell IP that the application has sent a complete IP header and not - * to compute the transport checksum nor change the DF flag. - */ - ipha->ipha_ident = IP_HDR_INCLUDED; - - /* Finish common formatting of the packet. */ - mp->b_rptr = (uchar_t *)ipha; - - ip_len = mp->b_wptr - (uchar_t *)ipha; - if (mp->b_cont != NULL) - ip_len += msgdsize(mp->b_cont); - - /* - * Set the length into the IP header. - * If the length is greater than the maximum allowed by IP, - * then free the message and return. Do not try and send it - * as this can cause problems in layers below. - */ - if (ip_len > IP_MAXPACKET) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; } - ipha->ipha_length = htons((uint16_t)ip_len); - /* - * Copy in the destination address request - */ - ipha->ipha_dst = v4dst; - /* - * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic. 
- */ - if (CLASSD(v4dst)) - ipha->ipha_ttl = icmp->icmp_multicast_ttl; + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - /* Copy in options if any */ - if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) { - bcopy(ip_snd_opt, - &ipha[1], ip_snd_opt_len); + /* Allow source not assigned to the system? */ + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + error = conn_ip_output(mp, ixa); + if (!connp->conn_unspec_src) + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * Massage source route putting first source route in ipha_dst. - * Ignore the destination in the T_unitdata_req. + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - (void) ip_massage_options(ipha, is->is_netstack); + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } - - rw_exit(&icmp->icmp_rwlock); - BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - - ip_output_options(connp, mp, q, IP_WPUT, &optinfo); - return (0); +done: + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); } -static int -icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst) +/* + * Handle sending an M_DATA for a connected socket. + * Handles both IPv4 and IPv6. 
+ */ +int +icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) { - int err; - uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; - icmp_stack_t *is = icmp->icmp_is; - conn_t *connp = icmp->icmp_connp; - cred_t *cred; - cred_t *msg_cred; - cred_t *effective_cred; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *ixa; + boolean_t do_ipsec; /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. + * If no other thread is using conn_ixa this just gets a reference to + * conn_ixa. Otherwise we get a safe copy of conn_ixa. */ - cred = msg_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } - /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred for this message - * with a modified label or label flags. - */ - if ((err = tsol_check_dest(cred, dst, IPV6_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label_v6(cred, dst, opt_storage, - is->is_netstack->netstack_ip)) != 0) - goto done; + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; + } - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. 
- */ - if ((err = tsol_update_sticky(&icmp->icmp_sticky_ipp, - &icmp->icmp_label_len_v6, opt_storage)) != 0) - goto done; + mutex_enter(&connp->conn_lock); + mp = icmp_prepend_header_template(connp, ixa, mp, + &connp->conn_saddr_v6, connp->conn_flowinfo, &error); + + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); + } + + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ixa_refrele(ixa); + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } /* - * Save the destination address and cred we used to generate - * the security label text. + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. */ - icmp->icmp_v6lastdst = *dst; - if (cred != icmp->icmp_effective_cred) { - if (icmp->icmp_effective_cred != NULL) - crfree(icmp->icmp_effective_cred); - crhold(cred); - icmp->icmp_effective_cred = cred; - } + if (ixa->ixa_ire == NULL) { + in6_addr_t faddr, saddr; + in6_addr_t nexthop; + in_port_t fport; + + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); + mutex_exit(&connp->conn_lock); - if (msg_cred != icmp->icmp_last_cred) { - if (icmp->icmp_last_cred != NULL) - crfree(icmp->icmp_last_cred); - crhold(msg_cred); - icmp->icmp_last_cred = msg_cred; + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, + fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); + } + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); } -done: - if (effective_cred != NULL) - crfree(effective_cred); + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - if (err != 0) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__icmp6, - char *, "icmp(1) failed to update options(2) on mp(3)", - icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); - return (err); + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + break; } - return (0); + ixa_refrele(ixa); + return (error); } /* - * raw_ip_send_data_v6(): - * Assumes that icmp_wput did some sanity checking on the destination - * address, but that the label may not yet be correct. + * Handle sending an M_DATA to the last destination. + * Handles both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
*/ -static int -raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6, - ip6_pkt_t *ipp) +int +icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, + ip_xmit_attr_t *ixa) { - ip6_t *ip6h; - ip6i_t *ip6i; /* mp->b_rptr even if no ip6i_t */ - int ip_hdr_len = IPV6_HDR_LEN; - size_t ip_len; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - ip6_pkt_t *tipp; - ip6_hbh_t *hopoptsptr = NULL; - uint_t hopoptslen = 0; - uint32_t csum = 0; - uint_t ignore = 0; - uint_t option_exists = 0, is_sticky = 0; - uint8_t *cp; - uint8_t *nxthdr_ptr; - in6_addr_t ip6_dst; - pid_t cpid; - cred_t *cr; - - rw_enter(&icmp->icmp_rwlock, RW_READER); + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + boolean_t do_ipsec; - /* - * If the local address is a mapped address return - * an error. - * It would be possible to send an IPv6 packet but the - * response would never make it back to the application - * since it is bound to a mapped address. - */ - if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) { + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); + + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; + } + + + mp = icmp_prepend_header_template(connp, ixa, mp, + &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); + + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EADDRNOTAVAIL); + freemsg(mp); + return (error); } - ignore = ipp->ipp_sticky_ignored; - if (sin6->sin6_scope_id != 0 && - IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { - /* - * IPPF_SCOPE_ID is special. It's neither a sticky - * option nor ancillary data. 
It needs to be - * explicitly set in options_exists. - */ - option_exists |= IPPF_SCOPE_ID; + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ixa_refrele(ixa); + return (EHOSTUNREACH); /* IPsec policy failure */ + } } /* - * Compute the destination address + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. */ - ip6_dst = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - ip6_dst = ipv6_loopback; + if (ixa->ixa_ire == NULL) { + in6_addr_t lastdst, lastsrc; + in6_addr_t nexthop; + in_port_t lastport; + + lastsrc = connp->conn_v6lastsrc; + lastdst = connp->conn_v6lastdst; + lastport = connp->conn_lastdstport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); + mutex_exit(&connp->conn_lock); - /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, sticky ipp_hopopts, - * and sticky ipp_hopoptslen are consistent for the current - * destination and are updated atomically. - */ - mutex_enter(&connp->conn_lock); - if (is_system_labeled()) { - /* - * Recompute the Trusted Extensions security label if we're - * not going to the same destination as last time or the cred - * attached to the received mblk changed. This is done in a - * separate routine to avoid blowing up our stack here. 
- */ - cr = msg_getcred(mp, &cpid); - if (!IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) || - cr != icmp->icmp_last_cred) { - int error = 0; - error = icmp_update_label_v6(icmp, mp, &ip6_dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); + error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, + &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | + IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); } + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); /* - * Apply credentials with modified security label if they exist. - * icmp_update_label_v6() may have generated these credentials - * for MAC-Exempt connections. 
+ * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } + ixa_refrele(ixa); + return (error); +} + + +/* + * Prepend the header template and then fill in the source and + * flowinfo. The caller needs to handle the destination address since + * it's setting is different if rthdr or source route. + * + * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. + * When it returns NULL it sets errorp. + */ +static mblk_t * +icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, + const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) +{ + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + uint_t pktlen; + uint_t copylen; + uint8_t *iph; + uint_t ip_hdr_length; + uint32_t cksum; + ip_pkt_t *ipp; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * If there's a security label here, then we ignore any options the - * user may try to set. We keep the peer's label as a hidden sticky - * option. + * Copy the header template. */ - if (icmp->icmp_label_len_v6 > 0) { - ignore &= ~IPPF_HOPOPTS; - ipp->ipp_fields &= ~IPPF_HOPOPTS; + copylen = connp->conn_ht_iphc_len; + pktlen = copylen + msgdsize(mp); + if (pktlen > IP_MAXPACKET) { + freemsg(mp); + *errorp = EMSGSIZE; + return (NULL); } + ixa->ixa_pktlen = pktlen; - if ((icmp->icmp_sticky_ipp.ipp_fields == 0) && - (ipp->ipp_fields == 0)) { - /* No sticky options nor ancillary data. 
*/ - mutex_exit(&connp->conn_lock); - goto no_options; + /* check/fix buffer config, setup pointers into it */ + iph = mp->b_rptr - copylen; + if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { + mblk_t *mp1; + + mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + *errorp = ENOMEM; + return (NULL); + } + mp1->b_wptr = DB_LIM(mp1); + mp1->b_cont = mp; + mp = mp1; + iph = (mp->b_wptr - copylen); } + mp->b_rptr = iph; + bcopy(connp->conn_ht_iphc, iph, copylen); + ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); + + ixa->ixa_ip_hdr_length = ip_hdr_length; /* - * Go through the options figuring out where each is going to - * come from and build two masks. The first mask indicates if - * the option exists at all. The second mask indicates if the - * option is sticky or ancillary. + * Prepare for ICMPv6 checksum done in IP. + * + * icmp_build_hdr_template has already massaged any routing header + * and placed the result in conn_sum. + * + * We make it easy for IP to include our pseudo header + * by putting our length (and any routing header adjustment) + * in the ICMPv6 checksum field. 
*/ - if (!(ignore & IPPF_HOPOPTS)) { - if (ipp->ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - ip_hdr_len += ipp->ipp_hopoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - is_sticky |= IPPF_HOPOPTS; - ASSERT(icmp->icmp_sticky_ipp.ipp_hopoptslen != 0); - hopoptsptr = kmem_alloc( - icmp->icmp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP); - if (hopoptsptr == NULL) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - hopoptslen = icmp->icmp_sticky_ipp.ipp_hopoptslen; - bcopy(icmp->icmp_sticky_ipp.ipp_hopopts, hopoptsptr, - hopoptslen); - ip_hdr_len += hopoptslen; - } - } - mutex_exit(&connp->conn_lock); + cksum = pktlen - ip_hdr_length; - if (!(ignore & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - ip_hdr_len += ipp->ipp_rthdrlen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - is_sticky |= IPPF_RTHDR; - ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen; - } - } + cksum += connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); - if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) { - /* - * Need to have a router header to use these. 
- */ - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - ip_hdr_len += ipp->ipp_rtdstoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - is_sticky |= IPPF_RTDSTOPTS; - ip_hdr_len += - icmp->icmp_sticky_ipp.ipp_rtdstoptslen; - } - } + ipp = &connp->conn_xmit_ipp; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)iph; - if (!(ignore & IPPF_DSTOPTS)) { - if (ipp->ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - ip_hdr_len += ipp->ipp_dstoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - is_sticky |= IPPF_DSTOPTS; - ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen; - } - } + ipha->ipha_length = htons((uint16_t)pktlen); - if (!(ignore & IPPF_IFINDEX)) { - if (ipp->ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - is_sticky |= IPPF_IFINDEX; + /* if IP_PKTINFO specified an addres it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); + ipha->ipha_src = ipp->ipp_addr_v4; + } else { + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); } - } + } else { + ip6_t *ip6h = (ip6_t *)iph; + uint_t cksum_offset = 0; - if (!(ignore & IPPF_ADDR)) { - if (ipp->ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - is_sticky |= IPPF_ADDR; - } - } + ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); - if (!(ignore & IPPF_DONTFRAG)) { - if (ipp->ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - is_sticky |= IPPF_DONTFRAG; + /* if IP_PKTINFO specified an addres it wins over bind() */ + if ((ipp->ipp_fields & 
IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); + ip6h->ip6_src = ipp->ipp_addr; + } else { + ip6h->ip6_src = *v6src; } - } - - if (!(ignore & IPPF_USE_MIN_MTU)) { - if (ipp->ipp_fields & IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - } else if (icmp->icmp_sticky_ipp.ipp_fields & - IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - is_sticky |= IPPF_USE_MIN_MTU; + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); + } + + if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { + if (connp->conn_proto == IPPROTO_ICMPV6) { + cksum_offset = ixa->ixa_ip_hdr_length + + offsetof(icmp6_t, icmp6_cksum); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + cksum_offset = ixa->ixa_ip_hdr_length + + ixa->ixa_raw_cksum_offset; + } } - } + if (cksum_offset != 0) { + uint16_t *ptr; + + /* Make sure the checksum fits in the first mblk */ + if (cksum_offset + sizeof (short) > MBLKL(mp)) { + mblk_t *mp1; - if (!(ignore & IPPF_NEXTHOP)) { - if (ipp->ipp_fields & IPPF_NEXTHOP) { - option_exists |= IPPF_NEXTHOP; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) { - option_exists |= IPPF_NEXTHOP; - is_sticky |= IPPF_NEXTHOP; + mp1 = msgpullup(mp, + cksum_offset + sizeof (short)); + freemsg(mp); + if (mp1 == NULL) { + *errorp = ENOMEM; + return (NULL); + } + mp = mp1; + iph = mp->b_rptr; + ip6h = (ip6_t *)iph; + } + ptr = (uint16_t *)(mp->b_rptr + cksum_offset); + *ptr = htons(cksum); } } - if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT)) - option_exists |= IPPF_HOPLIMIT; - /* IPV6_HOPLIMIT can never be sticky */ - ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT)); + return (mp); +} - if (!(ignore & IPPF_UNICAST_HOPS) && - (icmp->icmp_sticky_ipp.ipp_fields & 
IPPF_UNICAST_HOPS)) { - option_exists |= IPPF_UNICAST_HOPS; - is_sticky |= IPPF_UNICAST_HOPS; - } +/* + * This routine handles all messages passed downstream. It either + * consumes the message or passes it downstream; it never queues a + * a message. + */ +void +icmp_wput(queue_t *q, mblk_t *mp) +{ + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; + conn_t *connp = Q_TO_CONN(q); + icmp_t *icmp = connp->conn_icmp; + int error = 0; + struct sockaddr *addr = NULL; + socklen_t addrlen; + icmp_stack_t *is = icmp->icmp_is; + struct T_unitdata_req *tudr; + mblk_t *data_mp; + cred_t *cr; + pid_t pid; - if (!(ignore & IPPF_MULTICAST_HOPS) && - (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) { - option_exists |= IPPF_MULTICAST_HOPS; - is_sticky |= IPPF_MULTICAST_HOPS; - } + /* + * We directly handle several cases here: T_UNITDATA_REQ message + * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected + * socket. + */ + switch (DB_TYPE(mp)) { + case M_DATA: + /* sockfs never sends down M_DATA */ + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return; - if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) { - /* This is a sticky socket option only */ - option_exists |= IPPF_NO_CKSUM; - is_sticky |= IPPF_NO_CKSUM; - } + case M_PROTO: + case M_PCPROTO: + tudr = (struct T_unitdata_req *)mp->b_rptr; + if (MBLKL(mp) < sizeof (*tudr) || + ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { + icmp_wput_other(q, mp); + return; + } + break; - if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) { - /* This is a sticky socket option only */ - option_exists |= IPPF_RAW_CKSUM; - is_sticky |= IPPF_RAW_CKSUM; + default: + icmp_wput_other(q, mp); + return; } - if (!(ignore & IPPF_TCLASS)) { - if (ipp->ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - is_sticky |= IPPF_TCLASS; - } + /* Handle valid T_UNITDATA_REQ here */ + data_mp = mp->b_cont; 
+ if (data_mp == NULL) { + error = EPROTO; + goto ud_error2; } + mp->b_cont = NULL; -no_options: + if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { + error = EADDRNOTAVAIL; + goto ud_error2; + } /* - * If any options carried in the ip6i_t were specified, we - * need to account for the ip6i_t in the data we'll be sending - * down. + * All Solaris components should pass a db_credp + * for this message, hence we ASSERT. + * On production kernels we return an error to be robust against + * random streams modules sitting on top of us. */ - if (option_exists & IPPF_HAS_IP6I) - ip_hdr_len += sizeof (ip6i_t); + cr = msg_getcred(mp, &pid); + ASSERT(cr != NULL); + if (cr == NULL) { + error = EINVAL; + goto ud_error2; + } - /* check/fix buffer config, setup pointers into it */ - ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len]; - if ((mp->b_datap->db_ref != 1) || - ((unsigned char *)ip6h < mp->b_datap->db_base) || - !OK_32PTR(ip6h)) { - mblk_t *mp1; - - /* Try to get everything in a single mblk next time */ - if (ip_hdr_len > icmp->icmp_max_hdr_len) { - icmp->icmp_max_hdr_len = ip_hdr_len; - - (void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - } - mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO); - if (!mp1) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - kmem_free(hopoptsptr, hopoptslen); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - mp1->b_cont = mp; - mp1->b_wptr = mp1->b_datap->db_lim; - ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len); - mp = mp1; + /* + * If a port has not been bound to the stream, fail. + * This is not a problem when sockfs is directly + * above us, because it will ensure that the socket + * is first bound before allowing data to be sent. + */ + if (icmp->icmp_state == TS_UNBND) { + error = EPROTO; + goto ud_error2; } - mp->b_rptr = (unsigned char *)ip6h; - ip6i = (ip6i_t *)ip6h; - -#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? 
&icmp->icmp_sticky_ipp : ipp) - if (option_exists & IPPF_HAS_IP6I) { - ip6h = (ip6_t *)&ip6i[1]; - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - - /* sin6_scope_id takes precendence over IPPF_IFINDEX */ - if (option_exists & IPPF_SCOPE_ID) { - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = sin6->sin6_scope_id; - } else if (option_exists & IPPF_IFINDEX) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX); - ASSERT(tipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = tipp->ipp_ifindex; + addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; + addrlen = tudr->DEST_length; + + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || + (sin6->sin6_family != AF_INET6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - if (option_exists & IPPF_RAW_CKSUM) { - ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM; - ip6i->ip6i_checksum_off = icmp->icmp_checksum_off; + /* No support for mapped addresses on raw sockets */ + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + error = EADDRNOTAVAIL; + goto ud_error2; } + srcid = sin6->__sin6_src_id; - if (option_exists & IPPF_NO_CKSUM) { - ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. + */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - if (option_exists & IPPF_ADDR) { + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + + if (tudr->OPT_length != 0) { /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. + * If we are connected then the destination needs to be + * the same as the connected one. 
*/ - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } + if (icmp->icmp_state == TS_DATA_XFER && + !conn_same_as_last_v6(connp, sin6)) { + error = EISCONN; + goto ud_error2; + } + error = icmp_output_ancillary(connp, NULL, sin6, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (option_exists & IPPF_DONTFRAG) { - ip6i->ip6i_flags |= IP6I_DONTFRAG; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v6 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, data_mp, NULL, + sin6, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } + if (error == 0) { + freeb(mp); + return; + } + break; - if (option_exists & IPPF_USE_MIN_MTU) { - ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU( - ip6i->ip6i_flags, ipp->ipp_use_min_mtu); + case AF_INET: + sin = (sin_t *)addr; + if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || + (sin->sin_family != AF_INET)) { + error = EADDRNOTAVAIL; + goto ud_error2; } + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (option_exists & IPPF_NEXTHOP) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = tipp->ipp_nexthop; + /* Protocol 255 contains full IP headers */ + /* Read without holding lock */ + if (icmp->icmp_hdrincl) { + if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { + if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { + error = 
EINVAL; + goto ud_error2; + } + } + error = icmp_output_hdrincl(connp, data_mp, cr, pid); + if (error == 0) { + freeb(mp); + return; + } + /* data_mp consumed above */ + data_mp = NULL; + goto ud_error2; } - /* - * tell IP this is an ip6i_t private header - */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } - - /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src)); - - /* Set the hoplimit of the outgoing packet. */ - if (option_exists & IPPF_HOPLIMIT) { - /* IPV6_HOPLIMIT ancillary data overrides all other settings. */ - ip6h->ip6_hops = ipp->ipp_hoplimit; - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { - ip6h->ip6_hops = icmp->icmp_multicast_ttl; - if (option_exists & IPPF_MULTICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else { - ip6h->ip6_hops = icmp->icmp_ttl; - if (option_exists & IPPF_UNICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. + */ + if (icmp->icmp_state == TS_DATA_XFER && + !conn_same_as_last_v4(connp, sin)) { + error = EISCONN; + goto ud_error2; + } + error = icmp_output_ancillary(connp, sin, NULL, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (option_exists & IPPF_ADDR) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr)); - ip6h->ip6_src = tipp->ipp_addr; - } else { - /* - * The source address was not set using IPV6_PKTINFO. - * First look at the bound source. - * If unspecified fallback to __sin6_src_id. 
- */ - ip6h->ip6_src = icmp->icmp_v6src; - if (sin6->__sin6_src_id != 0 && - IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ip_srcid_find_id(sin6->__sin6_src_id, - &ip6h->ip6_src, icmp->icmp_zoneid, - is->is_netstack); + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v4 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, data_mp, sin, + NULL, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + } + if (error == 0) { + freeb(mp); + return; } + break; } + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + icmp_ud_err(q, mp, (t_scalar_t)error); + return; - nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; - cp = (uint8_t *)&ip6h[1]; +ud_error2: + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(data_mp); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + icmp_ud_err(q, mp, (t_scalar_t)error); +} + +/* + * Handle the case of the IP address or flow label being different + * for both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, + cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) +{ + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *oldixa; + boolean_t do_ipsec; + uint_t srcid; + uint32_t flowinfo; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); /* - * Here's where we have to start stringing together - * any extension headers in the right order: - * Hop-by-hop, destination, routing, and final destination opts. + * We hold conn_lock across all the use and modifications of + * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they + * stay consistent. */ - if (option_exists & IPPF_HOPOPTS) { - /* Hop-by-hop options */ - ip6_hbh_t *hbh = (ip6_hbh_t *)cp; - - *nxthdr_ptr = IPPROTO_HOPOPTS; - nxthdr_ptr = &hbh->ip6h_nxt; - if (hopoptslen == 0) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS); - bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen); - cp += tipp->ipp_hopoptslen; - } else { - bcopy(hopoptsptr, cp, hopoptslen); - cp += hopoptslen; - kmem_free(hopoptsptr, hopoptslen); - } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } /* - * En-route destination options - * Only do them if there's a routing header as well + * If we are connected then the destination needs to be the + * same as the connected one, which is not the case here since we + * checked for that above. 
*/ - if (option_exists & IPPF_RTDSTOPTS) { - ip6_dest_t *dst = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS); + if (icmp->icmp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; + } - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dst->ip6d_nxt; + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen); - cp += tipp->ipp_rtdstoptslen; - } /* - * Routing header next + * If laddr is unspecified then we look at sin6_src_id. + * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rt = (ip6_rthdr_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR); + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + srcid = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + ixa->ixa_flags |= IXAF_IS_IPV4; + } else { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + } + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + 
} + /* Handle IPV6_PKTINFO setting source address. */ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { + ip_pkt_t *ipp = &connp->conn_xmit_ipp; - *nxthdr_ptr = IPPROTO_ROUTING; - nxthdr_ptr = &rt->ip6r_nxt; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } + } - bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen); - cp += tipp->ipp_rthdrlen; + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; } - /* - * Do ultimate destination options - */ - if (option_exists & IPPF_DSTOPTS) { - ip6_dest_t *dest = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS); - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dest->ip6d_nxt; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); + mutex_exit(&connp->conn_lock); - bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen); - cp += tipp->ipp_dstoptslen; + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + goto ud_error; } + mutex_enter(&connp->conn_lock); /* - * Now set the last header pointer to the proto passed in + * While we dropped the lock some other thread might have connected + * this socket. If so we bail out with EISCONN to ensure that the + * connecting thread is the one that updates conn_ixa, conn_ht_* + * and conn_*last*. */ - ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len); - *nxthdr_ptr = icmp->icmp_proto; + if (icmp->icmp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; + } /* - * Copy in the destination address + * We need to rebuild the headers if + * - we are labeling packets (could be different for different + * destinations) + * - we have a source route (or routing header) since we need to + * massage that to get the pseudo-header checksum + * - a socket option with COA_HEADER_CHANGED has been set which + * set conn_v6lastdst to zero. + * + * Otherwise the prepend function will just update the src, dst, + * and flow label. */ - ip6h->ip6_dst = ip6_dst; - - ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - if (option_exists & IPPF_TCLASS) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS); - ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, - tipp->ipp_tclass); - } - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rth; - + if (is_system_labeled()) { + /* TX MLP requires SCM_UCRED and don't have that here */ + if (connp->conn_mlp_type != mlptSingle) { + mutex_exit(&connp->conn_lock); + error = ECONNREFUSED; + goto ud_error; + } /* - * Perform any processing needed for source routing. - * We know that all extension headers will be in the same mblk - * as the IPv6 header. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. 
+ * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: Since we hold conn_lock we know no other + * thread manipulates conn_xmit_ipp. */ - rth = ip_find_rthdr_v6(ip6h, mp->b_wptr); - if (rth != NULL && rth->ip6r_segleft != 0) { - if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) { - /* - * Drop packet - only support Type 0 routing. - * Notify the application as well. - */ - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EPROTO); - } - /* - * rth->ip6r_len is twice the number of - * addresses in the header - */ - if (rth->ip6r_len & 0x1) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EPROTO); - } - /* - * Shuffle the routing header and ip6_dst - * addresses, and get the checksum difference - * between the first hop (in ip6_dst) and - * the destination (in the last routing hdr entry). - */ - csum = ip_massage_options_v6(ip6h, rth, - is->is_netstack); - /* - * Verify that the first hop isn't a mapped address. - * Routers along the path need to do this verification - * for subsequent hops. 
- */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EADDRNOTAVAIL); + error = conn_update_label(connp, ixa, &v6dst, + &connp->conn_xmit_ipp); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + /* Rebuild the header template */ + error = icmp_build_hdr_template(connp, &v6src, &v6dst, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else if (connp->conn_xmit_ipp.ipp_fields & + (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { + /* Rebuild the header template */ + error = icmp_build_hdr_template(connp, &v6src, &v6dst, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else { + /* Simply update the destination address if no source route */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; + + IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; } + } else { + ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; + ip6h->ip6_dst = v6dst; } } - ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN; - if (mp->b_cont != NULL) - ip_len += msgdsize(mp->b_cont); - /* - * Set the length into the IP header. - * If the length is greater than the maximum allowed by IP, - * then free the message and return. Do not try and send it - * as this can cause problems in layers below. + * Remember the dst etc which corresponds to the built header + * template and conn_ixa. 
*/ - if (ip_len > IP_MAXPACKET) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); + oldixa = conn_replace_ixa(connp, ixa); + connp->conn_v6lastdst = v6dst; + connp->conn_lastflowinfo = flowinfo; + connp->conn_lastscopeid = ixa->ixa_scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + + data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, + flowinfo, &error); + + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + ixa_refrele(oldixa); + + if (data_mp == NULL) { + ASSERT(error != 0); + goto ud_error; } - if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) { - uint_t cksum_off; /* From ip6i == mp->b_rptr */ - uint16_t *cksum_ptr; - uint_t ext_hdrs_len; - /* ICMPv6 must have an offset matching icmp6_cksum offset */ - ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 || - icmp->icmp_checksum_off == 2); + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + data_mp = icmp_output_attach_policy(data_mp, connp, ixa); + if (data_mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; + } + } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + + error = conn_ip_output(data_mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * We make it easy for IP to include our pseudo header - * by putting our length in uh_checksum, modified (if - * we have a routing header) by the checksum difference - * between the ultimate destination and first hop addresses. - * Note: ICMPv6 must always checksum the packet. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - cksum_off = ip_hdr_len + icmp->icmp_checksum_off; - if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) { - if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); - } - ip6i = (ip6i_t *)mp->b_rptr; - if (ip6i->ip6i_nxt == IPPROTO_RAW) - ip6h = (ip6_t *)&ip6i[1]; - else - ip6h = (ip6_t *)ip6i; - } - /* Add payload length to checksum */ - ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN - - (int)((uchar_t *)ip6h - (uchar_t *)ip6i); - csum += htons(ip_len - ext_hdrs_len); - - cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off); - csum = (csum & 0xFFFF) + (csum >> 16); - *cksum_ptr = (uint16_t)csum; + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } +done: + ixa_refrele(ixa); + return (error); -#ifdef _LITTLE_ENDIAN - ip_len = htons(ip_len); -#endif - ip6h->ip6_plen = (uint16_t)ip_len; +ud_error: + if (ixa != NULL) + ixa_refrele(ixa); - /* We're done. 
Pass the packet to IP */ - rw_exit(&icmp->icmp_rwlock); - BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT); - return (0); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(data_mp); + return (error); +} + +/* ARGSUSED */ +static void +icmp_wput_fallback(queue_t *q, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); +#endif + freemsg(mp); } static void @@ -5559,7 +4622,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp) { uchar_t *rptr = mp->b_rptr; struct iocblk *iocp; -#define tudr ((struct T_unitdata_req *)rptr) conn_t *connp = Q_TO_CONN(q); icmp_t *icmp = connp->conn_icmp; icmp_stack_t *is = icmp->icmp_is; @@ -5576,7 +4638,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) freemsg(mp); return; } - switch (((union T_primitives *)rptr)->type) { + switch (((t_primp_t)rptr)->type) { case T_ADDR_REQ: icmp_addr_req(q, mp); return; @@ -5596,15 +4658,14 @@ icmp_wput_other(queue_t *q, mblk_t *mp) case T_UNITDATA_REQ: /* * If a T_UNITDATA_REQ gets here, the address must - * be bad. Valid T_UNITDATA_REQs are found above - * and break to below this switch. + * be bad. Valid T_UNITDATA_REQs are handled + * in icmp_wput. 
*/ icmp_ud_err(q, mp, EADDRNOTAVAIL); return; case T_UNBIND_REQ: icmp_tpi_unbind(q, mp); return; - case T_SVR4_OPTMGMT_REQ: /* * All Solaris components should pass a db_credp @@ -5622,9 +4683,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, cr)) { - /* Only IP can return anything meaningful */ - (void) svr4_optcom_req(q, mp, cr, - &icmp_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &icmp_opt_obj); } return; @@ -5642,8 +4701,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) icmp_err_ack(q, mp, TSYSERR, EINVAL); return; } - /* Only IP can return anything meaningful */ - (void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE); + tpi_optcom_req(q, mp, cr, &icmp_opt_obj); return; case T_DISCON_REQ: @@ -5660,13 +4718,16 @@ icmp_wput_other(queue_t *q, mblk_t *mp) case T_DATA_REQ: case T_EXDATA_REQ: case T_ORDREL_REQ: - freemsg(mp); - (void) putctl1(RD(q), M_ERROR, EPROTO); + icmp_err_ack(q, mp, TNOTSUPPORT, 0); return; default: break; } break; + case M_FLUSH: + if (*rptr & FLUSHW) + flushq(q, FLUSHDATA); + break; case M_IOCTL: iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { @@ -5678,7 +4739,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp) * don't know the peer's name. */ iocp->ioc_error = ENOTCONN; - err_ret:; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; qreply(q, mp); @@ -5696,22 +4756,13 @@ icmp_wput_other(queue_t *q, mblk_t *mp) SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); return; case ND_SET: - /* nd_getset performs the necessary error checking */ + /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, is->is_nd, mp)) { qreply(q, mp); return; } break; - case _SIOCSOCKFALLBACK: - /* - * socket is falling back to be a - * streams socket. 
Nothing to do - */ - iocp->ioc_count = 0; - iocp->ioc_rval = 0; - qreply(q, mp); - return; default: break; } @@ -5720,23 +4771,24 @@ icmp_wput_other(queue_t *q, mblk_t *mp) icmp_wput_iocdata(q, mp); return; default: + /* Unrecognized messages are passed through without change. */ break; } - ip_wput(q, mp); + ip_wput_nondata(q, mp); } /* - * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA + * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA * messages. */ static void icmp_wput_iocdata(queue_t *q, mblk_t *mp) { - mblk_t *mp1; + mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); - icmp_t *icmp; - uint_t addrlen; - uint_t error; + uint_t addrlen; + conn_t *connp = Q_TO_CONN(q); + icmp_t *icmp = connp->conn_icmp; /* Make sure it is one of ours. */ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { @@ -5744,10 +4796,10 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - icmp = Q_TO_ICMP(q); - ip_output(icmp->icmp_connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } + switch (mi_copy_state(q, mp, &mp1)) { case -1: return; @@ -5776,6 +4828,7 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mi_copy_done(q, mp, EPROTO); return; } + /* * Now we have the strbuf structure for TI_GETMYNAME * and TI_GETPEERNAME. 
Next we copyout the requested @@ -5783,8 +4836,8 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) */ STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, (void *)mp1->b_rptr); - icmp = Q_TO_ICMP(q); - if (icmp->icmp_family == AF_INET) + + if (connp->conn_family == AF_INET) addrlen = sizeof (sin_t); else addrlen = sizeof (sin6_t); @@ -5793,72 +4846,37 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mi_copy_done(q, mp, EINVAL); return; } - + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + break; + case TI_GETPEERNAME: + if (icmp->icmp_state != TS_DATA_XFER) { + mi_copy_done(q, mp, ENOTCONN); + return; + } + break; + default: + mi_copy_done(q, mp, EPROTO); + return; + } mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - - if (mp1 == NULL) + if (!mp1) return; - rw_enter(&icmp->icmp_rwlock, RW_READER); + STRUCT_FSET(sb, len, addrlen); switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { case TI_GETMYNAME: - error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr, + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, &addrlen); break; case TI_GETPEERNAME: - error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr, + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, &addrlen); break; } - rw_exit(&icmp->icmp_rwlock); - - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); - } -} - -static int -icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, - void *thisdg_attrs) -{ - struct T_unitdata_req *udreqp; - int is_absreq_failure; - cred_t *cr; - - udreqp = (struct T_unitdata_req *)mp->b_rptr; - *errorp = 0; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
- */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) - return (-1); - - *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, - udreqp->OPT_offset, cr, &icmp_opt_obj, - thisdg_attrs, &is_absreq_failure); - - if (*errorp != 0) { - /* - * Note: No special action needed in this - * module for "is_absreq_failure" - */ - return (-1); /* failure */ - } - ASSERT(is_absreq_failure == 0); - return (0); /* success */ + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } void @@ -6013,7 +5031,7 @@ rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t len, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - int error; + int error; /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); @@ -6042,14 +5060,14 @@ rawip_implicit_bind(conn_t *connp) socklen_t len; int error; - if (connp->conn_icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { len = sizeof (struct sockaddr_in); sin = (sin_t *)&sin6addr; *sin = sin_null; sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; } else { - ASSERT(connp->conn_icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); len = sizeof (sin6_t); sin6 = (sin6_t *)&sin6addr; *sin6 = sin6_null; @@ -6081,7 +5099,6 @@ rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) return (EOPNOTSUPP); } -/* ARGSUSED */ int rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) @@ -6090,6 +5107,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, icmp_t *icmp = connp->conn_icmp; int error; boolean_t did_bind = B_FALSE; + pid_t pid = curproc->p_pid; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); @@ -6106,7 +5124,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, return (error); } - error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) return (error); @@ -6126,10 +5144,9 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* * set SO_DGRAM_ERRIND */ - icmp->icmp_dgram_errind = B_TRUE; - - error = rawip_do_connect(connp, sa, len, cr); + connp->conn_dgram_errind = B_TRUE; + error = rawip_do_connect(connp, sa, len, cr, pid); if (error != 0 && did_bind) { int unbind_err; @@ -6139,15 +5156,15 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, if (error == 0) { *id = 0; - (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, 0, NULL, -1); + (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, + 0, NULL, -1); } else if (error < 0) { error = proto_tlitosyserr(-error); } return (error); } -/* ARGSUSED */ +/* ARGSUSED2 */ int rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) @@ -6184,9 +5201,8 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, stropt_mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)stropt_mp->b_rptr; stropt->so_flags = SO_WROFF | SO_HIWAT; - stropt->so_wroff = - (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra); - stropt->so_hiwat = icmp->icmp_recv_hiwat; + stropt->so_wroff = connp->conn_wroff; + stropt->so_hiwat = connp->conn_rcvbuf; putnext(RD(q), stropt_mp); /* @@ -6207,9 +5223,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (error != 0) faddrlen = 0; opts = 0; - if (icmp->icmp_dgram_errind) + if (connp->conn_dgram_errind) opts |= SO_DGRAM_ERRIND; - if (icmp->icmp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; (*quiesced_cb)(connp->conn_upper_handle, q, &tca, @@ -6218,7 
+5234,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * Attempts to send data up during fallback will result in it being - * queued in udp_t. Now we push up any queued packets. + * queued in icmp_t. Now we push up any queued packets. */ mutex_enter(&icmp->icmp_recv_lock); while (icmp->icmp_fallback_queue_head != NULL) { @@ -6236,9 +5252,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * No longer a streams less socket */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); connp->conn_flags &= ~IPCL_NONSTR; - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); mutex_exit(&icmp->icmp_recv_lock); @@ -6250,7 +5266,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ sock_lower_handle_t rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) @@ -6262,35 +5278,10 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return (NULL); } - connp = icmp_open(family, credp, errorp, flags); + connp = rawip_do_open(family, credp, errorp, flags); if (connp != NULL) { - icmp_stack_t *is; - - is = connp->conn_icmp->icmp_is; connp->conn_flags |= IPCL_NONSTR; - if (connp->conn_icmp->icmp_family == AF_INET6) { - /* Build initial header template for transmit */ - rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); - if ((*errorp = - icmp_build_hdrs(connp->conn_icmp)) != 0) { - rw_exit(&connp->conn_icmp->icmp_rwlock); - ipcl_conn_destroy(connp); - return (NULL); - } - rw_exit(&connp->conn_icmp->icmp_rwlock); - } - - connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat; - connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat; - - if ((*errorp = ip_create_helper_stream(connp, - is->is_ldi_ident)) != 0) { - cmn_err(CE_CONT, "create of IP helper stream failed\n"); - (void) rawip_do_close(connp); - return (NULL); - } - 
mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); @@ -6303,14 +5294,13 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return ((sock_lower_handle_t)connp); } -/* ARGSUSED */ +/* ARGSUSED3 */ void rawip_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_stack_t *is = connp->conn_icmp->icmp_is; struct sock_proto_props sopp; /* All Solaris components should pass a cred for this operation. */ @@ -6321,10 +5311,9 @@ rawip_activate(sock_lower_handle_t proto_handle, sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; - sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len + - is->is_wroff_extra; - sopp.sopp_rxhiwat = is->is_recv_hiwat; - sopp.sopp_rxlowat = icmp_mod_info.mi_lowat; + sopp.sopp_wroff = connp->conn_wroff; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxblk = INFPSZ; sopp.sopp_maxpsz = IP_MAXPACKET; sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 
0 : @@ -6332,113 +5321,11 @@ rawip_activate(sock_lower_handle_t proto_handle, (*connp->conn_upcalls->su_set_proto_props) (connp->conn_upper_handle, &sopp); -} - -static int -rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(icmp != NULL); - ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); - switch (icmp->icmp_family) { - case AF_INET: - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - if (icmp->icmp_state == TS_UNBND) { - break; - } - - if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src); - } else { - /* - * INADDR_ANY - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. Use icmp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - sin->sin_addr.s_addr = - V4_PART_OF_V6(icmp->icmp_bound_v6src); - } - break; - case AF_INET6: - - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (icmp->icmp_state == TS_UNBND) { - break; - } - if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin6->sin6_addr = icmp->icmp_v6src; - } else { - /* - * UNSPECIFIED - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use icmp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - - sin6->sin6_addr = icmp->icmp_bound_v6src; - } - break; - } - return (0); -} - -static int -rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(icmp != NULL); - ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); - - if (icmp->icmp_state != TS_DATA_XFER) - return (ENOTCONN); - - sa->sa_family = icmp->icmp_family; - switch (icmp->icmp_family) { - case AF_INET: - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); - break; - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - *sin6 = icmp->icmp_v6dst; - break; - } - return (0); + icmp_bind_proto(connp->conn_icmp); } -/* ARGSUSED */ +/* ARGSUSED3 */ int rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) @@ -6450,36 +5337,29 @@ rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - ASSERT(icmp != NULL); - - rw_enter(&icmp->icmp_rwlock, RW_READER); - - error = rawip_do_getpeername(icmp, sa, salenp); - - rw_exit(&icmp->icmp_rwlock); - + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_DATA_XFER) + error = ENOTCONN; + else + error = conn_getpeername(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } -/* ARGSUSED */ +/* ARGSUSED3 */ int rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; int error; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); - ASSERT(icmp != NULL); - rw_enter(&icmp->icmp_rwlock, RW_READER); - - error = rawip_do_getsockname(icmp, sa, salenp); - - rw_exit(&icmp->icmp_rwlock); - + mutex_enter(&connp->conn_lock); + error = conn_getsockname(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } @@ -6488,7 +5368,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, const void *optvalp, socklen_t optlen, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; int error; /* All Solaris components should pass a cred for this operation. */ @@ -6497,7 +5376,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, icmp_opt_obj.odb_opt_des_arr, icmp_opt_obj.odb_opt_arr_cnt, - icmp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -6510,19 +5388,9 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, return (error); } - rw_enter(&icmp->icmp_rwlock, RW_WRITER); error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, NULL, cr); - rw_exit(&icmp->icmp_rwlock); - - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } ASSERT(error >= 0); @@ -6535,7 +5403,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, { int error; conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; t_uscalar_t max_optbuf_len; void *optvalp_buf; int len; @@ -6546,7 +5413,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, icmp_opt_obj.odb_opt_des_arr, icmp_opt_obj.odb_opt_arr_cnt, - icmp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { @@ -6557,31 +5423,25 @@ 
rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, } optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); - rw_enter(&icmp->icmp_rwlock, RW_READER); len = icmp_opt_get(connp, level, option_name, optvalp_buf); - rw_exit(&icmp->icmp_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ - kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, optvalp, - optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ int rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) { @@ -6594,7 +5454,7 @@ rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ int rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) { @@ -6635,6 +5495,27 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + icmp_stack_t *is = connp->conn_icmp->icmp_is; + + ASSERT(is->is_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. 
+ */ + error = ip_create_helper_stream(connp, is->is_ldi_ident); + if (error != 0) { + ip0dbg(("rawip_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: @@ -6658,25 +5539,25 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, return (error); } -/* ARGSUSED */ int rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { - conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int error = 0; - boolean_t bypass_dgram_errind = B_FALSE; + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + int error = 0; + icmp_stack_t *is = icmp->icmp_is; + pid_t pid = curproc->p_pid; + ip_xmit_attr_t *ixa; ASSERT(DB_TYPE(mp) == M_DATA); /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - /* If labeled then sockfs should have already set db_credp */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - /* do an implicit bind if necessary */ if (icmp->icmp_state == TS_UNBND) { error = rawip_implicit_bind(connp); @@ -6691,170 +5572,191 @@ rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, } } - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - - if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) { - error = EISCONN; - goto done_lock; - } - - switch (icmp->icmp_family) { - case AF_INET6: { - sin6_t *sin6; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - - sin6 = (sin6_t *)msg->msg_name; - if (sin6 != NULL) { - error = proto_verify_ip_addr(icmp->icmp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - if (error != 0) { - bypass_dgram_errind = B_TRUE; - goto done_lock; + /* Protocol 255 contains full IP headers */ + /* Read without holding lock */ + if (icmp->icmp_hdrincl) { + 
ASSERT(connp->conn_ipversion == IPV4_VERSION); + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { + if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (EINVAL); } - if (icmp->icmp_delayed_error != 0) { - sin6_t *sin1 = (sin6_t *)msg->msg_name; - sin6_t *sin2 = (sin6_t *) - &icmp->icmp_delayed_addr; - - error = icmp->icmp_delayed_error; - icmp->icmp_delayed_error = 0; - - /* Compare IP address and port */ + } + error = icmp_output_hdrincl(connp, mp, cr, pid); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + } - if (sin1->sin6_port == sin2->sin6_port && - IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, - &sin2->sin6_addr)) { - goto done_lock; - } - } + /* Connected? */ + if (msg->msg_name == NULL) { + if (icmp->icmp_state != TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EDESTADDRREQ); + } + if (msg->msg_controllen != 0) { + error = icmp_output_ancillary(connp, NULL, NULL, mp, + NULL, msg, cr, pid); } else { - /* - * Use connected address - */ - if (icmp->icmp_state != TS_DATA_XFER) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EDESTADDRREQ; - bypass_dgram_errind = B_TRUE; - goto done_lock; - } - sin6 = &icmp->icmp_v6dst; + error = icmp_output_connected(connp, mp, cr, pid); } + if (is->is_sendto_ignerr) + return (0); + else + return (error); + } + if (icmp->icmp_state == TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EISCONN); + } + error = proto_verify_ip_addr(connp->conn_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (error); + } + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)msg->msg_name; /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EADDRNOTAVAIL; - goto done_lock; + return (EADDRNOTAVAIL); } - - 
ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; + srcid = sin6->__sin6_src_id; /* - * If options passed in, feed it for verification and handling + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. */ - if (msg->msg_controllen != 0) { - error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - ipp, &icmp_opt_obj, icmp_opt_set, cr); - if (error != 0) { - goto done_lock; - } + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EADDRNOTAVAIL); } - rw_exit(&icmp->icmp_rwlock); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; /* - * Destination is a native IPv6 address. - * Send out an IPv6 format packet. + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* calls on a + * socket. 
*/ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; + } + mutex_enter(&connp->conn_lock); + if (icmp->icmp_delayed_error != 0) { + sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; - error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6, - ipp); - } - break; - case AF_INET: { - sin_t *sin; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ipaddr_t v4dst; + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; - sin = (sin_t *)msg->msg_name; - if (sin != NULL) { - error = proto_verify_ip_addr(icmp->icmp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - if (error != 0) { - bypass_dgram_errind = B_TRUE; - goto done_lock; - } - v4dst = sin->sin_addr.s_addr; - if (icmp->icmp_delayed_error != 0) { - sin_t *sin1 = (sin_t *)msg->msg_name; - sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; - - error = icmp->icmp_delayed_error; - icmp->icmp_delayed_error = 0; - - /* Compare IP address and port */ - if (sin1->sin_port == sin2->sin_port && - sin1->sin_addr.s_addr == - sin2->sin_addr.s_addr) { - goto done_lock; - } + /* Compare IP address and family */ - } - } else { - /* - * Use connected address - */ - if (icmp->icmp_state != TS_DATA_XFER) { + if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &sin2->sin6_addr) && + sin6->sin6_family == sin2->sin6_family) { + mutex_exit(&connp->conn_lock); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EDESTADDRREQ; - bypass_dgram_errind = B_TRUE; - goto done_lock; + if (ixa != NULL) + ixa_refrele(ixa); + return (error); } - v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); } + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = icmp_output_ancillary(connp, NULL, sin6, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { 
+ /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, mp, NULL, sin6, cr, + pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + case AF_INET: + sin = (sin_t *)msg->msg_name; - - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); /* - * If options passed in, feed it for verification and handling + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* on a socket. */ - if (msg->msg_controllen != 0) { - error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - pktinfop, &icmp_opt_obj, icmp_opt_set, cr); - if (error != 0) { - goto done_lock; + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (ENOMEM); } + } else { + ixa = NULL; } - rw_exit(&icmp->icmp_rwlock); + mutex_enter(&connp->conn_lock); + if (icmp->icmp_delayed_error != 0) { + sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; - error = raw_ip_send_data_v4(connp->conn_wq, connp, mp, - v4dst, pktinfop); - break; - } + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; - default: - ASSERT(0); - } + /* Compare IP address */ - goto done; + if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } -done_lock: - rw_exit(&icmp->icmp_rwlock); - if (error != 0) { - ASSERT(mp != NULL); - freemsg(mp); + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = icmp_output_ancillary(connp, 
sin, NULL, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, mp, sin, NULL, cr, + pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + default: + return (EINVAL); } -done: - if (bypass_dgram_errind) - return (error); - return (icmp->icmp_dgram_errind ? error : 0); } sock_downcalls_t sock_rawip_downcalls = { diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 8bee9827db..ff0310de0c 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -36,23 +36,11 @@ #include <inet/common.h> #include <netinet/ip6.h> #include <inet/ip.h> -/* - * MK_XXX Following 2 includes temporary to import ip6_rthdr_t - * definition. May not be needed if we fix ip6_dg_snd_attrs_t - * to do all extension headers in identical manner. - */ -#include <net/if.h> -#include <inet/ip6.h> #include <netinet/tcp.h> #include <netinet/ip_mroute.h> #include <inet/optcom.h> - - -extern int icmp_opt_default(queue_t *, int, int, uchar_t *); -extern int icmp_tpi_opt_get(queue_t *, int, int, uchar_t *); -extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); +#include <inet/rawip_impl.h> /* * Table of all known options handled on a ICMP protocol stack. 
@@ -63,250 +51,252 @@ extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, */ opdes_t icmp_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, #ifdef SO_PROTOTYPE /* * icmp will only allow IPPROTO_ICMP for non-privileged streams * that check is made on an adhoc basis. 
*/ -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, #endif -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, 
IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (struct in_addr), 0 /* INADDR_ANY */ }, -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */}, -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */ }, -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), 0 }, -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct 
ip_mreq_source), -1 }, { IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, { IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t), 0 /* disabled */ }, -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in_pktinfo), -1 /* not initialized */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, { MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (int), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), 0, -1 /* not initialized */ }, + OP_NODEFAULT, 0, -1 /* not initialized */ }, -{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct vifctl), -1 /* not initialized */ 
}, -{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (vifi_t), -1 /* not initialized */ }, -{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct mfcctl), -1 /* not initialized */ }, -{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct mfcctl), -1 /* not initialized */ }, -{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct 
group_source_req), -1 /* not initialized */ }, -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */}, + OP_DEF_FN, sizeof (int), -1 /* not initialized */}, -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, -{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), -1 }, { ICMP6_FILTER, IPPROTO_ICMPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN|OP_VARLEN, sizeof (icmp6_filter_t), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + 
(OP_NODEFAULT|OP_VARLEN), sizeof (int), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (int), -1 /* not initialized */ }, -{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (struct ip6_mtuinfo), -1 }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ 
IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, { MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), 
sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, }; @@ -342,9 +332,8 @@ uint_t icmp_max_optsize; /* initialized when ICMP driver is loaded */ optdb_obj_t icmp_opt_obj = { icmp_opt_default, /* ICMP default value function pointer */ - icmp_tpi_opt_get, /* ICMP get function pointer */ - icmp_tpi_opt_set, /* ICMP set function pointer */ - B_TRUE, /* ICMP is tpi provider */ + icmp_tpi_opt_get, /* ICMP get function pointer */ + icmp_tpi_opt_set, /* ICMP set function pointer */ ICMP_OPT_ARR_CNT, /* ICMP option database count of entries */ icmp_opt_arr, /* ICMP option database */ ICMP_VALID_LEVELS_CNT, /* ICMP valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index 5eff11af14..9e6b552a61 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -56,6 +56,7 @@ #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> @@ -66,9 +67,8 @@ #include <inet/ip_listutils.h> #include <netinet/igmp.h> +#include <inet/ip_ndp.h> #include <inet/ip_if.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> static uint_t igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill); static uint_t igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen); @@ -76,14 +76,13 @@ static uint_t mld_query_in(mld_hdr_t *mldh, ill_t *ill); static uint_t mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen); static void igmp_sendpkt(ilm_t *ilm, 
uchar_t type, ipaddr_t addr); static void mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr); -static void igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist); +static void igmpv3_sendrpt(ill_t *ill, mrec_t *reclist); static void mldv2_sendrpt(ill_t *ill, mrec_t *reclist); static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist, mrec_t *next); static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); -static void mcast_signal_restart_thread(ip_stack_t *ipst); /* * Macros used to do timer len conversions. Timer values are always @@ -122,11 +121,12 @@ static void mcast_signal_restart_thread(ip_stack_t *ipst); * The first multicast join will trigger the igmp timers / mld timers * The unit for next is milliseconds. */ -static void +void igmp_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; int ret; + timeout_id_t id; ASSERT(next != 0 && next != INFINITY); @@ -173,9 +173,10 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) mutex_exit(&ipst->ips_igmp_timer_lock); return; } + id = ipst->ips_igmp_timeout_id; mutex_exit(&ipst->ips_igmp_timer_lock); - ret = untimeout(ipst->ips_igmp_timeout_id); + ret = untimeout(id); mutex_enter(&ipst->ips_igmp_timer_lock); /* * The timeout was cancelled, or the timeout handler @@ -207,11 +208,12 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) * mld_start_timers: * The unit for next is milliseconds. 
*/ -static void +void mld_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; int ret; + timeout_id_t id; ASSERT(next != 0 && next != INFINITY); @@ -257,9 +259,10 @@ mld_start_timers(unsigned next, ip_stack_t *ipst) mutex_exit(&ipst->ips_mld_timer_lock); return; } + id = ipst->ips_mld_timeout_id; mutex_exit(&ipst->ips_mld_timer_lock); - ret = untimeout(ipst->ips_mld_timeout_id); + ret = untimeout(id); mutex_enter(&ipst->ips_mld_timer_lock); /* * The timeout was cancelled, or the timeout handler @@ -294,9 +297,8 @@ mld_start_timers(unsigned next, ip_stack_t *ipst) * Callers of igmp_input() may need to reinitialize variables that were copied * from the mblk as this calls pullupmsg(). */ -/* ARGSUSED */ mblk_t * -igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) +igmp_input(mblk_t *mp, ip_recv_attr_t *ira) { igmpa_t *igmpa; ipha_t *ipha = (ipha_t *)(mp->b_rptr); @@ -304,22 +306,22 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm_t *ilm; uint32_t src, dst; uint32_t group; + in6_addr_t v6group; uint_t next; ipif_t *ipif; - ip_stack_t *ipst; - ilm_walker_t ilw; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill != NULL); ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_total; mblklen = MBLKL(mp); - if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) { + iphlen = ira->ira_ip_hdr_length; + if (mblklen < 1 || mblklen < iphlen) { ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } - igmplen = ntohs(ipha->ipha_length) - iphlen; + igmplen = ira->ira_pktlen - iphlen; /* * Since msg sizes are more variable with v3, just pullup the * whole thing now. 
@@ -342,13 +344,6 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } - /* - * Validate checksum - */ - if (IP_CSUM(mp, iphlen, 0)) { - ++ipst->ips_igmpstat.igps_rcv_badsum; - goto bad_pkt; - } igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]); src = ipha->ipha_src; @@ -400,9 +395,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) 1, SL_TRACE, "igmp_input: we are only " - "member src 0x%x ipif_local 0x%x", - (int)ntohl(src), - (int)ntohl(ipif->ipif_lcl_addr)); + "member src 0x%x\n", + (int)ntohl(src)); } mutex_exit(&ill->ill_lock); return (mp); @@ -445,15 +439,18 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) * terminology, stop our timer for that group and 'clear * flag' i.e. mark as IGMP_OTHERMEMBER. */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ilm->ilm_addr == group) { - ++ipst->ips_igmpstat.igps_rcv_ourreports; - ilm->ilm_timer = INFINITY; - ilm->ilm_state = IGMP_OTHERMEMBER; - } - } - ilm_walker_finish(&ilw); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group)) + continue; + + ++ipst->ips_igmpstat.igps_rcv_ourreports; + ilm->ilm_timer = INFINITY; + ilm->ilm_state = IGMP_OTHERMEMBER; + } /* for */ + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); break; case IGMP_V3_MEMBERSHIP_REPORT: @@ -482,11 +479,11 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) int timer; uint_t next, current; ip_stack_t *ipst; - ilm_walker_t ilw; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; + rw_enter(&ill->ill_mcast_lock, RW_WRITER); /* * In the IGMPv2 specification, there are 3 states and a flag. * @@ -506,9 +503,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * Remember that the querier on this interface is old, * and set the timer to the value in RFC 1112. 
*/ - - - mutex_enter(&ill->ill_lock); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type != IGMP_V1_ROUTER) { @@ -517,13 +511,14 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = IGMP_V1_ROUTER; } - mutex_exit(&ill->ill_lock); timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY); if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) || igmpa->igmpa_group != 0) { ++ipst->ips_igmpstat.igps_rcv_badqueries; + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (0); } @@ -537,6 +532,8 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) group = igmpa->igmpa_group; if (group != 0 && (!CLASSD(group))) { ++ipst->ips_igmpstat.igps_rcv_badqueries; + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (0); } @@ -545,7 +542,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * ONLY IF current state is v3. Let things be if current * state if v1 but do reset the v2-querier-present timer. 
*/ - mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type == IGMP_V3_ROUTER) { ip1dbg(("Received IGMPv2 Query on %s, switching mode " "to IGMP_V2_ROUTER", ill->ill_name)); @@ -554,18 +550,15 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } ill->ill_mcast_v2_time = 0; ill->ill_mcast_v2_tset = 1; - mutex_exit(&ill->ill_lock); timer = DSEC_TO_MSEC((int)igmpa->igmpa_code); } if (ip_debug > 1) { - mutex_enter(&ill->ill_lock); (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "igmp_input: TIMER = igmp_code %d igmp_type 0x%x", (int)ntohs(igmpa->igmpa_code), (int)ntohs(igmpa->igmpa_type)); - mutex_exit(&ill->ill_lock); } /* @@ -582,11 +575,9 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) */ next = (unsigned)INFINITY; - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all @@ -608,8 +599,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } } } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. 
+ */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -623,7 +618,6 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; - ilm_walker_t ilw; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ @@ -636,6 +630,8 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ++ipst->ips_igmpstat.igps_rcv_queries; + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) { uint_t hdrval, mant, exp; hdrval = (uint_t)igmp3qa->igmp3qa_mxrc; @@ -669,12 +665,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) * sooner than the delay we calculated for this response, then * no action is required (RFC3376 section 5.2 rule 1) */ - mutex_enter(&ill->ill_lock); if (ill->ill_global_timer < (current + delay)) { - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (next); } - mutex_exit(&ill->ill_lock); /* * Now take action depending upon query type: @@ -687,16 +682,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]). 
*/ - mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; - mutex_exit(&ill->ill_lock); next = delay; - } else { /* group or group/source specific query */ - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || @@ -750,13 +740,21 @@ group_query: next = ilm->ilm_timer; ilm->ilm_timer += current; } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); } + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. + */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_joingroup(ilm_t *ilm) { @@ -764,27 +762,21 @@ igmp_joingroup(ilm_t *ilm) ill_t *ill; ip_stack_t *ipst = ilm->ilm_ipst; - ill = ilm->ilm_ipif->ipif_ill; + ill = ilm->ilm_ill; - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6); + ASSERT(!ill->ill_isv6); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); - mutex_enter(&ill->ill_lock); if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; - mutex_exit(&ill->ill_lock); } else { ip1dbg(("Querier mode %d, sending report, group %x\n", ill->ill_mcast_type, htonl(ilm->ilm_addr))); if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V3_ROUTER) { mrec_t *rp; mcast_record_t rtype; @@ 
-802,9 +794,7 @@ igmp_joingroup(ilm_t *ilm) ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rp); /* * Set up retransmission state. Timer is set below, * for both v3 and older versions. @@ -820,35 +810,33 @@ igmp_joingroup(ilm_t *ilm) timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); /* - * We need to restart the IGMP timers, but we can't do it here - * since we're inside the IPSQ and thus igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for a running timeout - * to finish) could deadlock with igmp_timeout_handler() -> - * ipsq_enter() (running the timeout, waiting to get inside - * the IPSQ). We also can't just delay it until after we - * ipsq_exit() since we could be inside more than one IPSQ and - * thus still have the other IPSQs pinned after we exit -- and - * igmp_start_timers() may be trying to enter one of those. - * Instead, signal a dedicated thread that will do it for us. + * We are holding ill_mcast_lock here and the timeout + * handler (igmp_timeout_handler_per_ill) acquires that + * lock. Hence we can't call igmp_start_timer since it could + * deadlock in untimeout(). + * Instead the thread which drops ill_mcast_lock will have + * to call ill_mcast_timer_start(). 
*/ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); - mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { - (void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE, + (void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE, "igmp_joingroup: multicast_type %d timer %d", - (ilm->ilm_ipif->ipif_ill->ill_mcast_type), + (ilm->ilm_ill->ill_mcast_type), (int)ntohl(timer)); } } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void mld_joingroup(ilm_t *ilm) { @@ -858,19 +846,16 @@ mld_joingroup(ilm_t *ilm) ill = ilm->ilm_ill; - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6); + ASSERT(ill->ill_isv6); + + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); - mutex_enter(&ill->ill_lock); if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; - mutex_exit(&ill->ill_lock); } else { if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); } else { mrec_t *rp; mcast_record_t rtype; @@ -888,9 +873,7 @@ mld_joingroup(ilm_t *ilm) ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); - mutex_enter(&ill->ill_lock); /* * Set up retransmission state. Timer is set below, * for both v2 and v1. @@ -909,17 +892,19 @@ mld_joingroup(ilm_t *ilm) timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); /* - * Signal another thread to restart the timers. See the - * comment in igmp_joingroup() for details. 
+ * We are holding ill_mcast_lock here and the timeout + * handler (mld_timeout_handler_per_ill) acquires that + * lock. Hence we can't call mld_start_timer since it could + * deadlock in untimeout(). + * Instead the thread which drops ill_mcast_lock will have + * to call ill_mcast_timer_start(). */ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); - mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -930,23 +915,26 @@ mld_joingroup(ilm_t *ilm) } } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_leavegroup(ilm_t *ilm) { - ill_t *ill = ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; - ASSERT(ilm->ilm_ill == NULL); ASSERT(!ill->ill_isv6); - mutex_enter(&ill->ill_lock); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == IGMP_V2_ROUTER && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP, (htonl(INADDR_ALLRTRS_GROUP))); return; - } else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) && + } + if ((ill->ill_mcast_type == IGMP_V3_ROUTER) && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { mrec_t *rp; /* @@ -965,29 +953,30 @@ igmp_leavegroup(ilm_t *ilm) rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); + igmpv3_sendrpt(ill, rp); return; } - mutex_exit(&ill->ill_lock); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. 
+ */ void mld_leavegroup(ilm_t *ilm) { ill_t *ill = ilm->ilm_ill; - ASSERT(ilm->ilm_ipif == NULL); ASSERT(ill->ill_isv6); - mutex_enter(&ill->ill_lock); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == MLD_V1_ROUTER && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast); return; - } else if ((ill->ill_mcast_type == MLD_V2_ROUTER) && + } + if ((ill->ill_mcast_type == MLD_V2_ROUTER) && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { mrec_t *rp; /* @@ -1006,13 +995,15 @@ mld_leavegroup(ilm_t *ilm) rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); return; } - mutex_exit(&ill->ill_lock); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { @@ -1023,17 +1014,11 @@ igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) ASSERT(ilm != NULL); /* state change reports should only be sent if the router is v3 */ - if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER) + if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER) return; - if (ilm->ilm_ill == NULL) { - ASSERT(ilm->ilm_ipif != NULL); - ill = ilm->ilm_ipif->ipif_ill; - } else { - ill = ilm->ilm_ill; - } - - mutex_enter(&ill->ill_lock); + ill = ilm->ilm_ill; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* * Compare existing(old) state with the new state and prepare @@ -1089,8 +1074,7 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, signal a thread to restart it -- see the comment in - * igmp_joingroup() for details. 
+ * running, the caller will start it when dropping ill_mcast_lock. */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { @@ -1102,13 +1086,15 @@ send_to_in: ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); - mcast_signal_restart_thread(ipst); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); + igmpv3_sendrpt(ill, rp); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { @@ -1119,11 +1105,10 @@ mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) ASSERT(ilm != NULL); ill = ilm->ilm_ill; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* only need to send if we have an mldv2-capable router */ - mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type != MLD_V2_ROUTER) { - mutex_exit(&ill->ill_lock); return; } @@ -1179,8 +1164,7 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, signal a thread to restart it -- see the comment in - * igmp_joingroup() for details. + * running, the caller will start it when dropping ill_mcast_lock. 
*/ rp = mcast_merge_rtx(ilm, rp, flist); ASSERT(ilm->ilm_rtx.rtx_cnt > 0); @@ -1193,10 +1177,8 @@ send_to_in: MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_mld_timer_lock); - mcast_signal_restart_thread(ipst); } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); } @@ -1205,15 +1187,12 @@ igmp_timeout_handler_per_ill(ill_t *ill) { uint_t next = INFINITY, current; ilm_t *ilm; - ipif_t *ipif; mrec_t *rp = NULL; mrec_t *rtxrp = NULL; rtx_state_t *rtxp; mcast_record_t rtype; - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); current = CURRENT_MSTIME; /* First check the global timer on this interface */ @@ -1230,10 +1209,8 @@ igmp_timeout_handler_per_ill(ill_t *ill) for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) continue; - ASSERT(ilm->ilm_ipif != NULL); - ilm->ilm_ipif->ipif_igmp_rpt = - mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, - ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt); + rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, + ilm->ilm_filter, rp); /* * Since we're sending a report on this group, okay * to delete pending group-specific timers. Note @@ -1245,20 +1222,8 @@ igmp_timeout_handler_per_ill(ill_t *ill) FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } - /* - * We've built per-ipif mrec lists; walk the ill's ipif list - * and send a report for each ipif that has an mrec list. 
- */ - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_igmp_rpt == NULL) - continue; - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt); - mutex_enter(&ill->ill_lock); - /* mrec list was freed by igmpv3_sendrpt() */ - ipif->ipif_igmp_rpt = NULL; - } + igmpv3_sendrpt(ill, rp); + rp = NULL; } else { if ((ill->ill_global_timer - current) < next) next = ill->ill_global_timer - current; @@ -1288,13 +1253,9 @@ per_ilm_timer: ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else { slist_t *rsp; if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) && @@ -1325,9 +1286,7 @@ per_ilm_timer: rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, ilm->ilm_filter, rp); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ill->ill_ipif, rp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rp); rp = NULL; } @@ -1345,14 +1304,11 @@ per_ilm_rtxtimer: rtxp->rtx_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); continue; - } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); + } + if (ill->ill_mcast_type == IGMP_V2_ROUTER) { igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); continue; } @@ -1393,13 +1349,14 @@ per_ilm_rtxtimer: CLEAR_SLIST(rtxp->rtx_allow); CLEAR_SLIST(rtxp->rtx_block); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rtxrp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rtxrp); rtxrp = NULL; } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + /* 
Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + /* Defer ill_mcast_timer_start() until the caller is done */ return (next); } @@ -1411,17 +1368,15 @@ per_ilm_rtxtimer: * * As part of multicast join and leave igmp we may need to send out an * igmp request. The igmp related state variables in the ilm are protected - * by ill_lock. A single global igmp timer is used to track igmp timeouts. + * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts. * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers * starts the igmp timer if needed. It serializes multiple threads trying to * simultaneously start the timer using the igmp_timer_setter_active flag. * * igmp_input() receives igmp queries and responds to the queries * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers(). - * Later the igmp_timer fires, the timeout handler igmp_timeout_handler() - * performs the action exclusively after entering each ill's ipsq as writer. - * (The need to enter the IPSQ is largely historical but there are still some - * fields like ilm_filter that rely on it.) + * Later the igmp_timer fires, the timeout handler igmp_timerout_handler() + * performs the action exclusively after acquiring ill_mcast_lock. * * The igmp_slowtimeo() function is called thru another timer. 
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id @@ -1433,12 +1388,12 @@ igmp_timeout_handler(void *arg) uint_t global_next = INFINITY; uint_t next; ill_walk_context_t ctx; - boolean_t success; ip_stack_t *ipst = arg; ASSERT(arg != NULL); mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); + ipst->ips_igmp_timeout_id = 0; ipst->ips_igmp_timer_scheduled_last = 0; ipst->ips_igmp_time_to_next = 0; mutex_exit(&ipst->ips_igmp_timer_lock); @@ -1447,31 +1402,17 @@ igmp_timeout_handler(void *arg) ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(!ill->ill_isv6); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - */ - if (!ill_waiter_inc(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE, NEW_OP); - if (success) { - next = igmp_timeout_handler_per_ill(ill); - if (next < global_next) - global_next = next; - ipsq_exit(ill->ill_phyint->phyint_ipsq); - } + next = igmp_timeout_handler_per_ill(ill); + if (next < global_next) + global_next = next; + ill_refrele(ill); rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_waiter_dcr(ill); } rw_exit(&ipst->ips_ill_g_lock); - - mutex_enter(&ipst->ips_igmp_timer_lock); - ASSERT(ipst->ips_igmp_timeout_id != 0); - ipst->ips_igmp_timeout_id = 0; - mutex_exit(&ipst->ips_igmp_timer_lock); - if (global_next != INFINITY) igmp_start_timers(global_next, ipst); } @@ -1481,7 +1422,6 @@ igmp_timeout_handler(void *arg) * Called when there are timeout events, every next (tick). * Returns number of ticks to next event (or 0 if none). 
*/ -/* ARGSUSED */ uint_t mld_timeout_handler_per_ill(ill_t *ill) { @@ -1491,9 +1431,7 @@ mld_timeout_handler_per_ill(ill_t *ill) rtx_state_t *rtxp; mcast_record_t rtype; - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); current = CURRENT_MSTIME; /* @@ -1528,9 +1466,7 @@ mld_timeout_handler_per_ill(ill_t *ill) FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); - mutex_enter(&ill->ill_lock); } else { if ((ill->ill_global_timer - current) < next) next = ill->ill_global_timer - current; @@ -1561,9 +1497,7 @@ per_ilm_timer: ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); } else { slist_t *rsp; if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) && @@ -1605,9 +1539,7 @@ per_ilm_rtxtimer: rtxp->rtx_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); continue; } @@ -1651,13 +1583,13 @@ per_ilm_rtxtimer: } if (ill->ill_mcast_type == MLD_V2_ROUTER) { - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); mldv2_sendrpt(ill, rtxrp); - return (next); } - - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + /* Defer ill_mcast_timer_start() until the caller is done */ return (next); } @@ -1675,12 +1607,12 @@ mld_timeout_handler(void *arg) uint_t global_next = INFINITY; uint_t next; ill_walk_context_t ctx; - boolean_t success; ip_stack_t *ipst = arg; ASSERT(arg != NULL); mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); + ipst->ips_mld_timeout_id = 0; ipst->ips_mld_timer_scheduled_last = 0; ipst->ips_mld_time_to_next = 0; 
mutex_exit(&ipst->ips_mld_timer_lock); @@ -1689,31 +1621,17 @@ mld_timeout_handler(void *arg) ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(ill->ill_isv6); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - */ - if (!ill_waiter_inc(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE, NEW_OP); - if (success) { - next = mld_timeout_handler_per_ill(ill); - if (next < global_next) - global_next = next; - ipsq_exit(ill->ill_phyint->phyint_ipsq); - } + next = mld_timeout_handler_per_ill(ill); + if (next < global_next) + global_next = next; + ill_refrele(ill); rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_waiter_dcr(ill); } rw_exit(&ipst->ips_ill_g_lock); - - mutex_enter(&ipst->ips_mld_timer_lock); - ASSERT(ipst->ips_mld_timeout_id != 0); - ipst->ips_mld_timeout_id = 0; - mutex_exit(&ipst->ips_mld_timer_lock); - if (global_next != INFINITY) mld_start_timers(global_next, ipst); } @@ -1743,8 +1661,6 @@ igmp_slowtimo(void *arg) ip_stack_t *ipst = (ip_stack_t *)arg; ASSERT(arg != NULL); - /* Hold the ill_g_lock so that we can safely walk the ill list */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* * The ill_if_t list is circular, hence the odd loop parameters. @@ -1754,6 +1670,7 @@ igmp_slowtimo(void *arg) * structure (allowing us to skip if none of the instances have timers * running). */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); for (ifp = IP_V4_ILL_G_LIST(ipst); ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst); ifp = ifp->illif_next) { @@ -1768,7 +1685,11 @@ igmp_slowtimo(void *arg) avl_tree = &ifp->illif_avl_by_ppa; for (ill = avl_first(avl_tree); ill != NULL; ill = avl_walk(avl_tree, ill, AVL_AFTER)) { - mutex_enter(&ill->ill_lock); + /* Make sure the ill isn't going away. 
*/ + if (!ill_check_and_refhold(ill)) + continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_mcast_v1_tset == 1) ill->ill_mcast_v1_time++; if (ill->ill_mcast_v2_tset == 1) @@ -1808,10 +1729,13 @@ igmp_slowtimo(void *arg) ill->ill_mcast_v2_tset = 0; atomic_add_16(&ifp->illif_mcast_v2, -1); } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } } rw_exit(&ipst->ips_ill_g_lock); + ill_mcast_timer_start(ipst); mutex_enter(&ipst->ips_igmp_slowtimeout_lock); ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); @@ -1826,7 +1750,6 @@ igmp_slowtimo(void *arg) * Check for ips_mld_max_version ensures that we don't revert to a higher * IGMP version than configured. */ -/* ARGSUSED */ void mld_slowtimo(void *arg) { @@ -1847,7 +1770,11 @@ mld_slowtimo(void *arg) avl_tree = &ifp->illif_avl_by_ppa; for (ill = avl_first(avl_tree); ill != NULL; ill = avl_walk(avl_tree, ill, AVL_AFTER)) { - mutex_enter(&ill->ill_lock); + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) + continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_mcast_v1_tset == 1) ill->ill_mcast_v1_time++; if ((ill->ill_mcast_type == MLD_V1_ROUTER) && @@ -1861,10 +1788,13 @@ mld_slowtimo(void *arg) ill->ill_mcast_v1_tset = 0; atomic_add_16(&ifp->illif_mcast_v1, -1); } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } } rw_exit(&ipst->ips_ill_g_lock); + ill_mcast_timer_start(ipst); mutex_enter(&ipst->ips_mld_slowtimeout_lock); ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); @@ -1873,9 +1803,7 @@ mld_slowtimo(void *arg) /* * igmp_sendpkt: - * This will send to ip_wput like icmp_inbound. 
- * Note that the lower ill (on which the membership is kept) is used - * as an upper ill to pass in the multicast parameters. + * This will send to ip_output_simple just like icmp_inbound. */ static void igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) @@ -1886,51 +1814,16 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) ipha_t *ipha; int hdrlen = sizeof (ipha_t) + RTRALERT_LEN; size_t size = hdrlen + sizeof (igmpa_t); - ipif_t *ipif = ilm->ilm_ipif; - ill_t *ill = ipif->ipif_ill; - mblk_t *first_mp; - ipsec_out_t *io; - zoneid_t zoneid; + ill_t *ill = ilm->ilm_ill; ip_stack_t *ipst = ill->ill_ipst; - /* - * We need to make sure this packet goes out on an ipif. If - * there is some global policy match in ip_wput_ire, we need - * to get to the right interface after IPSEC processing. - * To make sure this multicast packet goes out on the right - * interface, we attach an ipsec_out and initialize ill_index - * like we did in ip_wput. To make sure that this packet does - * not get forwarded on other interfaces or looped back, we - * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop - * to B_FALSE. 
- */ - first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) - return; - - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - /* ipsec_out_secure is B_FALSE now */ - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_multicast_loop = B_FALSE; - io->ipsec_out_dontroute = B_TRUE; - if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - io->ipsec_out_ns = ipst->ips_netstack; /* No netstack_hold */ + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); mp = allocb(size, BPRI_HI); if (mp == NULL) { - freemsg(first_mp); return; } mp->b_wptr = mp->b_rptr + size; - first_mp->b_cont = mp; ipha = (ipha_t *)mp->b_rptr; rtralert = (uint8_t *)&(ipha[1]); @@ -1956,53 +1849,38 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) ipha->ipha_protocol = IPPROTO_IGMP; ipha->ipha_hdr_checksum = 0; ipha->ipha_dst = addr ? addr : igmpa->igmpa_group; - ipha->ipha_src = ipif->ipif_src_addr; - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing demon can hear it. - */ - /* - * This will run multiple times for the same group if there are members - * on the same group for multiple ipif's on the same ill. The - * igmp_input code will suppress this due to the loopback thus we - * always loopback membership report. - */ - ASSERT(ill->ill_rq != NULL); - ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid); + ipha->ipha_src = INADDR_ANY; - ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid); + ill_mcast_queue(ill, mp); ++ipst->ips_igmpstat.igps_snd_reports; } /* - * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated - * with the passed-in ipif. 
The report will contain one group record + * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill. + * The report will contain one group record * for each element of reclist. If this causes packet length to - * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent. + * exceed ill->ill_mtu, multiple reports are sent. * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(), * and those buffers are freed here. */ static void -igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) +igmpv3_sendrpt(ill_t *ill, mrec_t *reclist) { - ipsec_out_t *io; igmp3ra_t *igmp3ra; grphdra_t *grphdr; - mblk_t *first_mp, *mp; + mblk_t *mp; ipha_t *ipha; uint8_t *rtralert; ipaddr_t *src_array; int i, j, numrec, more_src_cnt; size_t hdrsize, size, rsize; - ill_t *ill = ipif->ipif_ill; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; - zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* if there aren't any records, there's nothing to send */ if (reclist == NULL) @@ -2018,7 +1896,7 @@ nextpkt: for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) { rsize = sizeof (grphdra_t) + (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t)); - if (size + rsize > ill->ill_max_frag) { + if (size + rsize > ill->ill_mtu) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big @@ -2029,7 +1907,7 @@ nextpkt: * other types). */ int srcspace, srcsperpkt; - srcspace = ill->ill_max_frag - (size + + srcspace = ill->ill_mtu - (size + sizeof (grphdra_t)); /* @@ -2082,37 +1960,12 @@ nextpkt: numrec++; } - /* - * See comments in igmp_sendpkt() about initializing for ipsec and - * load balancing requirements. 
- */ - first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) - goto free_reclist; - - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - /* ipsec_out_secure is B_FALSE now */ - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_multicast_loop = B_FALSE; - io->ipsec_out_dontroute = B_TRUE; - if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - mp = allocb(size, BPRI_HI); if (mp == NULL) { - freemsg(first_mp); goto free_reclist; } bzero((char *)mp->b_rptr, size); mp->b_wptr = (uchar_t *)(mp->b_rptr + size); - first_mp->b_cont = mp; ipha = (ipha_t *)mp->b_rptr; rtralert = (uint8_t *)&(ipha[1]); @@ -2149,21 +2002,9 @@ nextpkt: ipha->ipha_ttl = IGMP_TTL; ipha->ipha_protocol = IPPROTO_IGMP; ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP); - ipha->ipha_src = ipif->ipif_src_addr; + ipha->ipha_src = INADDR_ANY; - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing daemon can hear it. - * - * This will run multiple times for the same group if there are - * members on the same group for multiple ipifs on the same ill. - * The igmp_input code will suppress this due to the loopback; - * thus we always loopback membership report. - */ - ASSERT(ill->ill_rq != NULL); - ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid); - - ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid); + ill_mcast_queue(ill, mp); ++ipst->ips_igmpstat.igps_snd_reports; @@ -2190,21 +2031,24 @@ free_reclist: /* * mld_input: + * Return NULL for a bad packet that is discarded here. + * Return mp if the message is OK and should be handed to "raw" receivers. 
+ * Callers of mld_input() may need to reinitialize variables that were copied + * from the mblk as this calls pullupmsg(). */ -/* ARGSUSED */ -void -mld_input(queue_t *q, mblk_t *mp, ill_t *ill) +mblk_t * +mld_input(mblk_t *mp, ip_recv_attr_t *ira) { ip6_t *ip6h = (ip6_t *)(mp->b_rptr); mld_hdr_t *mldh; ilm_t *ilm; ipif_t *ipif; uint16_t hdr_length, exthdr_length; - in6_addr_t *v6group_ptr, *lcladdr_ptr; + in6_addr_t *v6group_ptr; uint_t next; int mldlen; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); @@ -2212,30 +2056,26 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } if (ip6h->ip6_hlim != 1) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit); freemsg(mp); - return; + return (NULL); } /* Get to the icmp header part */ - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { - hdr_length = ip_hdr_length_v6(mp, ip6h); - exthdr_length = hdr_length - IPV6_HDR_LEN; - } else { - hdr_length = IPV6_HDR_LEN; - exthdr_length = 0; - } + hdr_length = ira->ira_ip_hdr_length; + exthdr_length = hdr_length - IPV6_HDR_LEN; + mldlen = ntohs(ip6h->ip6_plen) - exthdr_length; /* An MLD packet must at least be 24 octets to be valid */ if (mldlen < MLD_MINLEN) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]); @@ -2254,50 +2094,41 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) } else { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } if (next == 0) { - freemsg(mp); - return; + return (mp); } if (next != INFINITY) mld_start_timers(next, ipst); break; - case MLD_LISTENER_REPORT: { - - ASSERT(ill->ill_ipif != NULL); + case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for 
this group. Reports * generated by us are looped back since we could potentially * be a multicast router, so discard reports sourced by me. */ - lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - lcladdr_ptr)) { + &ip6h->ip6_src)) { if (ip_debug > 1) { char buf1[INET6_ADDRSTRLEN]; - char buf2[INET6_ADDRSTRLEN]; (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "mld_input: we are only " - "member src %s ipif_local %s", - inet_ntop(AF_INET6, lcladdr_ptr, - buf1, sizeof (buf1)), - inet_ntop(AF_INET6, - &ipif->ipif_v6lcl_addr, - buf2, sizeof (buf2))); + "member src %s\n", + inet_ntop(AF_INET6, &ip6h->ip6_src, + buf1, sizeof (buf1))); } mutex_exit(&ill->ill_lock); - freemsg(mp); - return; + return (mp); } } mutex_exit(&ill->ill_lock); @@ -2308,9 +2139,10 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadReports); freemsg(mp); - return; + return (NULL); } + /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer @@ -2319,8 +2151,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, @@ -2329,23 +2161,19 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. 
+ */ + ill_mcast_timer_start(ill->ill_ipst); break; - } + case MLD_LISTENER_REDUCTION: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions); break; } - /* - * All MLD packets have already been passed up to any - * process(es) listening on a ICMP6 raw socket. This - * has been accomplished in ip_deliver_local_v6 prior to - * this function call. It is assumed that the multicast daemon - * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the - * ICMP6_FILTER socket option to only receive the MLD messages) - * Thus we can free the MLD message block here - */ - freemsg(mp); + return (mp); } /* @@ -2359,7 +2187,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) int timer; uint_t next, current; in6_addr_t *v6group; - ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); @@ -2383,7 +2210,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) } /* Need to do compatibility mode checking */ - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type == MLD_V2_ROUTER) { @@ -2392,7 +2219,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = MLD_V1_ROUTER; } - mutex_exit(&ill->ill_lock); timer = (int)ntohs(mldh->mld_maxdelay); if (ip_debug > 1) { @@ -2415,11 +2241,8 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) */ next = INFINITY; - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || @@ -2434,9 +2257,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) /* Respond immediately */ ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); 
break; } if (ilm->ilm_timer > timer) { @@ -2448,8 +2269,10 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) break; } } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -2466,7 +2289,6 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; - ilm_walker_t ilw; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); @@ -2514,12 +2336,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) * sooner than the delay we calculated for this response, then * no action is required (MLDv2 draft section 6.2 rule 1) */ - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_global_timer < (current + delay)) { - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); return (next); } - mutex_exit(&ill->ill_lock); /* * Now take action depending on query type: general, @@ -2532,16 +2353,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]) */ - mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; - mutex_exit(&ill->ill_lock); next = delay; - } else { /* group or group/source specific query */ - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || @@ -2595,9 +2411,13 @@ group_query: ilm->ilm_timer += current; break; } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); } + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * 
ill_mcast_send_queued is needed. + */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -2615,7 +2435,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); ill_t *ill = ilm->ilm_ill; - ipif_t *ipif; + + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* * We need to place a router alert option in this packet. The length @@ -2663,35 +2484,20 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) else ip6h->ip6_dst = *v6addr; - /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ - if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) { - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } else { - /* Otherwise, use IPv6 default address selection. */ - ip6h->ip6_src = ipv6_all_zeros; - } - + ip6h->ip6_src = ipv6_all_zeros; /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output. */ mldh->mld_cksum = htons(sizeof (*mldh)); - /* - * ip_wput will automatically loopback the multicast packet to - * the conn if multicast loopback is enabled. - * The MIB stats corresponding to this outgoing MLD packet - * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 - * ->icmp_update_out_mib_v6 function call. - */ - (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); + ill_mcast_queue(ill, mp); } /* * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill. The * report will contain one multicast address record for each element of - * reclist. If this causes packet length to exceed ill->ill_max_frag, + * reclist. If this causes packet length to exceed ill->ill_mtu, * multiple reports are sent. reclist is assumed to be made up of * buffers allocated by mcast_bldmrec(), and those buffers are freed here. 
*/ @@ -2706,19 +2512,17 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; - ipif_t *ipif; int i, numrec, more_src_cnt; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; - ASSERT(IAM_WRITER_ILL(ill)); - /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; ASSERT(ill->ill_isv6); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* * Total option length (optlen + padlen) must be a multiple of @@ -2737,7 +2541,7 @@ nextpkt: rp = rp->mrec_next, numrec++) { rsize = sizeof (mld2mar_t) + (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t)); - if (size + rsize > ill->ill_max_frag) { + if (size + rsize > ill->ill_mtu) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big @@ -2748,7 +2552,7 @@ nextpkt: * other types). */ int srcspace, srcsperpkt; - srcspace = ill->ill_max_frag - + srcspace = ill->ill_mtu - (size + sizeof (mld2mar_t)); /* @@ -2819,14 +2623,7 @@ nextpkt: ip6h->ip6_nxt = IPPROTO_HOPOPTS; ip6h->ip6_hops = MLD_HOP_LIMIT; ip6h->ip6_dst = ipv6_all_v2rtrs_mcast; - /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ - if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) { - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } else { - /* otherwise, use IPv6 default address selection. */ - ip6h->ip6_src = ipv6_all_zeros; - } + ip6h->ip6_src = ipv6_all_zeros; ip6hbh->ip6h_nxt = IPPROTO_ICMPV6; /* @@ -2844,7 +2641,7 @@ nextpkt: mld2r->mld2r_nummar = htons(numrec); /* * Prepare for the checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output_simple. 
*/ mld2r->mld2r_cksum = htons(icmpsize); @@ -2861,14 +2658,7 @@ nextpkt: mld2mar = (mld2mar_t *)&(srcarray[i]); } - /* - * ip_wput will automatically loopback the multicast packet to - * the conn if multicast loopback is enabled. - * The MIB stats corresponding to this outgoing MLD packet - * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 - * ->icmp_update_out_mib_v6 function call. - */ - (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); + ill_mcast_queue(ill, mp); if (morepkts) { if (more_src_cnt > 0) { @@ -2997,7 +2787,7 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) mrec_t *rp, *rpnext, *rtnmrec; boolean_t ovf; - ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill); + ill = ilm->ilm_ill; if (mreclist == NULL) return (mreclist); @@ -3100,64 +2890,3 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) return (rtnmrec); } - -/* - * Convenience routine to signal the restart-timer thread. - */ -static void -mcast_signal_restart_thread(ip_stack_t *ipst) -{ - mutex_enter(&ipst->ips_mrt_lock); - ipst->ips_mrt_flags |= IP_MRT_RUN; - cv_signal(&ipst->ips_mrt_cv); - mutex_exit(&ipst->ips_mrt_lock); -} - -/* - * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for - * the story behind this unfortunate thread. 
- */ -void -mcast_restart_timers_thread(ip_stack_t *ipst) -{ - int next; - char name[64]; - callb_cpr_t cprinfo; - - (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d", - ipst->ips_netstack->netstack_stackid); - CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name); - - for (;;) { - mutex_enter(&ipst->ips_mrt_lock); - while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock); - CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock); - } - if (ipst->ips_mrt_flags & IP_MRT_STOP) - break; - ipst->ips_mrt_flags &= ~IP_MRT_RUN; - mutex_exit(&ipst->ips_mrt_lock); - - mutex_enter(&ipst->ips_igmp_timer_lock); - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - - mutex_enter(&ipst->ips_mld_timer_lock); - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - if (next != INFINITY) - mld_start_timers(next, ipst); - } - - ipst->ips_mrt_flags |= IP_MRT_DONE; - cv_signal(&ipst->ips_mrt_done_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */ - thread_exit(); -} diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index ebb89e3172..b59087e9b1 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -38,6 +38,7 @@ #include <sys/tihdr.h> #include <sys/xti_inet.h> #include <sys/ddi.h> +#include <sys/suntpi.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/kobj.h> @@ -94,10 +95,8 @@ #include <inet/ipp_common.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> -#include <sys/iphada.h> #include <inet/iptun/iptun_impl.h> #include <inet/ipdrop.h> #include <inet/ip_netinfo.h> @@ -111,9 +110,7 @@ #include <ipp/ipp_impl.h> #include 
<ipp/ipgpc/ipgpc.h> -#include <sys/multidata.h> #include <sys/pattr.h> - #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> #include <inet/sctp/sctp_impl.h> @@ -126,6 +123,7 @@ #include <rpc/pmap_prot.h> #include <sys/squeue_impl.h> +#include <inet/ip_arp.h> /* * Values for squeue switch: @@ -133,10 +131,9 @@ * IP_SQUEUE_ENTER: SQ_PROCESS * IP_SQUEUE_FILL: SQ_FILL */ -int ip_squeue_enter = 2; /* Setable in /etc/system */ +int ip_squeue_enter = IP_SQUEUE_ENTER; /* Setable in /etc/system */ int ip_squeue_flag; -#define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) /* * Setable in /etc/system @@ -177,7 +174,8 @@ typedef struct iproutedata_s { listptr_t ird_attrs; /* ipRouteAttributeTable */ } iproutedata_t; -#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */ +/* Include ire_testhidden and IRE_IF_CLONE routes */ +#define IRD_REPORT_ALL 0x01 /* * Cluster specific hooks. These should be NULL when booted as a non-cluster @@ -233,29 +231,26 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * MT level protection given by STREAMS. IP uses a combination of its own * internal serialization mechanism and standard Solaris locking techniques. * The internal serialization is per phyint. This is used to serialize - * plumbing operations, certain multicast operations, most set ioctls, - * igmp/mld timers etc. + * plumbing operations, IPMP operations, most set ioctls, etc. * * Plumbing is a long sequence of operations involving message * exchanges between IP, ARP and device drivers. Many set ioctls are typically * involved in plumbing operations. A natural model is to serialize these * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in * parallel without any interference. But various set ioctls on hme0 are best - * serialized, along with multicast join/leave operations, igmp/mld timer - * operations, and processing of DLPI control messages received from drivers - * on a per phyint basis. 
This serialization is provided by the ipsq_t and - * primitives operating on this. Details can be found in ip_if.c above the - * core primitives operating on ipsq_t. + * serialized, along with IPMP operations and processing of DLPI control + * messages received from drivers on a per phyint basis. This serialization is + * provided by the ipsq_t and primitives operating on this. Details can + * be found in ip_if.c above the core primitives operating on ipsq_t. * * Lookups of an ipif or ill by a thread return a refheld ipif / ill. * Simiarly lookup of an ire by a thread also returns a refheld ire. * In addition ipif's and ill's referenced by the ire are also indirectly - * refheld. Thus no ipif or ill can vanish nor can critical parameters like - * the ipif's address or netmask change as long as an ipif is refheld + * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only - * 1 such exclusive operation proceeds at any time on the ipif. It then - * deletes all ires associated with this ipif, and waits for all refcnts + * one such exclusive operation proceeds at any time on the ipif. It then + * waits for all refcnts * associated with this ipif to come down to zero. The address is changed * only after the ipif has been quiesced. Then the ipif is brought up again. * More details are described above the comment in ip_sioctl_flags. @@ -274,7 +269,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * - ire_lock to protect some of the fields of the ire, IRE tables * (one lock per hash bucket). Refer to ip_ire.c for details. * - * - ndp_g_lock and nce_lock for protecting NCEs. + * - ndp_g_lock and ncec_lock for protecting NCEs. * * - ill_lock protects fields of the ill and ipif. 
Details in ip.h * @@ -312,12 +307,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the * uniqueness check also done atomically. * - * - ipsec_capab_ills_lock: This readers/writer lock protects the global - * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken - * as a writer when adding or deleting elements from these lists, and - * as a reader when walking these lists to send a SADB update to the - * IPsec capable ills. - * * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc * group list linked by ill_usesrc_grp_next. It also protects the * ill_usesrc_ifindex field. It is taken as a writer when a member of the @@ -357,20 +346,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock * ill_g_lock -> ill_lock(s) -> phyint_lock - * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock + * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock * ill_g_lock -> ip_addr_avail_lock * conn_lock -> irb_lock -> ill_lock -> ire_lock * ill_g_lock -> ip_g_nd_lock + * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock + * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock + * arl_lock -> ill_lock + * ips_ire_dep_lock -> irb_lock * * When more than 1 ill lock is needed to be held, all ill lock addresses * are sorted on address and locked starting from highest addressed lock * downward. 
* + * Multicast scenarios + * ips_ill_g_lock -> ill_mcast_lock + * conn_ilg_lock -> ips_ill_g_lock -> ill_lock + * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock + * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock + * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock + * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock + * * IPsec scenarios * * ipsa_lock -> ill_g_lock -> ill_lock - * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock - * ipsec_capab_ills_lock -> ipsa_lock * ill_g_usesrc_lock -> ill_g_lock -> ill_lock * * Trusted Solaris scenarios @@ -414,31 +413,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * Walker - Increment irb_refcnt before calling the walker callback. Hold the * global tree lock (read mode) for traversal. * + * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele + * hence we will acquire irb_lock while holding ips_ire_dep_lock. + * * IPsec notes : * - * IP interacts with the IPsec code (AH/ESP) by tagging a M_CTL message - * in front of the actual packet. For outbound datagrams, the M_CTL - * contains a ipsec_out_t (defined in ipsec_info.h), which has the + * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes + * in the ip_xmit_attr_t ip_recv_attr_t. For outbound datagrams, the + * ip_xmit_attr_t has the * information used by the IPsec code for applying the right level of - * protection. The information initialized by IP in the ipsec_out_t + * protection. The information initialized by IP in the ip_xmit_attr_t * is determined by the per-socket policy or global policy in the system. - * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in - * ipsec_info.h) which starts out with nothing in it. It gets filled + * For inbound datagrams, the ip_recv_attr_t + * starts out with nothing in it. 
It gets filled * with the right information if it goes through the AH/ESP code, which * happens if the incoming packet is secure. The information initialized - * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether + * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether * the policy requirements needed by per-socket policy or global policy * is met or not. * - * If there is both per-socket policy (set using setsockopt) and there - * is also global policy match for the 5 tuples of the socket, - * ipsec_override_policy() makes the decision of which one to use. - * * For fully connected sockets i.e dst, src [addr, port] is known, * conn_policy_cached is set indicating that policy has been cached. * conn_in_enforce_policy may or may not be set depending on whether * there is a global policy match or per-socket policy match. - * Policy inheriting happpens in ip_bind during the ipa_conn_t bind. + * Policy inheriting happpens in ip_policy_set once the destination is known. * Once the right policy is set on the conn_t, policy cannot change for * this socket. This makes life simpler for TCP (UDP ?) where * re-transmissions go out with the same policy. For symmetry, policy @@ -513,7 +511,8 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is * called passing idl_tx_list. The connp gets inserted in a drain list * pointed to by idl_tx_list. conn_drain_list() asserts flow control for - * the sockets (non stream based) and sets QFULL condition for conn_wq. + * the sockets (non stream based) and sets QFULL condition on the conn_wq + * of streams sockets, or the su_txqfull for non-streams sockets. * connp->conn_direct_blocked will be set to indicate the blocked * condition. * @@ -521,46 +520,37 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * A cookie is passed in the call to ill_flow_enable() that identifies the * blocked Tx ring. 
This cookie is used to get to the idl_tx_list that * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t - * and goes through each of the drain list (q)enabling the conn_wq of the - * first conn in each of the drain list. This causes ip_wsrv to run for the + * and goes through each conn in the drain list and calls conn_idl_remove + * for the conn to clear the qfull condition for the conn, as well as to + * remove the conn from the idl list. In addition, streams based sockets + * will have the conn_wq enabled, causing ip_wsrv to run for the * conn. ip_wsrv drains the queued messages, and removes the conn from the - * drain list, if all messages were drained. It also qenables the next conn - * in the drain list to continue the drain process. + * drain list, if all messages were drained. It also notifies the + * conn_upcalls for the conn to signal that flow-control has opened up. * * In reality the drain list is not a single list, but a configurable number - * of lists. conn_drain_walk() in the IP module, qenables the first conn in - * each list. If the ip_wsrv of the next qenabled conn does not run, because - * the stream closes, ip_close takes responsibility to qenable the next conn - * in the drain list. conn_drain_insert and conn_drain_tail are the only + * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for + * each conn in the list. conn_drain_insert and conn_drain_tail are the only * functions that manipulate this drain list. conn_drain_insert is called in - * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS + * from the protocol layer when conn_ip_output returns EWOULDBLOCK. + * (as opposed to from ip_wsrv context for STREAMS * case -- see below). The synchronization between drain insertion and flow * control wakeup is handled by using idl_txl->txl_lock. * * Flow control using STREAMS: * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism * is used. 
On the send side, if the packet cannot be sent down to the - * driver by IP, because of a canput failure, IP does a putq on the conn_wq. - * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts - * the conn in a list of conn's that need to be drained when the flow - * control condition subsides. The blocked connps are put in first member - * of ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv - * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0]. - * ips_idl_tx_list[0] contains the drain lists of blocked conns. The - * conn_wq of the first conn in the drain lists is (q)enabled to run. - * ip_wsrv on this conn drains the queued messages, and removes the conn - * from the drain list, if all messages were drained. It also qenables the - * next conn in the drain list to continue the drain process. - * - * If the ip_wsrv of the next qenabled conn does not run, because the - * stream closes, ip_close takes responsibility to qenable the next conn in - * the drain list. The directly called ip_wput path always does a putq, if - * it cannot putnext. Thus synchronization problems are handled between - * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only - * functions that manipulate this drain list. Furthermore conn_drain_insert - * is called only from ip_wsrv for the STREAMS case, and there can be only 1 - * instance of ip_wsrv running on a queue at any time. conn_drain_tail can - * be simultaneously called from both ip_wsrv and ip_close. + * driver by IP, because of a canput failure, ip_xmit drops the packet + * and returns EWOULDBLOCK to the caller, who may then invoke + * ixa_check_drain_insert to insert the conn on the 0'th drain list. + * When ip_wsrv runs on the ill_wq because flow control has been relieved, the + * blocked conns in the * 0'th drain list is drained as with the + * non-STREAMS case. 
+ * + * In both the STREAMS and non-STREAMS case, the sockfs upcall to set + * qfull is done when the conn is inserted into the drain list + * (conn_drain_insert()) and cleared when the conn is removed from the drain + * list (conn_idl_remove()). * * IPQOS notes: * @@ -579,14 +569,13 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * By default all the callout positions are enabled. * * Outbound (local_out) - * Hooks are placed in ip_wput_ire and ipsec_out_process. + * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6. * * Inbound (local_in) - * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and - * TCP and UDP fanout routines. + * Hooks are placed in ip_fanout_v4 and ip_fanout_v6. * * Forwarding (in and out) - * Hooks are placed in ip_rput_forward. + * Hooks are placed in ire_recv_forward_v4/v6. * * IP Policy Framework processing (IPPF processing) * Policy processing for a packet is initiated by ip_process, which ascertains @@ -596,16 +585,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * filters configured in ipgpc and resumes normal IP processing thereafter. * An action instance can drop a packet in course of its processing. * - * A boolean variable, ip_policy, is used in all the fanout routines that can - * invoke ip_process for a packet. This variable indicates if the packet should - * to be sent for policy processing. The variable is set to B_TRUE by default, - * i.e. when the routines are invoked in the normal ip procesing path for a - * packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout; - * ip_policy is set to B_FALSE for all the routines called in these two - * functions because, in the former case, we don't process loopback traffic - * currently while in the latter, the packets have already been processed in - * icmp_inbound. 
- * * Zones notes: * * The partitioning rules for networking are as follows: @@ -638,24 +617,18 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * IRE_LOCAL Exclusive (x) * IRE_LOOPBACK Exclusive * IRE_PREFIX (net routes) Shared (*) - * IRE_CACHE Exclusive * IRE_IF_NORESOLVER (interface routes) Exclusive * IRE_IF_RESOLVER (interface routes) Exclusive + * IRE_IF_CLONE (interface routes) Exclusive * IRE_HOST (host routes) Shared (*) * * (*) A zone can only use a default or off-subnet route if the gateway is * directly reachable from the zone, that is, if the gateway's address matches * one of the zone's logical interfaces. * - * (x) IRE_LOCAL are handled a bit differently, since for all other entries - * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source - * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP - * address of the zone itself (the destination). Since IRE_LOCAL is used - * for communication between zones, ip_wput_ire has special logic to set - * the right source address when sending using an IRE_LOCAL. - * - * Furthermore, when ip_restrict_interzone_loopback is set (the default), - * ire_cache_lookup restricts loopback using an IRE_LOCAL + * (x) IRE_LOCAL are handled a bit differently. + * When ip_restrict_interzone_loopback is set (the default), + * ire_route_recursive restricts loopback using an IRE_LOCAL * between zone to the case when L2 would have conceptually looped the packet * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal @@ -669,17 +642,11 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * since some zones may not be on the 10.16.72/24 network. 
To handle this, each * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are * sent to every zone that has an IRE_BROADCAST entry for the destination - * address on the input ill, see conn_wantpacket(). + * address on the input ill, see ip_input_broadcast(). * * Applications in different zones can join the same multicast group address. - * For IPv4, group memberships are per-logical interface, so they're already - * inherently part of a zone. For IPv6, group memberships are per-physical - * interface, so we distinguish IPv6 group memberships based on group address, - * interface and zoneid. In both cases, received multicast packets are sent to - * every zone for which a group membership entry exists. On IPv6 we need to - * check that the target zone still has an address on the receiving physical - * interface; it could have been removed since the application issued the - * IPV6_JOIN_GROUP. + * The same logic applies for multicast as for broadcast. ip_input_multicast + * dispatches packets to all zones that have members on the physical interface. 
*/ /* @@ -694,62 +661,37 @@ boolean_t ip_squeue_fanout = 0; */ uint_t ip_max_frag_dups = 10; -#define IS_SIMPLE_IPH(ipha) \ - ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) - /* RFC 1122 Conformance */ #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER #define ILL_MAX_NAMELEN LIFNAMSIZ -static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); - static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6); -static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t, - ipha_t **); +static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *); -static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t, - ip_stack_t *); -static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, - uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); -static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp); -static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t, - mblk_t *, int, ip_stack_t *); -static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, - icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, - ill_t *, zoneid_t); +static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *); +static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *); +static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *, + ip_recv_attr_t *); static void icmp_options_update(ipha_t *); -static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t, - zoneid_t zoneid, ip_stack_t *); -static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *); -static void icmp_redirect(ill_t *, mblk_t *); -static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, - ip_stack_t *); +static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *); +static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *); +static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t 
*); +static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *, + ip_recv_attr_t *); +static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *); +static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *, + ip_recv_attr_t *); -static void ip_arp_news(queue_t *, mblk_t *); -static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *); mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); char *ip_dot_addr(ipaddr_t, char *); mblk_t *ip_carve_mp(mblk_t **, ssize_t); int ip_close(queue_t *, int); static char *ip_dot_saddr(uchar_t *, char *); -static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, - boolean_t, boolean_t, ill_t *, zoneid_t); -static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, - boolean_t, boolean_t, zoneid_t); -static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, - boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); static void ip_lrput(queue_t *, mblk_t *); ipaddr_t ip_net_mask(ipaddr_t); -void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, - ip_stack_t *); -static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, - conn_t *, uint32_t, zoneid_t, ip_opt_info_t *); char *ip_nv_lookup(nv_t *, int); -static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, @@ -758,17 +700,6 @@ static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); void ip_rput(queue_t *, mblk_t *); static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg); -void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); -static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *, - ip_stack_t *); -static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, - ire_t *, ip_stack_t *); -static 
boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, - mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); -static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, - ip_stack_t *); -static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *, - uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); @@ -801,49 +732,34 @@ static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); -static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); +static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *); +static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *); int ip_snmp_set(queue_t *, int, int, uchar_t *, int); -static boolean_t ip_source_routed(ipha_t *, ip_stack_t *); -static boolean_t ip_source_route_included(ipha_t *); -static void ip_trash_ire_reclaim_stack(ip_stack_t *); -static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, - zoneid_t, ip_stack_t *, conn_t *); -static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *, +static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *, mblk_t *); -static void ip_wput_local_options(ipha_t *, ip_stack_t *); -static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, - zoneid_t, ip_stack_t *); static void conn_drain_init(ip_stack_t *); static void conn_drain_fini(ip_stack_t *); static void conn_drain_tail(conn_t *connp, boolean_t closing); static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *); -static void conn_setqfull(conn_t *); -static void conn_clrqfull(conn_t *); +static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *); static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); static void ip_stack_shutdown(netstackid_t stackid, void *arg); static void 
ip_stack_fini(netstackid_t stackid, void *arg); -static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, - zoneid_t); -static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, - void *dummy_arg); - static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, - ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, - conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); -static void ip_multirt_bad_mtu(ire_t *, uint32_t); + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *), + ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, + const in6_addr_t *); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, - cred_t *, boolean_t); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, @@ -859,30 +775,15 @@ static int icmp_kstat_update(kstat_t *kp, int rw); static void *ip_kstat2_init(netstackid_t, ip_stat_t *); static void ip_kstat2_fini(netstackid_t, kstat_t *); -static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, - ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); +static void ipobs_init(ip_stack_t *); +static void ipobs_fini(ip_stack_t *); -static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, - ipha_t *, ill_t *, boolean_t, boolean_t); - -static void ipobs_init(ip_stack_t *); -static void ipobs_fini(ip_stack_t *); ipaddr_t ip_g_all_ones = IP_HOST_MASK; /* How long, in seconds, we allow frags to hang around. 
*/ #define IP_FRAG_TIMEOUT 15 #define IPV6_FRAG_TIMEOUT 60 -/* - * Threshold which determines whether MDT should be used when - * generating IP fragments; payload size must be greater than - * this threshold for MDT to take place. - */ -#define IP_WPUT_FRAG_MDT_MIN 32768 - -/* Setable in /etc/system only */ -int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; - static long ip_rput_pullups; int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ @@ -891,24 +792,12 @@ vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ int ip_debug; -#ifdef DEBUG -uint32_t ipsechw_debug = 0; -#endif - /* * Multirouting/CGTP stuff */ int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ /* - * XXX following really should only be in a header. Would need more - * header and .c clean up first. - */ -extern optdb_obj_t ip_opt_obj; - -ulong_t ip_squeue_enter_unbound = 0; - -/* * Named Dispatch Parameter Table. * All of these are alterable, within the min/max values given, at run time. 
*/ @@ -922,18 +811,18 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 1, "ip_send_redirects"}, { 0, 1, 0, "ip_forward_directed_broadcasts"}, { 0, 10, 0, "ip_mrtdebug"}, - { 5000, 999999999, 60000, "ip_ire_timer_interval" }, - { 60000, 999999999, 1200000, "ip_ire_arp_interval" }, - { 60000, 999999999, 60000, "ip_ire_redirect_interval" }, + { 1, 8, 3, "ip_ire_reclaim_fraction" }, + { 1, 8, 3, "ip_nce_reclaim_fraction" }, + { 1, 8, 3, "ip_dce_reclaim_fraction" }, { 1, 255, 255, "ip_def_ttl" }, { 0, 1, 0, "ip_forward_src_routed"}, { 0, 256, 32, "ip_wroff_extra" }, - { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" }, + { 2, 999999999, 60*20, "ip_pathmtu_interval" }, /* In seconds */ { 8, 65536, 64, "ip_icmp_return_data_bytes" }, { 0, 1, 1, "ip_path_mtu_discovery" }, - { 0, 240, 30, "ip_ignore_delete_time" }, + { 68, 65535, 576, "ip_pmtu_min" }, { 0, 1, 0, "ip_ignore_redirect" }, - { 0, 1, 1, "ip_output_queue" }, + { 0, 1, 0, "ip_arp_icmp_error" }, { 1, 254, 1, "ip_broadcast_ttl" }, { 0, 99999, 100, "ip_icmp_err_interval" }, { 1, 99999, 10, "ip_icmp_err_burst" }, @@ -955,7 +844,7 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 0, "ip6_ignore_redirect" }, { 0, 1, 0, "ip6_strict_dst_multihoming" }, - { 1, 8, 3, "ip_ire_reclaim_fraction" }, + { 0, 2, 2, "ip_src_check" }, { 0, 999999, 1000, "ipsec_policy_log_interval" }, @@ -964,12 +853,16 @@ static ipparam_t lcl_param_arr[] = { { 1, 20, 3, "ip_ndp_unsolicit_count" }, { 0, 1, 1, "ip6_ignore_home_address_opt" }, { 0, 15, 0, "ip_policy_mask" }, - { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, + { 0, 2, 2, "ip_ecmp_behavior" }, { 0, 255, 1, "ip_multirt_ttl" }, - { 0, 1, 1, "ip_multidata_outbound" }, - { 0, 3600000, 300000, "ip_ndp_defense_interval" }, + { 0, 3600, 60, "ip_ire_badcnt_lifetime" }, /* In seconds */ { 0, 999999, 60*60*24, "ip_max_temp_idle" }, { 0, 1000, 1, "ip_max_temp_defend" }, + /* + * when a conflict of an active address is detected, + * defend up to ip_max_defend times, within any + * 
ip_defend_interval span. + */ { 0, 1000, 3, "ip_max_defend" }, { 0, 999999, 30, "ip_defend_interval" }, { 0, 3600000, 300000, "ip_dup_recovery" }, @@ -977,12 +870,45 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 1, "ip_lso_outbound" }, { IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" }, { MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" }, - { 68, 65535, 576, "ip_pmtu_min" }, #ifdef DEBUG { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, #else { 0, 0, 0, "" }, #endif + /* delay before sending first probe: */ + { 0, 20000, 1000, "arp_probe_delay" }, + { 0, 20000, 100, "arp_fastprobe_delay" }, + /* interval at which DAD probes are sent: */ + { 10, 20000, 1500, "arp_probe_interval" }, + { 10, 20000, 150, "arp_fastprobe_interval" }, + /* setting probe count to 0 will disable ARP probing for DAD. */ + { 0, 20, 3, "arp_probe_count" }, + { 0, 20, 3, "arp_fastprobe_count" }, + + { 0, 3600000, 15000, "ipv4_dad_announce_interval"}, + { 0, 3600000, 15000, "ipv6_dad_announce_interval"}, + /* + * Rate limiting parameters for DAD defense used in + * ill_defend_rate_limit(): + * defend_rate : pkts/hour permitted + * defend_interval : time that can elapse before we send out a + * DAD defense. + * defend_period: denominator for defend_rate (in seconds). + */ + { 0, 3600000, 300000, "arp_defend_interval"}, + { 0, 20000, 100, "arp_defend_rate"}, + { 0, 3600000, 300000, "ndp_defend_interval"}, + { 0, 20000, 100, "ndp_defend_rate"}, + { 5, 86400, 3600, "arp_defend_period"}, + { 5, 86400, 3600, "ndp_defend_period"}, + { 0, 1, 1, "ipv4_icmp_return_pmtu" }, + { 0, 1, 1, "ipv6_icmp_return_pmtu" }, + /* + * publish count/interval values used to announce local addresses + * for IPv4, IPv6. 
+ */ + { 1, 20, 5, "ip_arp_publish_count" }, + { 1000, 20000, 2000, "ip_arp_publish_interval" }, }; /* @@ -1336,11 +1262,11 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_get_lifsrcof, NULL }, /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, + /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0, MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, + /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0, MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* SIOCSENABLESDP is handled by SDP */ @@ -1355,12 +1281,12 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); ip_ioctl_cmd_t ip_misc_ioctl_table[] = { - { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, - { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, + { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { ND_GET, 0, 0, 0, NULL, NULL }, + { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, { IP_IOCTL, 0, 0, 0, NULL, NULL }, { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, @@ -1384,12 +1310,14 @@ static nv_t ire_nv_arr[] = { { IRE_BROADCAST, "BROADCAST" }, { IRE_LOCAL, "LOCAL" }, { IRE_LOOPBACK, "LOOPBACK" }, - { IRE_CACHE, "CACHE" }, 
{ IRE_DEFAULT, "DEFAULT" }, { IRE_PREFIX, "PREFIX" }, { IRE_IF_NORESOLVER, "IF_NORESOL" }, { IRE_IF_RESOLVER, "IF_RESOLV" }, + { IRE_IF_CLONE, "IF_CLONE" }, { IRE_HOST, "HOST" }, + { IRE_MULTICAST, "MULTICAST" }, + { IRE_NOROUTE, "NOROUTE" }, { 0 } }; @@ -1412,7 +1340,6 @@ struct module_info ip_mod_info = { /* * Entry points for IP as a device and as a module. - * FIXME: down the road we might want a separate module and driver qinit. * We have separate open functions for the /dev/ip and /dev/ip6 devices. */ static struct qinit iprinitv4 = { @@ -1425,13 +1352,8 @@ struct qinit iprinitv6 = { &ip_mod_info }; -static struct qinit ipwinitv4 = { - (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, - &ip_mod_info -}; - -struct qinit ipwinitv6 = { - (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, +static struct qinit ipwinit = { + (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL, &ip_mod_info }; @@ -1447,98 +1369,32 @@ static struct qinit iplwinit = { /* For AF_INET aka /dev/ip */ struct streamtab ipinfov4 = { - &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit + &iprinitv4, &ipwinit, &iplrinit, &iplwinit }; /* For AF_INET6 aka /dev/ip6 */ struct streamtab ipinfov6 = { - &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit + &iprinitv6, &ipwinit, &iplrinit, &iplwinit }; #ifdef DEBUG -static boolean_t skip_sctp_cksum = B_FALSE; +boolean_t skip_sctp_cksum = B_FALSE; #endif /* - * Prepend the zoneid using an ipsec_out_t for later use by functions like - * ip_rput_v6(), ip_output(), etc. If the message - * block already has a M_CTL at the front of it, then simply set the zoneid - * appropriately. 
- */ -mblk_t * -ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) -{ - mblk_t *first_mp; - ipsec_out_t *io; - - ASSERT(zoneid != ALL_ZONES); - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - io->ipsec_out_zoneid = zoneid; - return (mp); - } - - first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack); - if (first_mp == NULL) - return (NULL); - io = (ipsec_out_t *)first_mp->b_rptr; - /* This is not a secure packet */ - io->ipsec_out_secure = B_FALSE; - io->ipsec_out_zoneid = zoneid; - first_mp->b_cont = mp; - return (first_mp); -} - -/* - * Copy an M_CTL-tagged message, preserving reference counts appropriately. + * Generate an ICMP fragmentation needed message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -mblk_t * -ip_copymsg(mblk_t *mp) -{ - mblk_t *nmp; - ipsec_info_t *in; - - if (mp->b_datap->db_type != M_CTL) - return (copymsg(mp)); - - in = (ipsec_info_t *)mp->b_rptr; - - /* - * Note that M_CTL is also used for delivering ICMP error messages - * upstream to transport layers. - */ - if (in->ipsec_info_type != IPSEC_OUT && - in->ipsec_info_type != IPSEC_IN) - return (copymsg(mp)); - - nmp = copymsg(mp->b_cont); - - if (in->ipsec_info_type == IPSEC_OUT) { - return (ipsec_out_tag(mp, nmp, - ((ipsec_out_t *)in)->ipsec_out_ns)); - } else { - return (ipsec_in_tag(mp, nmp, - ((ipsec_in_t *)in)->ipsec_in_ns)); - } -} - -/* Generate an ICMP fragmentation needed message. 
*/ -static void -icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, - ip_stack_t *ipst) +void +icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira) { icmph_t icmph; - mblk_t *first_mp; - boolean_t mctl_present; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_DEST_UNREACHABLE; @@ -1546,29 +1402,29 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, icmph.icmph_du_mtu = htons((uint16_t)mtu); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); + + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * icmp_inbound deals with ICMP messages in the following ways. + * icmp_inbound_v4 deals with ICMP messages that are handled by IP. + * If the ICMP message is consumed by IP, i.e., it should not be delivered + * to any IPPROTO_ICMP raw sockets, then it returns NULL. + * Likewise, if the ICMP error is misformed (too short, etc), then it + * returns NULL. The caller uses this to determine whether or not to send + * to raw sockets. * + * All error messages are passed to the matching transport stream. + * + * The following cases are handled by icmp_inbound: * 1) It needs to send a reply back and possibly delivering it * to the "interested" upper clients. - * 2) It needs to send it to the upper clients only. + * 2) Return the mblk so that the caller can pass it to the RAW socket clients. * 3) It needs to change some values in IP only. - * 4) It needs to change some values in IP and upper layers e.g TCP. - * - * We need to accomodate icmp messages coming in clear until we get - * everything secure from the wire. 
If icmp_accept_clear_messages - * is zero we check with the global policy and act accordingly. If - * it is non-zero, we accept the message without any checks. But - * *this does not mean* that this will be delivered to the upper - * clients. By accepting we might send replies back, change our MTU - * value etc. but delivery to the ULP/clients depends on their policy - * dispositions. + * 4) It needs to change some values in IP and upper layers e.g TCP + * by delivering an error to the upper layers. * - * We handle the above 4 cases in the context of IPsec in the + * We handle the above three cases in the context of IPsec in the * following way : * * 1) Send the reply back in the same way as the request came in. @@ -1610,13 +1466,13 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, * come to a stop. This is solved by making similar decisions * at both levels. Currently, when we are unable to deliver * to the Upper Layer (due to policy failures) while IP has - * adjusted ire_max_frag, the next outbound datagram would + * adjusted dce_pmtu, the next outbound datagram would * generate a local ICMP_FRAGMENTATION_NEEDED message - which * will be with the right level of protection. Thus the right * value will be communicated even if we are not able to * communicate when we get from the wire initially. But this * assumes there would be at least one outbound datagram after - * IP has adjusted its ire_max_frag value. To make things + * IP has adjusted its dce_pmtu value. To make things * simpler, we accept in clear after the validation of * AH/ESP headers. * @@ -1627,105 +1483,54 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, * should be accepted in clear when the Upper layer expects secure. * Thus the communication may get aborted by some bad ICMP * packets. - * - * IPQoS Notes: - * The only instance when a packet is sent for processing is when there - * isn't an ICMP client and if we are interested in it. 
- * If there is a client, IPPF processing will take place in the - * ip_fanout_proto routine. - * - * Zones notes: - * The packet is only processed in the context of the specified zone: typically - * only this zone will reply to an echo request, and only interested clients in - * this zone will receive a copy of the packet. This means that the caller must - * call icmp_inbound() for each relevant zone. */ -static void -icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, - int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, - ill_t *recv_ill, zoneid_t zoneid) +mblk_t * +icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - int hdr_length; + icmph_t *icmph; + ipha_t *ipha; /* Outer header */ + int ip_hdr_length; /* Outer header length */ boolean_t interested; + ipif_t *ipif; uint32_t ts; - uchar_t *wptr; - ipif_t *ipif; - mblk_t *first_mp; - ipsec_in_t *ii; - timestruc_t now; - uint32_t ill_index; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } + uint32_t *tsp; + timestruc_t now; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + zoneid_t zoneid = ira->ira_zoneid; + int len_needed; + mblk_t *mp_ret = NULL; ipha = (ipha_t *)mp->b_rptr; - if (ipst->ips_icmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - - /* - * On a labeled system, we have to check whether the zone itself is - * permitted to receive raw traffic. 
- */ - if (is_system_labeled()) { - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (!tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("icmp_inbound: zone %d can't receive raw", - zoneid)); - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(first_mp); - return; - } - } - - /* - * We have accepted the ICMP message. It means that we will - * respond to the packet if needed. It may not be delivered - * to the upper client depending on the policy constraints - * and the disposition in ipsec_inbound_accept_clear. - */ - - ASSERT(ill != NULL); BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); - iph_hdr_length = IPH_HDR_LENGTH(ipha); - if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { + + ip_hdr_length = ira->ira_ip_hdr_length; + if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) { + if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return (NULL); + } /* Last chance to get real. */ - if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { + ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira); + if (ipha == NULL) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(first_mp); - return; + freemsg(mp); + return (NULL); } - /* Refresh iph following the pullup. */ - ipha = (ipha_t *)mp->b_rptr; - } - /* ICMP header checksum, including checksum field, should be zero. */ - if (sum_valid ? 
(sum != 0 && sum != 0xFFFF) : - IP_CSUM(mp, iph_hdr_length, 0)) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); - freemsg(first_mp); - return; } + /* The IP header will always be a multiple of four bytes */ - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type, icmph->icmph_code)); - wptr = (uchar_t *)icmph + ICMPH_SIZE; - /* We will set "interested" to "true" if we want a copy */ + + /* + * We will set "interested" to "true" if we should pass a copy to + * the transport or if we handle the packet locally. + */ interested = B_FALSE; switch (icmph->icmph_type) { case ICMP_ECHO_REPLY: @@ -1753,18 +1558,42 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, * (what isn't?). We aim to please, you pick it. * Default is do it. */ - if (!broadcast && !CLASSD(ipha->ipha_dst)) { - /* unicast: always respond */ - interested = B_TRUE; - } else if (CLASSD(ipha->ipha_dst)) { + if (ira->ira_flags & IRAF_MULTICAST) { /* multicast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_mcast; - } else if (broadcast) { + } else if (ira->ira_flags & IRAF_BROADCAST) { /* broadcast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_bcast; + } else { + /* unicast: always respond */ + interested = B_TRUE; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); - break; + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + + /* Check db_ref to make sure we can modify the packet. 
*/ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + icmph->icmph_type = ICMP_ECHO_REPLY; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_ROUTER_ADVERTISEMENT: case ICMP_ROUTER_SOLICITATION: break; @@ -1778,28 +1607,63 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, break; case ICMP_TIME_STAMP_REQUEST: /* Response to Time Stamp Requests is local policy. */ - if (ipst->ips_ip_g_resp_to_timestamp && - /* So is whether to respond if it was an IP broadcast. */ - (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { - int tstamp_len = 3 * sizeof (uint32_t); - - if (wptr + tstamp_len > mp->b_wptr) { - if (!pullupmsg(mp, wptr + tstamp_len - - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - /* Refresh ipha following the pullup. */ - ipha = (ipha_t *)mp->b_rptr; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - wptr = (uchar_t *)icmph + ICMPH_SIZE; + if (ipst->ips_ip_g_resp_to_timestamp) { + if (ira->ira_flags & IRAF_MULTIBROADCAST) + interested = + ipst->ips_ip_g_resp_to_timestamp_bcast; + else + interested = B_TRUE; + } + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + + /* Make sure we have enough of the packet */ + len_needed = ip_hdr_length + ICMPH_SIZE + + 3 * sizeof (uint32_t); + + if (mp->b_wptr - mp->b_rptr < len_needed) { + ipha = ip_pullup(mp, len_needed, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (NULL); } - interested = B_TRUE; + /* Refresh following the pullup. 
*/ + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); - break; + /* Check db_ref to make sure we can modify the packet. */ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + icmph->icmph_type = ICMP_TIME_STAMP_REPLY; + tsp = (uint32_t *)&icmph[1]; + tsp++; /* Skip past 'originate time' */ + /* Compute # of milliseconds since midnight */ + gethrestime(&now); + ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + + now.tv_nsec / (NANOSEC / MILLISEC); + *tsp++ = htonl(ts); /* Lay in 'receive time' */ + *tsp++ = htonl(ts); /* Lay in 'send time' */ + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_TIME_STAMP_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); break; @@ -1808,14 +1672,68 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, case ICMP_INFO_REPLY: break; case ICMP_ADDRESS_MASK_REQUEST: - if ((ipst->ips_ip_respond_to_address_mask_broadcast || - !broadcast) && - /* TODO m_pullup of complete header? */ - (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) { + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + interested = + ipst->ips_ip_respond_to_address_mask_broadcast; + } else { interested = B_TRUE; } + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN; + if (mp->b_wptr - mp->b_rptr < len_needed) { + ipha = ip_pullup(mp, len_needed, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, + ill); + freemsg(mp); + return (NULL); + } + /* Refresh following the pullup. 
*/ + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); - break; + /* Check db_ref to make sure we can modify the packet. */ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + /* + * Need the ipif with the mask be the same as the source + * address of the mask reply. For unicast we have a specific + * ipif. For multicast/broadcast we only handle onlink + * senders, and use the source address to pick an ipif. + */ + ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst); + if (ipif == NULL) { + /* Broadcast or multicast */ + ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); + if (ipif == NULL) { + freemsg(mp); + return (NULL); + } + } + icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; + bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); + ipif_refrele(ipif); + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_ADDRESS_MASK_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); break; @@ -1824,206 +1742,103 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); break; } - /* See if there is an ICMP client. */ - if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) { + /* + * See if there is an ICMP client to avoid an extra copymsg/freemsg + * if there isn't one. + */ + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) { /* If there is an ICMP client and we want one too, copy it. 
*/ - mblk_t *first_mp1; if (!interested) { - ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, - ip_policy, recv_ill, zoneid); - return; + /* Caller will deliver to RAW sockets */ + return (mp); } - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) { - ip_fanout_proto(q, first_mp1, ill, ipha, - 0, mctl_present, ip_policy, recv_ill, zoneid); + mp_ret = copymsg(mp); + if (mp_ret == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); } } else if (!interested) { - freemsg(first_mp); - return; - } else { - /* - * Initiate policy processing for this packet if ip_policy - * is true. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - if (mctl_present) { - freeb(first_mp); - } - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - return; - } + /* Neither we nor raw sockets are interested. Drop packet now */ + freemsg(mp); + return (NULL); + } + + /* + * ICMP error or redirect packet. Make sure we have enough of + * the header and that db_ref == 1 since we might end up modifying + * the packet. + */ + if (mp->b_cont != NULL) { + if (ip_pullup(mp, -1, ira) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (mp_ret); } } - /* We want to do something with it. */ - /* Check db_ref to make sure we can modify the packet. 
*/ + if (mp->b_datap->db_ref > 1) { - mblk_t *first_mp1; + mblk_t *mp1; - first_mp1 = ip_copymsg(first_mp); - freemsg(first_mp); - if (!first_mp1) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); - return; - } - first_mp = first_mp1; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } else { - mp = first_mp; + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + freemsg(mp); + return (mp_ret); } - ipha = (ipha_t *)mp->b_rptr; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - wptr = (uchar_t *)icmph + ICMPH_SIZE; + freemsg(mp); + mp = mp1; } - switch (icmph->icmph_type) { - case ICMP_ADDRESS_MASK_REQUEST: - ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); - if (ipif == NULL) { - freemsg(first_mp); - return; - } - /* - * outging interface must be IPv4 - */ - ASSERT(ipif != NULL && !ipif->ipif_isv6); - icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; - bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); - ipif_refrele(ipif); - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); - break; - case ICMP_ECHO_REQUEST: - icmph->icmph_type = ICMP_ECHO_REPLY; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); - break; - case ICMP_TIME_STAMP_REQUEST: { - uint32_t *tsp; - icmph->icmph_type = ICMP_TIME_STAMP_REPLY; - tsp = (uint32_t *)wptr; - tsp++; /* Skip past 'originate time' */ - /* Compute # of milliseconds since midnight */ - gethrestime(&now); - ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + - now.tv_nsec / (NANOSEC / MILLISEC); - *tsp++ = htonl(ts); /* Lay in 'receive time' */ - *tsp++ = htonl(ts); /* Lay in 'send time' */ - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); - break; + /* + * In case mp has changed, verify the message before any further + * processes. 
+ */ + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); + return (mp_ret); } - default: - ipha = (ipha_t *)&icmph[1]; - if ((uchar_t *)&ipha[1] > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hdr_length = IPH_HDR_LENGTH(ipha); - if (hdr_length < sizeof (ipha_t)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { - if (!pullupmsg(mp, - (uchar_t *)ipha + hdr_length - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - switch (icmph->icmph_type) { - case ICMP_REDIRECT: - /* - * As there is no upper client to deliver, we don't - * need the first_mp any more. - */ - if (mctl_present) { - freeb(first_mp); - } - icmp_redirect(ill, mp); - return; - case ICMP_DEST_UNREACHABLE: - if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { - if (!icmp_inbound_too_big(icmph, ipha, ill, - zoneid, mp, iph_hdr_length, ipst)) { - freemsg(first_mp); - return; - } - /* - * icmp_inbound_too_big() may alter mp. - * Resynch ipha and icmph accordingly. - */ - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - /* FALLTHRU */ - default : - /* - * IPQoS notes: Since we have already done IPQoS - * processing we don't want to do it again in - * the fanout routines called by - * icmp_inbound_error_fanout, hence the last - * argument, ip_policy, is B_FALSE. 
- */ - icmp_inbound_error_fanout(q, ill, first_mp, icmph, - ipha, iph_hdr_length, hdr_length, mctl_present, - B_FALSE, recv_ill, zoneid); + + switch (icmph->icmph_type) { + case ICMP_REDIRECT: + icmp_redirect_v4(mp, ipha, icmph, ira); + break; + case ICMP_DEST_UNREACHABLE: + if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { + /* Update DCE and adjust MTU is icmp header if needed */ + icmp_inbound_too_big_v4(icmph, ira); } - return; + /* FALLTHRU */ + default: + icmp_inbound_error_fanout_v4(mp, icmph, ira); + break; } + return (mp_ret); +} + +/* + * Send an ICMP echo, timestamp or address mask reply. + * The caller has already updated the payload part of the packet. + * We handle the ICMP checksum, IP source address selection and feed + * the packet into ip_output_simple. + */ +static void +icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, + ip_recv_attr_t *ira) +{ + uint_t ip_hdr_length = ira->ira_ip_hdr_length; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; + /* Send out an ICMP packet */ icmph->icmph_checksum = 0; - icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); - if (broadcast || CLASSD(ipha->ipha_dst)) { - ipif_t *ipif_chosen; - /* - * Make it look like it was directed to us, so we don't look - * like a fool with a broadcast or multicast source address. - */ - ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); - /* - * Make sure that we haven't grabbed an interface that's DOWN. 
- */ - if (ipif != NULL) { - ipif_chosen = ipif_select_source(ipif->ipif_ill, - ipha->ipha_src, zoneid); - if (ipif_chosen != NULL) { - ipif_refrele(ipif); - ipif = ipif_chosen; - } - } - if (ipif == NULL) { - ip0dbg(("icmp_inbound: " - "No source for broadcast/multicast:\n" - "\tsrc 0x%x dst 0x%x ill %p " - "ipif_lcl_addr 0x%x\n", - ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), - (void *)ill, - ill->ill_ipif->ipif_lcl_addr)); - freemsg(first_mp); - return; - } - ASSERT(ipif != NULL && !ipif->ipif_isv6); - ipha->ipha_dst = ipif->ipif_src_addr; - ipif_refrele(ipif); - } + icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0); /* Reset time to live. */ ipha->ipha_ttl = ipst->ips_ip_def_ttl; { @@ -2038,138 +1853,159 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); - if (!mctl_present) { + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * This packet should go out the same way as it - * came in i.e in clear. To make sure that global - * policy will not be applied to this in ip_wput_ire, - * we attach a IPSEC_IN mp and clear ipsec_in_secure. + * came in i.e in clear, independent of the IPsec policy + * for transmitting packets. 
*/ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (first_mp == NULL) { + ixas.ixa_flags |= IXAF_NO_IPSEC; + } else { + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); + /* Note: mp already consumed and ip_drop_packet done */ return; } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } - if (!ipsec_in_to_out(first_mp, ipha, NULL, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + /* + * Not one or our addresses (IRE_LOCALs), thus we let + * ip_output_simple pick the source. + */ + ipha->ipha_src = INADDR_ANY; + ixas.ixa_flags |= IXAF_SET_SOURCE; + } + /* Should we send with DF and use dce_pmtu? */ + if (ipst->ips_ipv4_icmp_return_pmtu) { + ixas.ixa_flags |= IXAF_PMTU_DISCOVERY; + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; } + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); - put(WR(q), first_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } -static ipaddr_t -icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) +/* + * Verify the ICMP messages for either for ICMP error or redirect packet. + * The caller should have fully pulled up the message. If it's a redirect + * packet, only basic checks on IP header will be done; otherwise, verify + * the packet by looking at the included ULP header. + * + * Called before icmp_inbound_error_fanout_v4 is called. 
+ */ +static boolean_t +icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) { - conn_t *connp; - connf_t *connfp; - ipaddr_t nexthop_addr = INADDR_ANY; - int hdr_length = IPH_HDR_LENGTH(ipha); - uint16_t *up; - uint32_t ports; - ip_stack_t *ipst = ill->ill_ipst; + ill_t *ill = ira->ira_ill; + int hdr_length; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + conn_t *connp; + ipha_t *ipha; /* Inner IP header */ - up = (uint16_t *)((uchar_t *)ipha + hdr_length); - switch (ipha->ipha_protocol) { - case IPPROTO_TCP: - { - tcph_t *tcph; - - /* do a reverse lookup */ - tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); - connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, - TCPS_LISTEN, ipst); - break; - } - case IPPROTO_UDP: - { - uint32_t dstport, srcport; + ipha = (ipha_t *)&icmph[1]; + if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr) + goto truncated; - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; + hdr_length = IPH_HDR_LENGTH(ipha); - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); + if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) + goto discard_pkt; - connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(dstport, ipst)]; - mutex_enter(&connfp->connf_lock); - connp = connfp->connf_head; + if (hdr_length < sizeof (ipha_t)) + goto truncated; - /* do a reverse lookup */ - while ((connp != NULL) && - (!IPCL_UDP_MATCH(connp, dstport, - ipha->ipha_src, srcport, ipha->ipha_dst) || - !IPCL_ZONE_MATCH(connp, zoneid))) { - connp = connp->conn_next; - } - if (connp != NULL) - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - break; - } - case IPPROTO_SCTP: - { - in6_addr_t map_src, map_dst; + if ((uchar_t *)ipha + hdr_length > mp->b_wptr) + goto truncated; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; + /* + * Stop here for 
ICMP_REDIRECT. + */ + if (icmph->icmph_type == ICMP_REDIRECT) + return (B_TRUE); - connp = sctp_find_conn(&map_src, &map_dst, ports, - zoneid, ipst->ips_netstack->netstack_sctp); - if (connp == NULL) { - connp = ipcl_classify_raw(mp, IPPROTO_SCTP, - zoneid, ports, ipha, ipst); - } else { - CONN_INC_REF(connp); - SCTP_REFRELE(CONN2SCTP(connp)); - } - break; - } - default: - { - ipha_t ripha; + /* + * ICMP errors only. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_UDP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_TCP: { + tcpha_t *tcpha; + + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; - ripha.ipha_src = ipha->ipha_dst; - ripha.ipha_dst = ipha->ipha_src; - ripha.ipha_protocol = ipha->ipha_protocol; + tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); + connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, + ipst); + if (connp == NULL) + goto discard_pkt; - connfp = &ipst->ips_ipcl_proto_fanout[ - ipha->ipha_protocol]; - mutex_enter(&connfp->connf_lock); - connp = connfp->connf_head; - for (connp = connfp->connf_head; connp != NULL; - connp = connp->conn_next) { - if (IPCL_PROTO_MATCH(connp, - ipha->ipha_protocol, &ripha, ill, - 0, zoneid)) { - CONN_INC_REF(connp); - break; - } - } - mutex_exit(&connfp->connf_lock); + if ((connp->conn_verifyicmp != NULL) && + !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) { + CONN_DEC_REF(connp); + goto discard_pkt; } - } - if (connp != NULL) { - if (connp->conn_nexthop_set) - nexthop_addr = connp->conn_nexthop_v4; CONN_DEC_REF(connp); + break; + } + case IPPROTO_SCTP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. 
+ */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_ESP: + case IPPROTO_AH: + break; + case IPPROTO_ENCAP: + if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > + mp->b_wptr) + goto truncated; + break; + default: + break; } - return (nexthop_addr); + + return (B_TRUE); + +discard_pkt: + /* Bogus ICMP error. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + return (B_FALSE); + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + return (B_FALSE); } /* Table from RFC 1191 */ @@ -2178,64 +2014,52 @@ static int icmp_frag_size_table[] = /* * Process received ICMP Packet too big. - * After updating any IRE it does the fanout to any matching transport streams. - * Assumes the message has been pulled up till the IP header that caused - * the error. + * Just handles the DCE create/update, including using the above table of + * PMTU guesses. The caller is responsible for validating the packet before + * passing it in and also to fanout the ICMP error to any matching transport + * conns. Assumes the message has been fully pulled up and verified. + * + * Before getting here, the caller has called icmp_inbound_verify_v4() + * that should have verified with ULP to prevent undoing the changes we're + * going to make to DCE. For example, TCP might have verified that the packet + * which generated error is in the send window. * - * Returns B_FALSE on failure and B_TRUE on success. + * In some cases modified this MTU in the ICMP header packet; the caller + * should pass to the matching ULP after this returns. 
*/ -static boolean_t -icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, - zoneid_t zoneid, mblk_t *mp, int iph_hdr_length, - ip_stack_t *ipst) +static void +icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira) { - ire_t *ire, *first_ire; - int mtu, orig_mtu; - int hdr_length; - ipaddr_t nexthop_addr; - boolean_t disable_pmtud; + dce_t *dce; + int old_mtu; + int mtu, orig_mtu; + ipaddr_t dst; + boolean_t disable_pmtud; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t hdr_length; + ipha_t *ipha; + /* Caller already pulled up everything. */ + ipha = (ipha_t *)&icmph[1]; ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); ASSERT(ill != NULL); hdr_length = IPH_HDR_LENGTH(ipha); - /* Drop if the original packet contained a source route */ - if (ip_source_route_included(ipha)) { - return (B_FALSE); - } /* - * Verify we have at least ICMP_MIN_TP_HDR_LENGTH bytes of transport - * header. + * We handle path MTU for source routed packets since the DCE + * is looked up using the final destination. 
*/ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("icmp_inbound_too_big: insufficient hdr\n")); - return (B_FALSE); - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp); - if (nexthop_addr != INADDR_ANY) { - /* nexthop set */ - first_ire = ire_ctable_lookup(ipha->ipha_dst, - nexthop_addr, 0, NULL, ALL_ZONES, msg_getlabel(mp), - MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst); - } else { - /* nexthop not set */ - first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - } + dst = ip_get_dst(ipha); - if (!first_ire) { - ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", - ntohl(ipha->ipha_dst))); - return (B_FALSE); + dce = dce_lookup_and_add_v4(dst, ipst); + if (dce == NULL) { + /* Couldn't add a unique one - ENOMEM */ + ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n", + ntohl(dst))); + return; } /* Check for MTU discovery advice as described in RFC 1191 */ @@ -2243,149 +2067,112 @@ icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, orig_mtu = mtu; disable_pmtud = B_FALSE; - rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); - for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; - ire = ire->ire_next) { - /* - * Look for the connection to which this ICMP message is - * directed. If it has the IP_NEXTHOP option set, then the - * search is limited to IREs with the MATCH_IRE_PRIVATE - * option. Else the search is limited to regular IREs. 
- */ - if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && - (nexthop_addr != ire->ire_gateway_addr)) || - (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && - (nexthop_addr != INADDR_ANY))) - continue; + mutex_enter(&dce->dce_lock); + if (dce->dce_flags & DCEF_PMTU) + old_mtu = dce->dce_pmtu; + else + old_mtu = ill->ill_mtu; - mutex_enter(&ire->ire_lock); - if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { - uint32_t length; - int i; + if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { + uint32_t length; + int i; + /* + * Use the table from RFC 1191 to figure out + * the next "plateau" based on the length in + * the original IP packet. + */ + length = ntohs(ipha->ipha_length); + DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce, + uint32_t, length); + if (old_mtu <= length && + old_mtu >= length - hdr_length) { /* - * Use the table from RFC 1191 to figure out - * the next "plateau" based on the length in - * the original IP packet. + * Handle broken BSD 4.2 systems that + * return the wrong ipha_length in ICMP + * errors. */ - length = ntohs(ipha->ipha_length); - DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire, - uint32_t, length); - if (ire->ire_max_frag <= length && - ire->ire_max_frag >= length - hdr_length) { - /* - * Handle broken BSD 4.2 systems that - * return the wrong iph_length in ICMP - * errors. - */ - length -= hdr_length; - } - for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { - if (length > icmp_frag_size_table[i]) - break; - } - if (i == A_CNT(icmp_frag_size_table)) { - /* Smaller than 68! */ - disable_pmtud = B_TRUE; + ip1dbg(("Wrong mtu: sent %d, dce %d\n", + length, old_mtu)); + length -= hdr_length; + } + for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { + if (length > icmp_frag_size_table[i]) + break; + } + if (i == A_CNT(icmp_frag_size_table)) { + /* Smaller than IP_MIN_MTU! 
*/ + ip1dbg(("Too big for packet size %d\n", + length)); + disable_pmtud = B_TRUE; + mtu = ipst->ips_ip_pmtu_min; + } else { + mtu = icmp_frag_size_table[i]; + ip1dbg(("Calculated mtu %d, packet size %d, " + "before %d\n", mtu, length, old_mtu)); + if (mtu < ipst->ips_ip_pmtu_min) { mtu = ipst->ips_ip_pmtu_min; - } else { - mtu = icmp_frag_size_table[i]; - if (mtu < ipst->ips_ip_pmtu_min) { - mtu = ipst->ips_ip_pmtu_min; - disable_pmtud = B_TRUE; - } + disable_pmtud = B_TRUE; } - /* Fool the ULP into believing our guessed PMTU. */ - icmph->icmph_du_zero = 0; - icmph->icmph_du_mtu = htons(mtu); - } - if (disable_pmtud) - ire->ire_frag_flag = 0; - /* Reduce the IRE max frag value as advised. */ - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; } - mutex_exit(&ire->ire_lock); - DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *, - ire, int, orig_mtu, int, mtu); } - rw_exit(&first_ire->ire_bucket->irb_lock); - ire_refrele(first_ire); - return (B_TRUE); + if (disable_pmtud) + dce->dce_flags |= DCEF_TOO_SMALL_PMTU; + else + dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; + + dce->dce_pmtu = MIN(old_mtu, mtu); + /* Prepare to send the new max frag size for the ULP. */ + icmph->icmph_du_zero = 0; + icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu); + DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *, + dce, int, orig_mtu, int, mtu); + + /* We now have a PMTU for sure */ + dce->dce_flags |= DCEF_PMTU; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + /* + * After dropping the lock the new value is visible to everyone. + * Then we bump the generation number so any cached values reinspect + * the dce_t. + */ + dce_increment_generation(dce); + dce_refrele(dce); } /* - * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout + * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4 * calls this function. 
*/ static mblk_t * -icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) +icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha) { - ipha_t *ipha; - icmph_t *icmph; - ipha_t *in_ipha; int length; ASSERT(mp->b_datap->db_type == M_DATA); - /* - * For Self-encapsulated packets, we added an extra IP header - * without the options. Inner IP header is the one from which - * the outer IP header was formed. Thus, we need to remove the - * outer IP header. To do this, we pullup the whole message - * and overlay whatever follows the outer IP header over the - * outer IP header. - */ - - if (!pullupmsg(mp, -1)) - return (NULL); - - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); + /* icmp_inbound_v4 has already pulled up the whole error packet */ + ASSERT(mp->b_cont == NULL); /* - * The length that we want to overlay is following the inner - * IP header. Subtracting the IP header + icmp header + outer - * IP header's length should give us the length that we want to - * overlay. + * The length that we want to overlay is the inner header + * and what follows it. */ - length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - - hdr_length; + length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr); + /* - * Overlay whatever follows the inner header over the + * Overlay the inner header and whatever follows it over the * outer header. */ bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); - /* Set the wptr to account for the outer header */ - mp->b_wptr -= hdr_length; + /* Adjust for what we removed */ + mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha; return (mp); } /* - * Fanout for ICMP errors containing IP-in-IPv4 packets. Returns B_TRUE if a - * tunnel consumed the message, and B_FALSE otherwise. 
- */ -static boolean_t -icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, - ip_stack_t *ipst) -{ - conn_t *connp; - - if ((connp = ipcl_iptun_classify_v4(&ripha->ipha_src, &ripha->ipha_dst, - ipst)) == NULL) - return (B_FALSE); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, first_mp, NULL); - CONN_DEC_REF(connp); - return (B_TRUE); -} - -/* * Try to pass the ICMP message upstream in case the ULP cares. * * If the packet that caused the ICMP error is secure, we send @@ -2400,25 +2187,22 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, * * IFN could have been generated locally or by some router. * - * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. + * LOCAL : ire_send_wire (before calling ipsec_out_process) can call + * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN. * This happens because IP adjusted its value of MTU on an * earlier IFN message and could not tell the upper layer, * the new adjusted value of MTU e.g. Packet was encrypted * or there was not enough information to fanout to upper - * layers. Thus on the next outbound datagram, ip_wput_ire + * layers. Thus on the next outbound datagram, ire_send_wire * generates the IFN, where IPsec processing has *not* been * done. * - * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed - * could have generated this. This happens because ire_max_frag - * value in IP was set to a new value, while the IPsec processing - * was being done and after we made the fragmentation check in - * ip_wput_ire. Thus on return from IPsec processing, - * ip_wput_ipsec_out finds that the new length is > ire_max_frag - * and generates the IFN. As IPsec processing is over, we fanout - * to AH/ESP to remove the header. 
+ * Note that we retain ixa_fragsize across IPsec thus once + * we have picking ixa_fragsize and entered ipsec_out_process we do + * no change the fragsize even if the path MTU changes before + * we reach ip_output_post_ipsec. * - * In both these cases, ipsec_in_loopback will be set indicating + * In the local case, IRAF_LOOPBACK will be set indicating * that IFN was generated locally. * * ROUTER : IFN could be secure or non-secure. @@ -2432,45 +2216,38 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, * If the packet in error does not have AH/ESP, we handle it * like any other case. * - * * NON_SECURE : If the packet in error has AH/ESP headers, - * we attach a dummy ipsec_in and send it up to AH/ESP - * for validation. AH/ESP will verify whether there is a + * * NON_SECURE : If the packet in error has AH/ESP headers, we send it + * up to AH/ESP for validation. AH/ESP will verify whether there is a * valid SA or not and send it back. We will fanout again if * we have more data in the packet. * * If the packet in error does not have AH/ESP, we handle it * like any other case. + * + * The caller must have called icmp_inbound_verify_v4. 
*/ static void -icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, - icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length, - boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, - zoneid_t zoneid) -{ - uint16_t *up; /* Pointer to ports in ULP header */ - uint32_t ports; /* reversed ports for fanout */ - ipha_t ripha; /* With reversed addresses */ - mblk_t *first_mp; - ipsec_in_t *ii; - tcph_t *tcph; - conn_t *connp; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; +icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) +{ + uint16_t *up; /* Pointer to ports in ULP header */ + uint32_t ports; /* reversed ports for fanout */ + ipha_t ripha; /* With reversed addresses */ + ipha_t *ipha; /* Inner IP header */ + uint_t hdr_length; /* Inner IP header length */ + tcpha_t *tcpha; + conn_t *connp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ill_t *rill = ira->ira_rill; - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); + /* Caller already pulled up everything. 
*/ + ipha = (ipha_t *)&icmph[1]; + ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr); + ASSERT(mp->b_cont == NULL); - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - } else { - ii = NULL; - } + hdr_length = IPH_HDR_LENGTH(ipha); + ira->ira_protocol = ipha->ipha_protocol; /* * We need a separate IP header with the source and destination @@ -2482,249 +2259,223 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, ripha.ipha_protocol = ipha->ipha_protocol; ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length; - ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n", + ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n", ripha.ipha_protocol, ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), icmph->icmph_type, icmph->icmph_code)); switch (ipha->ipha_protocol) { case IPPROTO_UDP: - /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * transport header. - */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } up = (uint16_t *)((uchar_t *)ipha + hdr_length); /* Attempt to find a client stream based on port. */ - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; - ip2dbg(("icmp_inbound_error: UDP ports %d to %d\n", + ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n", ntohs(up[0]), ntohs(up[1]))); - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - - ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0, - mctl_present, ip_policy, recv_ill, zoneid); + /* Note that we send error to all matches. */ + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_TCP: /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * transport header. 
- */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - /* * Find a TCP client stream for this packet. * Note that we do a reverse lookup since the header is * in the form we sent it out. */ - tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); - connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN, + tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); + connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, ipst); if (connp == NULL) goto discard_pkt; - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, - SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR); + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + ira->ira_flags |= IRAF_ICMP_ERROR; + ira->ira_ill = ira->ira_rill = NULL; + if (IPCL_IS_TCP(connp)) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recvicmp, connp, ira, SQ_FILL, + SQTAG_TCP_INPUT_ICMP_ERR); + } else { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_SCTP: - /* - * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of - * transport header, in the first mp. 
- */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_SCTP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } up = (uint16_t *)((uchar_t *)ipha + hdr_length); /* Find a SCTP client stream for this packet. */ ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0, - mctl_present, ip_policy, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_sctp(mp, &ripha, NULL, ports, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_ESP: - case IPPROTO_AH: { - int ipsec_rc; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - /* - * We need a IPSEC_IN in the front to fanout to AH/ESP. - * We will re-use the IPSEC_IN if it is already present as - * AH/ESP will not affect any fields in the IPSEC_IN for - * ICMP errors. If there is no IPSEC_IN, allocate a new - * one and attach it in the front. - */ - if (ii != NULL) { - /* - * ip_fanout_proto_again converts the ICMP errors - * that come back from AH/ESP to M_DATA so that - * if it is non-AH/ESP and we do a pullupmsg in - * this function, it would work. Convert it back - * to M_CTL before we send up as this is a ICMP - * error. This could have been generated locally or - * by some router. Validate the inner IPsec - * headers. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - DB_TYPE(first_mp->b_cont) = M_CTL; - } else { - /* - * IPSEC_IN is not present. We attach a ipsec_in - * message and send up to IPsec for validating - * and removing the IPsec headers. 
Clear - * ipsec_in_secure so that when we return - * from IPsec, we don't mistakenly think that this - * is a secure packet came from the network. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (first_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - DB_TYPE(mp) = M_CTL; - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - } - + case IPPROTO_AH: if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); + ip_proto_not_sup(mp, ira); return; } if (ipha->ipha_protocol == IPPROTO_ESP) - ipsec_rc = ipsecesp_icmp_error(first_mp); + mp = ipsecesp_icmp_error(mp, ira); else - ipsec_rc = ipsecah_icmp_error(first_mp); - if (ipsec_rc == IPSEC_STATUS_FAILED) + mp = ipsecah_icmp_error(mp, ira); + if (mp == NULL) + return; + + /* Just in case ipsec didn't preserve the NULL b_cont */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) + goto discard_pkt; + } + + /* + * Note that ira_pktlen and ira_ip_hdr_length are no longer + * correct, but we don't use them any more here. + * + * If succesful, the mp has been modified to not include + * the ESP/AH header so we can fanout to the ULP's icmp + * error handler. + */ + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) + goto truncated; + + /* Verify the modified message before any further processes. 
*/ + ipha = (ipha_t *)mp->b_rptr; + hdr_length = IPH_HDR_LENGTH(ipha); + icmph = (icmph_t *)&mp->b_rptr[hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); return; + } - ip_fanout_proto_again(first_mp, ill, recv_ill, NULL); + icmp_inbound_error_fanout_v4(mp, icmph, ira); return; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if (ipha->ipha_protocol == IPPROTO_ENCAP) { - ipha_t *in_ipha; - if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + - hdr_length + sizeof (ipha_t) - - mp->b_rptr)) { + case IPPROTO_ENCAP: { + /* Look for self-encapsulated packets that caused an error */ + ipha_t *in_ipha; + + /* + * Caller has verified that length has to be + * at least the size of IP header. + */ + ASSERT(hdr_length >= sizeof (ipha_t)); + /* + * Check the sanity of the inner IP header like + * we did for the outer header. + */ + in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); + if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { + goto discard_pkt; + } + if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { + goto discard_pkt; + } + /* Check for Self-encapsulated tunnels */ + if (in_ipha->ipha_src == ipha->ipha_src && + in_ipha->ipha_dst == ipha->ipha_dst) { + + mp = icmp_inbound_self_encap_error_v4(mp, ipha, + in_ipha); + if (mp == NULL) + goto discard_pkt; + + /* + * Just in case self_encap didn't preserve the NULL + * b_cont + */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; } /* - * Caller has verified that length has to be - * at least the size of IP header. + * Note that ira_pktlen and ira_ip_hdr_length are no + * longer correct, but we don't use them any more here. */ - ASSERT(hdr_length >= sizeof (ipha_t)); + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) + goto truncated; + /* - * Check the sanity of the inner IP header like - * we did for the outer header. 
+ * Verify the modified message before any further + * processes. */ - in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); - if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION) || - IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) - goto discard_pkt; - /* Check for Self-encapsulated tunnels */ - if (in_ipha->ipha_src == ipha->ipha_src && - in_ipha->ipha_dst == ipha->ipha_dst) { - - mp = icmp_inbound_self_encap_error(mp, - iph_hdr_length, hdr_length); - if (mp == NULL) - goto discard_pkt; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - hdr_length = IPH_HDR_LENGTH(ipha); - /* - * The packet in error is self-encapsualted. - * And we are finding it further encapsulated - * which we could not have possibly generated. - */ - if (ipha->ipha_protocol == IPPROTO_ENCAP) { - goto discard_pkt; - } - icmp_inbound_error_fanout(q, ill, first_mp, - icmph, ipha, iph_hdr_length, hdr_length, - mctl_present, ip_policy, recv_ill, zoneid); + ipha = (ipha_t *)mp->b_rptr; + hdr_length = IPH_HDR_LENGTH(ipha); + icmph = (icmph_t *)&mp->b_rptr[hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); return; } - } - DB_TYPE(mp) = M_CTL; - if (icmp_inbound_iptun_fanout(first_mp, &ripha, ill, ipst)) + /* + * The packet in error is self-encapsualted. + * And we are finding it further encapsulated + * which we could not have possibly generated. + */ + if (ipha->ipha_protocol == IPPROTO_ENCAP) { + goto discard_pkt; + } + icmp_inbound_error_fanout_v4(mp, icmph, ira); return; + } + /* No self-encapsulated */ + /* FALLTHRU */ + } + case IPPROTO_IPV6: + if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src, + &ripha.ipha_dst, ipst)) != NULL) { + ira->ira_flags |= IRAF_ICMP_ERROR; + connp->conn_recvicmp(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_flags &= ~IRAF_ICMP_ERROR; + return; + } /* * No IP tunnel is interested, fallthrough and see * if a raw socket will want it. 
*/ /* FALLTHRU */ default: - ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present, - ip_policy, recv_ill, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_proto_v4(mp, &ripha, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* NOTREACHED */ discard_pkt: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); -drop_pkt:; - ip1dbg(("icmp_inbound_error_fanout: drop pkt\n")); - freemsg(first_mp); + ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n")); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); } /* @@ -2747,6 +2498,16 @@ ipoptp_first(ipoptp_t *optp, ipha_t *ipha) return (ipoptp_next(optp)); } +/* Like above but without an ipha_t */ +uint8_t +ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt) +{ + optp->ipoptp_next = opt; + optp->ipoptp_end = optp->ipoptp_next + totallen; + optp->ipoptp_flags = 0; + return (ipoptp_next(optp)); +} + /* * Common IP options parser: extract next option. */ @@ -2858,38 +2619,55 @@ ipoptp_next(ipoptp_t *optp) /* * Use the outgoing IP header to create an IP_OPTIONS option the way * it was passed down from the application. + * + * This is compatible with BSD in that it returns + * the reverse source route with the final destination + * as the last entry. The first 4 bytes of the option + * will contain the final destination. 
*/ int -ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) +ip_opt_get_user(conn_t *connp, uchar_t *buf) { ipoptp_t opts; - const uchar_t *opt; + uchar_t *opt; uint8_t optval; uint8_t optlen; uint32_t len = 0; - uchar_t *buf1 = buf; + uchar_t *buf1 = buf; + uint32_t totallen; + ipaddr_t dst; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (0); + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return (0); buf += IP_ADDR_LEN; /* Leave room for final destination */ len += IP_ADDR_LEN; bzero(buf1, IP_ADDR_LEN); - /* - * OK to cast away const here, as we don't store through the returned - * opts.ipoptp_cur pointer. - */ - for (optval = ipoptp_first(&opts, (ipha_t *)ipha); + dst = connp->conn_faddr_v4; + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { int off; opt = opts.ipoptp_cur; + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } optlen = opts.ipoptp_len; + switch (optval) { case IPOPT_SSRR: case IPOPT_LSRR: /* - * Insert ipha_dst as the first entry in the source + * Insert destination as the first entry in the source * route and move down the entries on step. * The last entry gets placed at buf1. 
*/ @@ -2902,8 +2680,9 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) /* No entries in source route */ break; } - /* Last entry in source route */ - bcopy(opt + off, buf1, IP_ADDR_LEN); + /* Last entry in source route if not already set */ + if (dst == INADDR_ANY) + bcopy(opt + off, buf1, IP_ADDR_LEN); off -= IP_ADDR_LEN; while (off > 0) { @@ -2913,19 +2692,12 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) off -= IP_ADDR_LEN; } /* ipha_dst into first slot */ - bcopy(&ipha->ipha_dst, - buf + off + IP_ADDR_LEN, + bcopy(&dst, buf + off + IP_ADDR_LEN, IP_ADDR_LEN); buf += optlen; len += optlen; break; - case IPOPT_COMSEC: - case IPOPT_SECURITY: - /* if passing up a label is not ok, then remove */ - if (is_system_labeled()) - break; - /* FALLTHROUGH */ default: bcopy(opt, buf, optlen); buf += optlen; @@ -3007,57 +2779,46 @@ icmp_options_update(ipha_t *ipha) /* * Process received ICMP Redirect messages. + * Assumes the caller has verified that the headers are in the pulled up mblk. + * Consumes mp. */ static void -icmp_redirect(ill_t *ill, mblk_t *mp) +icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira) { - ipha_t *ipha; - int iph_hdr_length; - icmph_t *icmph; - ipha_t *ipha_err; - ire_t *ire; - ire_t *prev_ire; - ire_t *save_ire; - ipaddr_t src, dst, gateway; - iulp_t ulp_info = { 0 }; - int error; - ip_stack_t *ipst; + ire_t *ire, *nire; + ire_t *prev_ire; + ipaddr_t src, dst, gateway; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipha_t *inner_ipha; /* Inner IP header */ - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - ipha = (ipha_t *)mp->b_rptr; - iph_hdr_length = IPH_HDR_LENGTH(ipha); - if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < - sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha_err = (ipha_t *)&icmph[1]; + /* Caller already pulled up everything. 
*/ + inner_ipha = (ipha_t *)&icmph[1]; src = ipha->ipha_src; - dst = ipha_err->ipha_dst; + dst = inner_ipha->ipha_dst; gateway = icmph->icmph_rd_gateway; /* Make sure the new gateway is reachable somehow. */ - ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL); /* * Make sure we had a route for the dest in question and that * that route was pointing to the old gateway (the source of the * redirect packet.) + * Note: this merely says that there is some IRE which matches that + * gateway; not that the longest match matches that gateway. */ - prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES, - NULL, MATCH_IRE_GW, ipst); + prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_GW, 0, ipst, NULL); /* * Check that * the redirect was not from ourselves * the new gateway and the old gateway are directly reachable */ - if (!prev_ire || - !ire || - ire->ire_type == IRE_LOCAL) { + if (prev_ire == NULL || ire == NULL || + (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || + (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_type & IRE_IF_ALL)) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); + ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill); freemsg(mp); if (ire != NULL) ire_refrele(ire); @@ -3066,49 +2827,9 @@ icmp_redirect(ill_t *ill, mblk_t *mp) return; } - /* - * Should we use the old ULP info to create the new gateway? From - * a user's perspective, we should inherit the info so that it - * is a "smooth" transition. If we do not do that, then new - * connections going thru the new gateway will have no route metrics, - * which is counter-intuitive to user. From a network point of - * view, this may or may not make sense even though the new gateway - * is still directly connected to us so the route metrics should not - * change much. 
- * - * But if the old ire_uinfo is not initialized, we do another - * recursive lookup on the dest using the new gateway. There may - * be a route to that. If so, use it to initialize the redirect - * route. - */ - if (prev_ire->ire_uinfo.iulp_set) { - bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - } else { - ire_t *tmp_ire; - ire_t *sire; - - tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire, - ALL_ZONES, 0, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT), - ipst); - if (sire != NULL) { - bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - /* - * If sire != NULL, ire_ftable_lookup() should not - * return a NULL value. - */ - ASSERT(tmp_ire != NULL); - ire_refrele(tmp_ire); - ire_refrele(sire); - } else if (tmp_ire != NULL) { - bcopy(&tmp_ire->ire_uinfo, &ulp_info, - sizeof (iulp_t)); - ire_refrele(tmp_ire); - } - } - if (prev_ire->ire_type == IRE_CACHE) - ire_delete(prev_ire); ire_refrele(prev_ire); + ire_refrele(ire); + /* * TODO: more precise handling for cases 0, 2, 3, the latter two * require TOS routing @@ -3121,47 +2842,42 @@ icmp_redirect(ill_t *ill, mblk_t *mp) case 3: break; default: - freemsg(mp); BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); - ire_refrele(ire); + ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill); + freemsg(mp); return; } /* * Create a Route Association. This will allow us to remember that * someone we believe told us to use the particular gateway. 
*/ - save_ire = ire; ire = ire_create( (uchar_t *)&dst, /* dest addr */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&save_ire->ire_src_addr, /* source addr */ (uchar_t *)&gateway, /* gateway addr */ - &save_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* no rfq */ - NULL, /* no stq */ IRE_HOST, - NULL, /* ipif */ - 0, /* cmask */ - 0, /* phandle */ - 0, /* ihandle */ + NULL, /* ill */ + ALL_ZONES, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), - &ulp_info, NULL, /* tsol_gc_t */ - NULL, /* gcgrp */ ipst); if (ire == NULL) { freemsg(mp); - ire_refrele(save_ire); return; } - error = ire_add(&ire, NULL, NULL, NULL, B_FALSE); - ire_refrele(save_ire); - atomic_inc_32(&ipst->ips_ip_redirect_cnt); + nire = ire_add(ire); + /* Check if it was a duplicate entry */ + if (nire != NULL && nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + nire = NULL; + } + ire = nire; + if (ire != NULL) { + ire_refrele(ire); /* Held in ire_add */ - if (error == 0) { - ire_refrele(ire); /* Held in ire_add_v4 */ /* tell routing sockets that we received a redirect */ ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, @@ -3173,8 +2889,8 @@ icmp_redirect(ill_t *ill, mblk_t *mp) * This together with the added IRE has the effect of * modifying an existing redirect. */ - prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst); + prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL, + ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL); if (prev_ire != NULL) { if (prev_ire ->ire_flags & RTF_DYNAMIC) ire_delete(prev_ire); @@ -3186,29 +2902,24 @@ icmp_redirect(ill_t *ill, mblk_t *mp) /* * Generate an ICMP parameter problem message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. 
*/ static void -icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira) { icmph_t icmph; - boolean_t mctl_present; - mblk_t *first_mp; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_PARAM_PROBLEM; icmph.icmph_pp_ptr = ptr; BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* @@ -3217,15 +2928,11 @@ icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, * Note: assumes that icmp_pkt_err_ok has been called to verify that * an icmp error packet can be sent. * Assigns an appropriate source address to the packet. If ipha_dst is - * one of our addresses use it for source. Otherwise pick a source based - * on a route lookup back to ipha_src. - * Note that ipha_src must be set here since the - * packet is likely to arrive on an ill queue in ip_wput() which will - * not set a source address. + * one of our addresses use it for source. Otherwise let ip_output_simple + * pick the source address. */ static void -icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira) { ipaddr_t dst; icmph_t *icmph; @@ -3235,115 +2942,62 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, mblk_t *mp1; ipaddr_t src; ire_t *ire; - mblk_t *ipsec_mp; - ipsec_out_t *io = NULL; - - if (mctl_present) { - /* - * If it is : - * - * 1) a IPSEC_OUT, then this is caused by outbound - * datagram originating on this host. IPsec processing - * may or may not have been done. 
Refer to comments above - * icmp_inbound_error_fanout for details. - * - * 2) a IPSEC_IN if we are generating a icmp_message - * for an incoming datagram destined for us i.e called - * from ip_fanout_send_icmp. - */ - ipsec_info_t *in; - ipsec_mp = mp; - mp = ipsec_mp->b_cont; + ip_xmit_attr_t ixas; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - in = (ipsec_info_t *)ipsec_mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; + ipha = (ipha_t *)mp->b_rptr; - ASSERT(in->ipsec_info_type == IPSEC_OUT || - in->ipsec_info_type == IPSEC_IN); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - if (in->ipsec_info_type == IPSEC_IN) { - /* - * Convert the IPSEC_IN to IPSEC_OUT. - */ - if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - return; - } - io = (ipsec_out_t *)ipsec_mp->b_rptr; - } else { - ASSERT(in->ipsec_info_type == IPSEC_OUT); - io = (ipsec_out_t *)in; - /* - * Clear out ipsec_out_proc_begin, so we do a fresh - * ire lookup. - */ - io->ipsec_out_proc_begin = B_FALSE; - } - ASSERT(zoneid != ALL_ZONES); - /* - * The IPSEC_IN (now an IPSEC_OUT) didn't have its zoneid - * initialized. We need to do that now. - */ - io->ipsec_out_zoneid = zoneid; - } else { + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* - * This is in clear. The icmp message we are building - * here should go out in clear. + * Apply IPsec based on how IPsec was applied to + * the packet that had the error. * - * Pardon the convolution of it all, but it's easier to - * allocate a "use cleartext" IPSEC_IN message and convert - * it than it is to allocate a new one. 
+ * If it was an outbound packet that caused the ICMP + * error, then the caller will have setup the IRA + * appropriately. */ - ipsec_in_t *ii; - ASSERT(DB_TYPE(mp) == M_DATA); - ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (ipsec_mp == NULL) { - freemsg(mp); + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ return; } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - ipsec_mp->b_cont = mp; - ipha = (ipha_t *)mp->b_rptr; + } else { /* - * Convert the IPSEC_IN to IPSEC_OUT. + * This is in clear. The icmp message we are building + * here should go out in clear, independent of our policy. */ - if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - return; - } - io = (ipsec_out_t *)ipsec_mp->b_rptr; + ixas.ixa_flags |= IXAF_NO_IPSEC; } /* Remember our eventual destination */ dst = ipha->ipha_src; - ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), - NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL && - (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) { + /* + * If the packet was for one of our unicast addresses, make + * sure we respond with that as the source. Otherwise + * have ip_output_simple pick the source address. 
+ */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, + (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL, + MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL); + if (ire != NULL) { + ire_refrele(ire); src = ipha->ipha_dst; } else { - if (ire != NULL) - ire_refrele(ire); - ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY), - ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - freemsg(ipsec_mp); - return; - } - src = ire->ire_src_addr; + src = INADDR_ANY; + ixas.ixa_flags |= IXAF_SET_SOURCE; } - if (ire != NULL) - ire_refrele(ire); - /* * Check if we can send back more then 8 bytes in addition to * the IP header. We try to send 64 bytes of data and the internal @@ -3352,10 +3006,10 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, len_needed = IPH_HDR_LENGTH(ipha); if (ipha->ipha_protocol == IPPROTO_ENCAP || ipha->ipha_protocol == IPPROTO_IPV6) { - if (!pullupmsg(mp, -1)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - freemsg(ipsec_mp); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); return; } ipha = (ipha_t *)mp->b_rptr; @@ -3376,28 +3030,23 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, (void) adjmsg(mp, len_needed - msg_len); msg_len = len_needed; } - /* Make sure we propagate the cred/label for TX */ - mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp); + mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED); if (mp1 == NULL) { BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors); - freemsg(ipsec_mp); + freemsg(mp); return; } mp1->b_cont = mp; mp = mp1; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL && - ipsec_mp->b_rptr == (uint8_t *)io && - io->ipsec_out_type == IPSEC_OUT); - ipsec_mp->b_cont = mp; /* - * Set ipsec_out_icmp_loopback so we can let the ICMP messages this + * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this * node generates be accepted in peace by all on-host destinations. 
* If we do NOT assume that all on-host destinations trust * self-generated ICMP messages, then rework here, ip6.c, and spd.c. - * (Look for ipsec_out_icmp_loopback). + * (Look for IXAF_TRUSTED_ICMP). */ - io->ipsec_out_icmp_loopback = B_TRUE; + ixas.ixa_flags |= IXAF_TRUSTED_ICMP; ipha = (ipha_t *)mp->b_rptr; mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len); @@ -3416,7 +3065,9 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, icmph->icmph_checksum = 0; icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); - put(q, ipsec_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /* @@ -3480,37 +3131,30 @@ icmp_err_rate_limit(ip_stack_t *ipst) * ICMP error packet should be sent. */ static mblk_t * -icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) +icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; icmph_t *icmph; ipha_t *ipha; uint_t len_needed; - ire_t *src_ire; - ire_t *dst_ire; if (!mp) return (NULL); ipha = (ipha_t *)mp->b_rptr; if (ip_csum_hdr(ipha)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, NULL); freemsg(mp); return (NULL); } - src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire != NULL || dst_ire != NULL || + if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST || + ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST || CLASSD(ipha->ipha_dst) || CLASSD(ipha->ipha_src) || (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { /* Note: only errors to the fragment with offset 0 */ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); freemsg(mp); - if (src_ire != NULL) - ire_refrele(src_ire); - if (dst_ire != NULL) - ire_refrele(dst_ire); return (NULL); } if (ipha->ipha_protocol == IPPROTO_ICMP) { @@ 
-3546,7 +3190,7 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) * If this is a labeled system, then check to see if we're allowed to * send a response to this particular sender. If not, then just drop. */ - if (is_system_labeled() && !tsol_can_reply_error(mp)) { + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); freemsg(mp); @@ -3565,956 +3209,178 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) } /* - * Generate an ICMP redirect message. + * Called when a packet was sent out the same link that it arrived on. + * Check if it is ok to send a redirect and then send it. */ -static void -icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst) +void +ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire, + ip_recv_attr_t *ira) { - icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipaddr_t src, nhop; + mblk_t *mp1; + ire_t *nhop_ire; /* - * We are called from ip_rput where we could - * not have attached an IPSEC_IN. - */ - ASSERT(mp->b_datap->db_type == M_DATA); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { + * Check the source address to see if it originated + * on the same logical subnet it is going back out on. + * If so, we should be able to send it a redirect. + * Avoid sending a redirect if the destination + * is directly connected (i.e., we matched an IRE_ONLINK), + * or if the packet was source routed out this interface. + * + * We avoid sending a redirect if the + * destination is directly connected + * because it is possible that multiple + * IP subnets may have been configured on + * the link, and the source may not + * be on the same subnet as ip destination, + * even though they are on the same + * physical link. 
+ */ + if ((ire->ire_type & IRE_ONLINK) || + ip_source_routed(ipha, ipst)) return; - } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_REDIRECT; - icmph.icmph_code = 1; - icmph.icmph_rd_gateway = gateway; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); - /* Redirects sent by router, and router is global zone */ - icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst); -} -/* - * Generate an ICMP time exceeded message. - */ -void -icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, - ip_stack_t *ipst) -{ - icmph_t icmph; - boolean_t mctl_present; - mblk_t *first_mp; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + nhop_ire = ire_nexthop(ire); + if (nhop_ire == NULL) return; - } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_TIME_EXCEEDED; - icmph.icmph_code = code; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); -} -/* - * Generate an ICMP unreachable message. 
- */ -void -icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, - ip_stack_t *ipst) -{ - icmph_t icmph; - mblk_t *first_mp; - boolean_t mctl_present; + nhop = nhop_ire->ire_addr; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); + if (nhop_ire->ire_type & IRE_IF_CLONE) { + ire_t *ire2; - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); - return; + /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */ + mutex_enter(&nhop_ire->ire_lock); + ire2 = nhop_ire->ire_dep_parent; + if (ire2 != NULL) + ire_refhold(ire2); + mutex_exit(&nhop_ire->ire_lock); + ire_refrele(nhop_ire); + nhop_ire = ire2; } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_DEST_UNREACHABLE; - icmph.icmph_code = code; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); - ip2dbg(("send icmp destination unreachable code %d\n", code)); - icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present, - zoneid, ipst); -} - -/* - * Attempt to start recovery of an IPv4 interface that's been shut down as a - * duplicate. As long as someone else holds the address, the interface will - * stay down. When that conflict goes away, the interface is brought back up. - * This is done so that accidental shutdowns of addresses aren't made - * permanent. Your server will recover from a failure. - * - * For DHCP, recovery is not done in the kernel. Instead, it's handled by a - * user space process (dhcpagent). - * - * Recovery completes if ARP reports that the address is now ours (via - * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. - * - * This function is entered on a timer expiry; the ID is in ipif_recovery_id. - */ -static void -ipif_dup_recovery(void *arg) -{ - ipif_t *ipif = arg; - ill_t *ill = ipif->ipif_ill; - mblk_t *arp_add_mp; - mblk_t *arp_del_mp; - ip_stack_t *ipst = ill->ill_ipst; - - ipif->ipif_recovery_id = 0; - - /* - * No lock needed for moving or condemned check, as this is just an - * optimization. 
- */ - if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || - (ipif->ipif_flags & IPIF_POINTOPOINT) || - (ipif->ipif_state_flags & (IPIF_CONDEMNED))) { - /* No reason to try to bring this address back. */ + if (nhop_ire == NULL) return; - } - /* ACE_F_UNVERIFIED restarts DAD */ - if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) - goto alloc_fail; - - if (ipif->ipif_arp_del_mp == NULL) { - if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) - goto alloc_fail; - ipif->ipif_arp_del_mp = arp_del_mp; - } + ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE)); - putnext(ill->ill_rq, arp_add_mp); - return; + src = ipha->ipha_src; -alloc_fail: /* - * On allocation failure, just restart the timer. Note that the ipif - * is down here, so no other thread could be trying to start a recovery - * timer. The ill_lock protects the condemned flag and the recovery - * timer ID. + * We look at the interface ire for the nexthop, + * to see if ipha_src is in the same subnet + * as the nexthop. */ - freemsg(arp_add_mp); - mutex_enter(&ill->ill_lock); - if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && - !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, - MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - } - mutex_exit(&ill->ill_lock); -} - -/* - * This is for exclusive changes due to ARP. Either tear down an interface due - * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 
- */ -/* ARGSUSED */ -static void -ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) -{ - ill_t *ill = rq->q_ptr; - arh_t *arh; - ipaddr_t src; - ipif_t *ipif; - char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ - char hbuf[MAC_STR_LEN]; - char sbuf[INET_ADDRSTRLEN]; - const char *failtype; - boolean_t bring_up; - ip_stack_t *ipst = ill->ill_ipst; - - switch (((arcn_t *)mp->b_rptr)->arcn_code) { - case AR_CN_READY: - failtype = NULL; - bring_up = B_TRUE; - break; - case AR_CN_FAILED: - failtype = "in use"; - bring_up = B_FALSE; - break; - default: - failtype = "claimed"; - bring_up = B_FALSE; - break; - } - - arh = (arh_t *)mp->b_cont->b_rptr; - bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); - - (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, - sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - ipif->ipif_lcl_addr != src) { - continue; - } - - /* - * If we failed on a recovery probe, then restart the timer to - * try again later. - */ - if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && - !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & IPIF_CONDEMNED) && - ipst->ips_ip_dup_recovery > 0 && - ipif->ipif_recovery_id == 0) { - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - continue; - } - - /* - * If what we're trying to do has already been done, then do - * nothing. 
- */ - if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) - continue; - - ipif_get_name(ipif, ibuf, sizeof (ibuf)); - - if (failtype == NULL) { - cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, - ibuf); - } else { - cmn_err(CE_WARN, "%s has duplicate address %s (%s " - "by %s); disabled", ibuf, sbuf, failtype, hbuf); - } - - if (bring_up) { - ASSERT(ill->ill_dl_up); - /* - * Free up the ARP delete message so we can allocate - * a fresh one through the normal path. - */ - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - if (ipif_resolver_up(ipif, Res_act_initial) != - EINPROGRESS) { - ipif->ipif_addr_ready = 1; - (void) ipif_up_done(ipif); - ASSERT(ill->ill_move_ipif == NULL); - } - continue; - } - - mutex_enter(&ill->ill_lock); - ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); - ipif->ipif_flags |= IPIF_DUPLICATE; - ill->ill_ipif_dup_count++; - mutex_exit(&ill->ill_lock); + if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) { /* - * Already exclusive on the ill; no need to handle deferred - * processing here. + * The source is directly connected. 
*/ - (void) ipif_down(ipif, NULL, NULL); - ipif_down_tail(ipif); - mutex_enter(&ill->ill_lock); - if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & IPIF_CONDEMNED) && - ipst->ips_ip_dup_recovery > 0) { - ASSERT(ipif->ipif_recovery_id == 0); - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + mp1 = copymsg(mp); + if (mp1 != NULL) { + icmp_send_redirect(mp1, nhop, ira); } - mutex_exit(&ill->ill_lock); } - freemsg(mp); -} - -/* ARGSUSED */ -static void -ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) -{ - ill_t *ill = rq->q_ptr; - arh_t *arh; - ipaddr_t src; - ipif_t *ipif; - - arh = (arh_t *)mp->b_cont->b_rptr; - bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src) - (void) ipif_resolver_up(ipif, Res_act_defend); - } - freemsg(mp); + ire_refrele(nhop_ire); } /* - * News from ARP. ARP sends notification of interesting events down - * to its clients using M_CTL messages with the interesting ARP packet - * attached via b_cont. - * The interesting event from a device comes up the corresponding ARP-IP-DEV - * queue as opposed to ARP sending the message to all the clients, i.e. all - * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache - * table if a cache IRE is found to delete all the entries for the address in - * the packet. + * Generate an ICMP redirect message. 
*/ static void -ip_arp_news(queue_t *q, mblk_t *mp) +icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira) { - arcn_t *arcn; - arh_t *arh; - ire_t *ire = NULL; - char hbuf[MAC_STR_LEN]; - char sbuf[INET_ADDRSTRLEN]; - ipaddr_t src; - in6_addr_t v6src; - boolean_t isv6 = B_FALSE; - ipif_t *ipif; - ill_t *ill; - ip_stack_t *ipst; - - if (CONN_Q(q)) { - conn_t *connp = Q_TO_CONN(q); - - ipst = connp->conn_netstack->netstack_ip; - } else { - ill_t *ill = (ill_t *)q->q_ptr; - - ipst = ill->ill_ipst; - } + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { - if (q->q_next) { - putnext(q, mp); - } else - freemsg(mp); - return; - } - arh = (arh_t *)mp->b_cont->b_rptr; - /* Is it one we are interested in? */ - if (BE16_TO_U16(arh->arh_proto) == ETHERTYPE_IPV6) { - isv6 = B_TRUE; - bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, - IPV6_ADDR_LEN); - } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { - bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, - IP_ADDR_LEN); - } else { - freemsg(mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } - - ill = q->q_ptr; - arcn = (arcn_t *)mp->b_rptr; - switch (arcn->arcn_code) { - case AR_CN_BOGON: - /* - * Someone is sending ARP packets with a source protocol - * address that we have published and for which we believe our - * entry is authoritative and (when ill_arp_extend is set) - * verified to be unique on the network. - * - * The ARP module internally handles the cases where the sender - * is just probing (for DAD) and where the hardware address of - * a non-authoritative entry has changed. Thus, these are the - * real conflicts, and we have to do resolution. - * - * We back away quickly from the address if it's from DHCP or - * otherwise temporary and hasn't been used recently (or at - * all). 
We'd like to include "deprecated" addresses here as - * well (as there's no real reason to defend something we're - * discarding), but IPMP "reuses" this flag to mean something - * other than the standard meaning. - * - * If the ARP module above is not extended (meaning that it - * doesn't know how to defend the address), then we just log - * the problem as we always did and continue on. It's not - * right, but there's little else we can do, and those old ATM - * users are going away anyway. - */ - (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, - hbuf, sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - if (isv6) { - ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL, - ipst); - } else { - ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst); - } - if (ire != NULL && IRE_IS_LOCAL(ire)) { - uint32_t now; - uint32_t maxage; - clock_t lused; - uint_t maxdefense; - uint_t defs; - - /* - * First, figure out if this address hasn't been used - * in a while. If it hasn't, then it's a better - * candidate for abandoning. - */ - ipif = ire->ire_ipif; - ASSERT(ipif != NULL); - now = gethrestime_sec(); - maxage = now - ire->ire_create_time; - if (maxage > ipst->ips_ip_max_temp_idle) - maxage = ipst->ips_ip_max_temp_idle; - lused = drv_hztousec(ddi_get_lbolt() - - ire->ire_last_used_time) / MICROSEC + 1; - if (lused >= maxage && (ipif->ipif_flags & - (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) - maxdefense = ipst->ips_ip_max_temp_defend; - else - maxdefense = ipst->ips_ip_max_defend; - - /* - * Now figure out how many times we've defended - * ourselves. Ignore defenses that happened long in - * the past. 
- */ - mutex_enter(&ire->ire_lock); - if ((defs = ire->ire_defense_count) > 0 && - now - ire->ire_defense_time > - ipst->ips_ip_defend_interval) { - ire->ire_defense_count = defs = 0; - } - ire->ire_defense_count++; - ire->ire_defense_time = now; - mutex_exit(&ire->ire_lock); - ill_refhold(ill); - ire_refrele(ire); - - /* - * If we've defended ourselves too many times already, - * then give up and tear down the interface(s) using - * this address. Otherwise, defend by sending out a - * gratuitous ARP. - */ - if (defs >= maxdefense && ill->ill_arp_extend) { - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, - B_FALSE); - } else { - cmn_err(CE_WARN, - "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - /* - * If this is an old (ATM) ARP module, then - * don't try to defend the address. Remain - * compatible with the old behavior. Defend - * only with new ARP. - */ - if (ill->ill_arp_extend) { - qwriter_ip(ill, q, mp, ip_arp_defend, - NEW_OP, B_FALSE); - } else { - ill_refrele(ill); - } - } - return; - } - cmn_err(CE_WARN, - "proxy ARP problem? Node '%s' is using %s on %s", - hbuf, sbuf, ill->ill_name); - if (ire != NULL) - ire_refrele(ire); - break; - case AR_CN_ANNOUNCE: - if (isv6) { - /* - * For XRESOLV interfaces. - * Delete the IRE cache entry and NCE for this - * v6 address - */ - ip_ire_clookup_and_delete_v6(&v6src, ipst); - /* - * If v6src is a non-zero, it's a router address - * as below. Do the same sort of thing to clean - * out off-net IRE_CACHE entries that go through - * the router. - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { - ire_walk_v6(ire_delete_cache_gw_v6, - (char *)&v6src, ALL_ZONES, ipst); - } - } else { - nce_hw_map_t hwm; - - /* - * ARP gives us a copy of any packet where it thinks - * the address has changed, so that we can update our - * caches. We're responsible for caching known answers - * in the current design. 
We check whether the - * hardware address really has changed in all of our - * entries that have cached this mapping, and if so, we - * blow them away. This way we will immediately pick - * up the rare case of a host changing hardware - * address. - */ - if (src == 0) - break; - hwm.hwm_addr = src; - hwm.hwm_hwlen = arh->arh_hlen; - hwm.hwm_hwaddr = (uchar_t *)(arh + 1); - NDP_HW_CHANGE_INCR(ipst->ips_ndp4); - ndp_walk_common(ipst->ips_ndp4, NULL, - (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES); - NDP_HW_CHANGE_DECR(ipst->ips_ndp4); - } - break; - case AR_CN_READY: - /* No external v6 resolver has a contract to use this */ - if (isv6) - break; - /* If the link is down, we'll retry this later */ - if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING)) - break; - ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL, - NULL, NULL, ipst); - if (ipif != NULL) { - /* - * If this is a duplicate recovery, then we now need to - * go exclusive to bring this thing back up. - */ - if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) == - IPIF_DUPLICATE) { - ipif_refrele(ipif); - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, - B_FALSE); - return; - } - /* - * If this is the first notice that this address is - * ready, then let the user know now. 
- */ - if ((ipif->ipif_flags & IPIF_UP) && - !ipif->ipif_addr_ready) { - ipif_mask_reply(ipif); - ipif_up_notify(ipif); - } - ipif->ipif_addr_ready = 1; - ipif_refrele(ipif); - } - ire = ire_cache_lookup(src, ALL_ZONES, msg_getlabel(mp), ipst); - if (ire != NULL) { - ire->ire_defense_count = 0; - ire_refrele(ire); - } - break; - case AR_CN_FAILED: - /* No external v6 resolver has a contract to use this */ - if (isv6) - break; - if (!ill->ill_arp_extend) { - (void) mac_colon_addr((uint8_t *)(arh + 1), - arh->arh_hlen, hbuf, sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - - cmn_err(CE_WARN, - "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - break; - } - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE); - return; - } - freemsg(mp); + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_REDIRECT; + icmph.icmph_code = 1; + icmph.icmph_rd_gateway = gateway; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Create a mblk suitable for carrying the interface index and/or source link - * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used - * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user - * application. + * Generate an ICMP time exceeded message. 
*/ -mblk_t * -ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, - ip_stack_t *ipst) +void +icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira) { - mblk_t *mp; - ip_pktinfo_t *pinfo; - ipha_t *ipha; - struct ether_header *pether; - boolean_t ipmp_ill_held = B_FALSE; - - mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); - if (mp == NULL) { - ip1dbg(("ip_add_info: allocation failure.\n")); - return (data_mp); - } - - ipha = (ipha_t *)data_mp->b_rptr; - pinfo = (ip_pktinfo_t *)mp->b_rptr; - bzero(pinfo, sizeof (ip_pktinfo_t)); - pinfo->ip_pkt_flags = (uchar_t)flags; - pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ - - pether = (struct ether_header *)((char *)ipha - - sizeof (struct ether_header)); - - /* - * Make sure the interface is an ethernet type, since this option - * is currently supported only on this type of interface. Also make - * sure we are pointing correctly above db_base. - */ - if ((flags & IPF_RECVSLLA) && - ((uchar_t *)pether >= data_mp->b_datap->db_base) && - (ill->ill_type == IFT_ETHER) && - (ill->ill_net_type == IRE_IF_RESOLVER)) { - pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; - bcopy(pether->ether_shost.ether_addr_octet, - pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); - } else { - /* - * Clear the bit. Indicate to upper layer that IP is not - * sending this ancillary info. - */ - pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; - } - - /* - * If `ill' is in an IPMP group, use the IPMP ill to determine - * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that - * IPF_RECVADDR support on test addresses is not needed.) - * - * Note that `ill' may already be an IPMP ill if e.g. we're - * processing a packet looped back to an IPMP data address - * (since those IRE_LOCALs are tied to IPMP ills). 
- */ - if (IS_UNDER_IPMP(ill)) { - if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) { - ip1dbg(("ip_add_info: cannot hold IPMP ill.\n")); - freemsg(mp); - return (data_mp); - } - ipmp_ill_held = B_TRUE; - } - - if (flags & (IPF_RECVIF | IPF_RECVADDR)) - pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; - if (flags & IPF_RECVADDR) { - ipif_t *ipif; - ire_t *ire; - - /* - * Only valid for V4 - */ - ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) == - (IPV4_VERSION << 4)); - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif != NULL) { - /* - * Since a decision has already been made to deliver the - * packet, there is no need to test for SECATTR and - * ZONEONLY. - * When a multicast packet is transmitted - * a cache entry is created for the multicast address. - * When delivering a copy of the packet or when new - * packets are received we do not want to match on the - * cached entry so explicitly match on - * IRE_LOCAL and IRE_LOOPBACK - */ - ire = ire_ctable_lookup(ipha->ipha_dst, 0, - IRE_LOCAL | IRE_LOOPBACK, - ipif, zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) { - /* - * packet must have come on a different - * interface. - * Since a decision has already been made to - * deliver the packet, there is no need to test - * for SECATTR and ZONEONLY. - * Only match on local and broadcast ire's. - * See detailed comment above. - */ - ire = ire_ctable_lookup(ipha->ipha_dst, 0, - IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, - NULL, MATCH_IRE_TYPE, ipst); - } - - if (ire == NULL) { - /* - * This is either a multicast packet or - * the address has been removed since - * the packet was received. - * Return INADDR_ANY so that normal source - * selection occurs for the response. 
- */ - - pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; - } else { - pinfo->ip_pkt_match_addr.s_addr = - ire->ire_src_addr; - ire_refrele(ire); - } - ipif_refrele(ipif); - } else { - pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; - } - } - - if (ipmp_ill_held) - ill_refrele(ill); + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - mp->b_datap->db_type = M_CTL; - mp->b_wptr += sizeof (ip_pktinfo_t); - mp->b_cont = data_mp; + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) + return; - return (mp); + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_TIME_EXCEEDED; + icmph.icmph_code = code; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Used to determine the most accurate cred_t to use for TX. - * First priority is SCM_UCRED having set the label in the message, - * which is used for MLP on UDP. Second priority is the open credentials - * with the peer's label (aka conn_effective_cred), which is needed for - * MLP on TCP/SCTP and for MAC-Exempt. Last priority is the open credentials. + * Generate an ICMP unreachable message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -cred_t * -ip_best_cred(mblk_t *mp, conn_t *connp, pid_t *pidp) +void +icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira) { - cred_t *cr; + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - cr = msg_getcred(mp, pidp); - if (cr != NULL && crgetlabel(cr) != NULL) - return (cr); - *pidp = NOPID; - return (CONN_CRED(connp)); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) + return; + + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_DEST_UNREACHABLE; + icmph.icmph_code = code; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as - * part of the bind request. 
+ * Latch in the IPsec state for a stream based the policy in the listener + * and the actions in the ip_recv_attr_t. + * Called directly from TCP and SCTP. */ - boolean_t -ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) +ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - - ASSERT(policy_mp != NULL); - ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); + ASSERT(lconnp->conn_policy != NULL); + ASSERT(connp->conn_policy == NULL); - ii = (ipsec_in_t *)policy_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + IPPH_REFHOLD(lconnp->conn_policy); + connp->conn_policy = lconnp->conn_policy; - connp->conn_policy = ii->ipsec_in_policy; - ii->ipsec_in_policy = NULL; - - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { if (connp->conn_latch == NULL) { connp->conn_latch = iplatch_create(); if (connp->conn_latch == NULL) return (B_FALSE); } - ipsec_latch_inbound(connp->conn_latch, ii); + ipsec_latch_inbound(connp, ira); } return (B_TRUE); } /* - * Upper level protocols (ULP) pass through bind requests to IP for inspection - * and to arrange for power-fanout assist. The ULP is identified by - * adding a single byte at the end of the original bind message. - * A ULP other than UDP or TCP that wishes to be recognized passes - * down a bind with a zero length address. - * - * The binding works as follows: - * - A zero byte address means just bind to the protocol. - * - A four byte address is treated as a request to validate - * that the address is a valid local address, appropriate for - * an application to bind to. This does not affect any fanout - * information in IP. - * - A sizeof sin_t byte address is used to bind to only the local address - * and port. - * - A sizeof ipa_conn_t byte address contains complete fanout information - * consisting of local and remote addresses and ports. 
In - * this case, the addresses are both validated as appropriate - * for this operation, and, if so, the information is retained - * for use in the inbound fanout. + * Verify whether or not the IP address is a valid local address. + * Could be a unicast, including one for a down interface. + * If allow_mcbc then a multicast or broadcast address is also + * acceptable. * - * The ULP (except in the zero-length bind) can append an - * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the - * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants - * a copy of the source or destination IRE (source for local bind; - * destination for complete bind). IPSEC_POLICY_SET indicates that the - * policy information contained should be copied on to the conn. - * - * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. - */ -mblk_t * -ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) -{ - ssize_t len; - struct T_bind_req *tbr; - sin_t *sin; - ipa_conn_t *ac; - uchar_t *ucp; - int error = 0; - int protocol; - ipa_conn_x_t *acx; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - error = EINVAL; - goto bad_addr; - } - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - - len = MBLKL(mp); - if (len < (sizeof (*tbr) + 1)) { - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_bind: bogus msg, len %ld", len); - /* XXX: Need to return something better */ - goto bad_addr; - } - /* Back up and extract the protocol identifier. */ - mp->b_wptr--; - protocol = *mp->b_wptr & 0xFF; - tbr = (struct T_bind_req *)mp->b_rptr; - /* Reset the message type in preparation for shipping it back. 
*/ - DB_TYPE(mp) = M_PCPROTO; - - connp->conn_ulp = (uint8_t)protocol; - - /* - * Check for a zero length address. This is from a protocol that - * wants to register to receive all packets of its type. - */ - if (tbr->ADDR_length == 0) { - /* - * These protocols are now intercepted in ip_bind_v6(). - * Reject protocol-level binds here for now. - * - * For SCTP raw socket, ICMP sends down a bind with sin_t - * so that the protocol type cannot be SCTP. - */ - if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || - protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { - goto bad_addr; - } - - /* - * - * The udp module never sends down a zero-length address, - * and allowing this on a labeled system will break MLP - * functionality. - */ - if (is_system_labeled() && protocol == IPPROTO_UDP) - goto bad_addr; - - if (connp->conn_mac_mode != CONN_MAC_DEFAULT) - goto bad_addr; - - /* No hash here really. The table is big enough. */ - connp->conn_srcv6 = ipv6_all_zeros; - - ipcl_proto_insert(connp, protocol); - - tbr->PRIM_type = T_BIND_ACK; - return (mp); - } - - /* Extract the address pointer from the message. */ - ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, - tbr->ADDR_length); - if (ucp == NULL) { - ip1dbg(("ip_bind: no address\n")); - goto bad_addr; - } - if (!OK_32PTR(ucp)) { - ip1dbg(("ip_bind: unaligned address\n")); - goto bad_addr; - } - - switch (tbr->ADDR_length) { - default: - ip1dbg(("ip_bind: bad address length %d\n", - (int)tbr->ADDR_length)); - goto bad_addr; - - case IP_ADDR_LEN: - /* Verification of local address only */ - error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol, - *(ipaddr_t *)ucp, 0, B_FALSE); - break; - - case sizeof (sin_t): - sin = (sin_t *)ucp; - error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol, - sin->sin_addr.s_addr, sin->sin_port, B_TRUE); - break; - - case sizeof (ipa_conn_t): - ac = (ipa_conn_t *)ucp; - /* For raw socket, the local port is not set. 
*/ - if (ac->ac_lport == 0) - ac->ac_lport = connp->conn_lport; - /* Always verify destination reachability. */ - error = ip_bind_connected_v4(connp, &mp->b_cont, protocol, - &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport, - B_TRUE, B_TRUE, cr); - break; - - case sizeof (ipa_conn_x_t): - acx = (ipa_conn_x_t *)ucp; - /* - * Whether or not to verify destination reachability depends - * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. - */ - error = ip_bind_connected_v4(connp, &mp->b_cont, protocol, - &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport, - acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport, - B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0, cr); - break; - } - ASSERT(error != EINPROGRESS); - if (error != 0) - goto bad_addr; - - /* Send it home. */ - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - return (mp); - -bad_addr: - /* - * If error = -1 then we generate a TBADADDR - otherwise error is - * a unix errno. - */ - if (error > 0) - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - else - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - return (mp); -} - -/* - * Here address is verified to be a valid local address. - * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast - * address is also considered a valid local address. * In the case of a broadcast/multicast address, however, the * upper protocol is expected to reset the src address - * to 0 if it sees a IRE_BROADCAST type returned so that + * to zero when we return IPVL_MCAST/IPVL_BCAST so that * no packets are emitted with broadcast/multicast address as * source address (that violates hosts requirements RFC 1122) * The addresses valid for bind are: @@ -4530,323 +3396,189 @@ bad_addr: * application still has to issue an * IP_ADD_MEMBERSHIP socket option. * - * On error, return -1 for TBADADDR otherwise pass the - * errno with TSYSERR reply. - * * In all the above cases, the bound address must be valid in the current zone. 
* When the address is loopback, multicast or broadcast, there might be many * matching IREs so bind has to look up based on the zone. - * - * Note: lport is in network byte order. - * */ -int -ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, - ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) +ip_laddr_t +ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t allow_mcbc) { - int error = 0; - ire_t *src_ire; - zoneid_t zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; + ire_t *src_ire; - if (mpp) - mp = *mpp; + ASSERT(src_addr != INADDR_ANY); - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } + src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL); /* - * If it was previously connected, conn_fully_bound would have - * been set. + * If an address other than in6addr_any is requested, + * we verify that it is a valid address for bind + * Note: Following code is in if-else-if form for + * readability compared to a condition check. */ - connp->conn_fully_bound = B_FALSE; - - src_ire = NULL; + if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) { + /* + * (2) Bind to address of local UP interface + */ + ire_refrele(src_ire); + return (IPVL_UNICAST_UP); + } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) { + /* + * (4) Bind to broadcast address + */ + ire_refrele(src_ire); + if (allow_mcbc) + return (IPVL_BCAST); + else + return (IPVL_BAD); + } else if (CLASSD(src_addr)) { + /* (5) bind to multicast address. 
*/ + if (src_ire != NULL) + ire_refrele(src_ire); - zoneid = IPCL_ZONEID(connp); + if (allow_mcbc) + return (IPVL_MCAST); + else + return (IPVL_BAD); + } else { + ipif_t *ipif; - if (src_addr) { - src_ire = ire_route_lookup(src_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); /* - * If an address other than 0.0.0.0 is requested, - * we verify that it is a valid address for bind - * Note: Following code is in if-else-if form for - * readability compared to a condition check. + * (3) Bind to address of local DOWN interface? + * (ipif_lookup_addr() looks up all interfaces + * but we do not get here for UP interfaces + * - case (2) above) */ - /* LINTED - statement has no consequence */ - if (IRE_IS_LOCAL(src_ire)) { - /* - * (2) Bind to address of local UP interface - */ - } else if (src_ire && src_ire->ire_type == IRE_BROADCAST) { - /* - * (4) Bind to broadcast address - * Note: permitted only from transports that - * request IRE - */ - if (!ire_requested) - error = EADDRNOTAVAIL; - } else { - /* - * (3) Bind to address of local DOWN interface - * (ipif_lookup_addr() looks up all interfaces - * but we do not get here for UP interfaces - * - case (2) above) - */ - /* LINTED - statement has no consequent */ - if (ip_addr_exists(src_addr, zoneid, ipst)) { - /* The address exists */ - } else if (CLASSD(src_addr)) { - error = 0; - if (src_ire != NULL) - ire_refrele(src_ire); - /* - * (5) bind to multicast address. - * Fake out the IRE returned to upper - * layer to be a broadcast IRE. - */ - src_ire = ire_ctable_lookup( - INADDR_BROADCAST, INADDR_ANY, - IRE_BROADCAST, NULL, zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY), - ipst); - if (src_ire == NULL || !ire_requested) - error = EADDRNOTAVAIL; - } else { - /* - * Not a valid address for bind - */ - error = EADDRNOTAVAIL; - } - } - if (error) { - /* Red Alert! Attempting to be a bogon! 
*/ - ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n", - ntohl(src_addr))); - goto bad_addr; + if (src_ire != NULL) + ire_refrele(src_ire); + + ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst); + if (ipif == NULL) + return (IPVL_BAD); + + /* Not a useful source? */ + if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) { + ipif_refrele(ipif); + return (IPVL_BAD); } + ipif_refrele(ipif); + return (IPVL_UNICAST_DOWN); } +} + +/* + * Insert in the bind fanout for IPv4 and IPv6. + * The caller should already have used ip_laddr_verify_v*() before calling + * this. + */ +int +ip_laddr_fanout_insert(conn_t *connp) +{ + int error; /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached + * Allow setting new policies. For example, disconnects result + * in us being called. As we would have set conn_policy_cached * to B_TRUE before, we should set it to B_FALSE, so that policy * can change after the disconnect. */ connp->conn_policy_cached = B_FALSE; - /* - * If not fanout_insert this was just an address verification - */ - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. 
- */ - IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6); - connp->conn_lport = lport; - connp->conn_fport = 0; - /* - * Do we need to add a check to reject Multicast packets - */ - error = ipcl_bind_insert(connp, protocol, src_addr, lport); - } - - if (error == 0) { - if (ire_requested) { - if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) { - error = -1; - /* Falls through to bad_addr */ - } - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - /* Falls through to bad_addr */ - } - } - } -bad_addr: + error = ipcl_bind_insert(connp); if (error != 0) { if (connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), - B_FALSE); + connp->conn_mlp_type, connp->conn_proto, + ntohs(connp->conn_lport), B_FALSE); } connp->conn_mlp_type = mlptSingle; } - if (src_ire != NULL) - IRE_REFRELE(src_ire); - return (error); -} - -int -ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, - ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) -{ - int error; - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - connp->conn_ulp = protocol; - - error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport, - fanout_insert); - if (error < 0) - error = -TBADADDR; return (error); } /* - * Verify that both the source and destination addresses - * are valid. If verify_dst is false, then the destination address may be - * unreachable, i.e. have no route to it. Protocols like TCP want to verify - * destination reachability, while tunnels do not. - * Note that we allow connect to broadcast and multicast - * addresses when ire_requested is set. Thus the ULP - * has to check for IRE_BROADCAST and multicast. + * Verify that both the source and destination addresses are valid. If + * IPDF_VERIFY_DST is not set, then the destination address may be unreachable, + * i.e. 
have no route to it. Protocols like TCP want to verify destination + * reachability, while tunnels do not. * - * Returns zero if ok. - * On error: returns -1 to mean TBADADDR otherwise returns an errno - * (for use with TSYSERR reply). + * Determine the route, the interface, and (optionally) the source address + * to use to reach a given destination. + * Note that we allow connect to broadcast and multicast addresses when + * IPDF_ALLOW_MCBC is set. + * first_hop and dst_addr are normally the same, but if source routing + * they will differ; in that case the first_hop is what we'll use for the + * routing lookup but the dce and label checks will be done on dst_addr, * - * Note: lport and fport are in network byte order. + * If uinfo is set, then we fill in the best available information + * we have for the destination. This is based on (in priority order) any + * metrics and path MTU stored in a dce_t, route metrics, and finally the + * ill_mtu. + * + * Tsol note: If we have a source route then dst_addr != firsthop. But we + * always do the label check on dst_addr. 
*/ int -ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, - ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, - boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr) +ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop, + ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode) { - - ire_t *src_ire; - ire_t *dst_ire; + ire_t *ire = NULL; int error = 0; - ire_t *sire = NULL; - ire_t *md_dst_ire = NULL; - ire_t *lso_dst_ire = NULL; + ipaddr_t setsrc; /* RTF_SETSRC */ + zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */ + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + uint_t pmtu; + uint_t generation; + nce_t *nce; ill_t *ill = NULL; - zoneid_t zoneid; - ipaddr_t src_addr = *src_addrp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; - ts_label_t *tsl = NULL; - cred_t *effective_cred = NULL; - - if (mpp) - mp = *mpp; - - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } + boolean_t multirt = B_FALSE; - src_ire = dst_ire = NULL; + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); /* - * If we never got a disconnect before, clear it now. + * We never send to zero; the ULPs map it to the loopback address. + * We can't allow it since we use zero to mean unitialized in some + * places. */ - connp->conn_fully_bound = B_FALSE; + ASSERT(dst_addr != INADDR_ANY); - zoneid = IPCL_ZONEID(connp); - - /* - * Check whether Trusted Solaris policy allows communication with this - * host, and pretend that the destination is unreachable if not. - * - * This is never a problem for TCP, since that transport is known to - * compute the label properly as part of the tcp_rput_other T_BIND_ACK - * handling. If the remote is unreachable, it will be detected at that - * point, so there's no reason to check it here. 
- * - * Note that for sendto (and other datagram-oriented friends), this - * check is done as part of the data path label computation instead. - * The check here is just to make non-TCP connect() report the right - * error. - */ - if (is_system_labeled() && !IPCL_IS_TCP(connp)) { - if ((error = tsol_check_dest(cr, &dst_addr, IPV4_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) { - if (ip_debug > 2) { - pr_addr_dbg( - "ip_bind_connected_v4:" - " no label for dst %s\n", - AF_INET, &dst_addr); - } - goto bad_addr; - } + if (is_system_labeled()) { + ts_label_t *tsl = NULL; - /* - * tsol_check_dest() may have created a new cred with - * a modified security label. Use that cred if it exists - * for ire lookups. - */ - if (effective_cred == NULL) { - tsl = crgetlabel(cr); - } else { - tsl = crgetlabel(effective_cred); + error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION, + mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl); + if (error != 0) + return (error); + if (tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, tsl); } } - if (CLASSD(dst_addr)) { - /* Pick up an IRE_BROADCAST */ - dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, - NULL, zoneid, tsl, - (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | - MATCH_IRE_SECATTR), ipst); - } else { - /* - * If conn_dontroute is set or if conn_nexthop_set is set, - * and onlink ipif is not found set ENETUNREACH error. - */ - if (connp->conn_dontroute || connp->conn_nexthop_set) { - ipif_t *ipif; - - ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? - dst_addr : connp->conn_nexthop_v4, zoneid, ipst); - if (ipif == NULL) { - error = ENETUNREACH; - goto bad_addr; - } - ipif_refrele(ipif); - } + setsrc = INADDR_ANY; + /* + * Select a route; For IPMP interfaces, we would only select + * a "hidden" route (i.e., going through a specific under_ill) + * if ixa_ifindex has been specified. 
+ */ + ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) + goto bad_addr; - if (connp->conn_nexthop_set) { - dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, - 0, 0, NULL, NULL, zoneid, tsl, - MATCH_IRE_SECATTR, ipst); - } else { - dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, - &sire, zoneid, tsl, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | - MATCH_IRE_SECATTR), ipst); - } - } /* - * dst_ire can't be a broadcast when not ire_requested. - * We also prevent ire's with src address INADDR_ANY to - * be used, which are created temporarily for - * sending out packets from endpoints that have - * conn_unspec_src set. If verify_dst is true, the destination must be - * reachable. If verify_dst is false, the destination needn't be - * reachable. + * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set. + * If IPDF_VERIFY_DST is set, the destination must be reachable; + * Otherwise the destination needn't be reachable. * * If we match on a reject or black hole, then we've got a * local failure. May as well fail out the connect() attempt, * since it's never going to succeed. */ - if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || - (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || - ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { /* * If we're verifying destination reachability, we always want * to complain here. @@ -4854,425 +3586,435 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * If we're not verifying destination reachability but the * destination has a route, we still want to fail on the * temporary address and broadcast address tests. + * + * In both cases do we let the code continue so some reasonable + * information is returned to the caller. 
That enables the + * caller to use (and even cache) the IRE. conn_ip_ouput will + * use the generation mismatch path to check for the unreachable + * case thereby avoiding any specific check in the main path. */ - if (verify_dst || (dst_ire != NULL)) { - if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected_v4:" - "bad connected dst %s\n", - AF_INET, &dst_addr); - } - if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) + ASSERT(generation == IRE_GENERATION_VERIFY); + if (flags & IPDF_VERIFY_DST) { + /* + * Set errno but continue to set up ixa_ire to be + * the RTF_REJECT|RTF_BLACKHOLE IRE. + * That allows callers to use ip_output to get an + * ICMP error back. + */ + if (!(ire->ire_type & IRE_HOST)) error = ENETUNREACH; else error = EHOSTUNREACH; - goto bad_addr; + } + } + + if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) && + !(flags & IPDF_ALLOW_MCBC)) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + generation = IRE_GENERATION_VERIFY; + error = ENETUNREACH; + } + + /* Cache things */ + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = generation; + + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v4 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Get an nce to cache. */ + nce = ire_to_nce(ire, firsthop, NULL); + if (nce == NULL) { + /* Allocation failure? 
*/ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } else { + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; } } /* - * If the app does a connect(), it means that it will most likely - * send more than 1 packet to the destination. It makes sense - * to clear the temporary flag. + * We use use ire_nexthop_ill to avoid the under ipmp + * interface for source address selection. Note that for ipmp + * probe packets, ixa_ifindex would have been specified, and + * the ip_select_route() invocation would have picked an ire + * will ire_ill pointing at an under interface. */ - if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb_t *irb = dst_ire->ire_bucket; + ill = ire_nexthop_ill(ire); - rw_enter(&irb->irb_lock, RW_WRITER); + /* + * If the source address is a loopback address, the + * destination had best be local or multicast. + * If we are sending to an IRE_LOCAL using a loopback source then + * it had better be the same zoneid. + */ + if (*src_addrp == htonl(INADDR_LOOPBACK)) { + if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + } + if (ire->ire_type & IRE_BROADCAST) { /* - * We need to recheck for IRE_MARK_TEMPORARY after acquiring - * the lock to guarantee irb_tmp_ire_cnt. + * If the ULP didn't have a specified source, then we + * make sure we reselect the source when sending + * broadcasts out different interfaces. 
*/ - if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { - dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + if (flags & IPDF_SELECT_SRC) + ixa->ixa_flags |= IXAF_SET_SOURCE; + else + ixa->ixa_flags &= ~IXAF_SET_SOURCE; } /* - * See if we should notify ULP about LSO/MDT; we do this whether or not - * ire_requested is TRUE, in order to handle active connects; LSO/MDT - * eligibility tests for passive connects are handled separately - * through tcp_adapt_ire(). We do this before the source address - * selection, because dst_ire may change after a call to - * ipif_select_source(). This is a best-effort check, as the - * packet for this connection may not actually go through - * dst_ire->ire_stq, and the exact IRE can only be known after - * calling ip_newroute(). This is why we further check on the - * IRE during LSO/Multidata packet transmission in - * tcp_lsosend()/tcp_multisend(). + * Does the caller want us to pick a source address? */ - if (!ipsec_policy_set && dst_ire != NULL && - !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && - (ill = ire_to_ill(dst_ire), ill != NULL)) { - if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { - lso_dst_ire = dst_ire; - IRE_REFHOLD(lso_dst_ire); - } else if (ipst->ips_ip_multidata_outbound && - ILL_MDT_CAPABLE(ill)) { - md_dst_ire = dst_ire; - IRE_REFHOLD(md_dst_ire); + if (flags & IPDF_SELECT_SRC) { + ipaddr_t src_addr; + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src_addr = htonl(INADDR_LOOPBACK); + /* Make sure we look for a better source address */ + generation = SRC_GENERATION_VERIFY; + } else { + error = ip_select_source_v4(ill, setsrc, dst_addr, + ixa->ixa_multicast_ifaddr, zoneid, + ipst, &src_addr, &generation, NULL); + if (error != 0) { + ire = NULL; /* Stored in ixa_ire */ + goto bad_addr; + } } - } - if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL && - dst_ire->ire_zoneid != zoneid && 
dst_ire->ire_zoneid != ALL_ZONES) { /* - * If the IRE belongs to a different zone, look for a matching - * route in the forwarding table and use the source address from - * that route. + * We allow the source address to to down. + * However, we check that we don't use the loopback address + * as a source when sending out on the wire. */ - src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - if (src_ire == NULL) { - error = EHOSTUNREACH; - goto bad_addr; - } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - if (!(src_ire->ire_type & IRE_HOST)) - error = ENETUNREACH; - else - error = EHOSTUNREACH; + if ((src_addr == htonl(INADDR_LOOPBACK)) && + !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) && + !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; goto bad_addr; } - if (src_addr == INADDR_ANY) - src_addr = src_ire->ire_src_addr; - ire_refrele(src_ire); - src_ire = NULL; - } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - src_addr = sire->ire_src_addr; - ire_refrele(dst_ire); - dst_ire = sire; - sire = NULL; - } else { - /* - * Pick a source address so that a proper inbound - * load spreading would happen. - */ - ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; - ipif_t *src_ipif = NULL; - ire_t *ipif_ire; - /* - * Supply a local source address such that inbound - * load spreading happens. - * - * Determine the best source address on this ill for - * the destination. - * - * 1) For broadcast, we should return a broadcast ire - * found above so that upper layers know that the - * destination address is a broadcast address. - * - * 2) If the ipif is DEPRECATED, select a better - * source address. Similarly, if the ipif is on - * the IPMP meta-interface, pick a source address - * at random to improve inbound load spreading. 
- * - * 3) If the outgoing interface is part of a usesrc - * group, then try selecting a source address from - * the usesrc ILL. - */ - if ((dst_ire->ire_zoneid != zoneid && - dst_ire->ire_zoneid != ALL_ZONES) || - (!(dst_ire->ire_flags & RTF_SETSRC)) && - (!(dst_ire->ire_type & IRE_BROADCAST) && - (IS_IPMP(ire_ill) || - (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (ire_ill->ill_usesrc_ifindex != 0)))) { - /* - * If the destination is reachable via a - * given gateway, the selected source address - * should be in the same subnet as the gateway. - * Otherwise, the destination is not reachable. - * - * If there are no interfaces on the same subnet - * as the destination, ipif_select_source gives - * first non-deprecated interface which might be - * on a different subnet than the gateway. - * This is not desirable. Hence pass the dst_ire - * source address to ipif_select_source. - * It is sure that the destination is reachable - * with the dst_ire source address subnet. - * So passing dst_ire source address to - * ipif_select_source will make sure that the - * selected source will be on the same subnet - * as dst_ire source address. 
- */ - ipaddr_t saddr = - dst_ire->ire_ipif->ipif_src_addr; - src_ipif = ipif_select_source(ire_ill, - saddr, zoneid); - if (src_ipif != NULL) { - if (IS_VNI(src_ipif->ipif_ill)) { - /* - * For VNI there is no - * interface route - */ - src_addr = - src_ipif->ipif_src_addr; - } else { - ipif_ire = - ipif_to_ire(src_ipif); - if (ipif_ire != NULL) { - IRE_REFRELE(dst_ire); - dst_ire = ipif_ire; - } - src_addr = - dst_ire->ire_src_addr; - } - ipif_refrele(src_ipif); - } else { - src_addr = dst_ire->ire_src_addr; - } - } else { - src_addr = dst_ire->ire_src_addr; - } - } + *src_addrp = src_addr; + ixa->ixa_src_generation = generation; } + if (flags & IPDF_UNIQUE_DCE) { + /* Fallback to the default dce if allocation fails */ + dce = dce_lookup_and_add_v4(dst_addr, ipst); + if (dce != NULL) + generation = dce->dce_generation; + else + dce = dce_lookup_v4(dst_addr, ipst, &generation); + } else { + dce = dce_lookup_v4(dst_addr, ipst, &generation); + } + ASSERT(dce != NULL); + if (ixa->ixa_dce != NULL) + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = generation; + /* - * We do ire_route_lookup() here (and not - * interface lookup as we assert that - * src_addr should only come from an - * UP interface for hard binding. + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. 
*/ - ASSERT(src_ire == NULL); - src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, - NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - /* src_ire must be a local|loopback */ - if (!IRE_IS_LOCAL(src_ire)) { - if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected_v4: bad connected " - "src %s\n", AF_INET, &src_addr); - } - error = EADDRNOTAVAIL; - goto bad_addr; + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } /* - * If the source address is a loopback address, the - * destination had best be local or multicast. - * The transports that can't handle multicast will reject - * those addresses. + * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired. + * However, we can't do it for IPv4 multicast or broadcast. */ - if (src_ire->ire_type == IRE_LOOPBACK && - !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { - ip1dbg(("ip_bind_connected_v4: bad connected loopback\n")); - error = -1; - goto bad_addr; - } + if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. + * Set initial value for fragmentation limit. Either conn_ip_output + * or ULP might updates it when there are routing changes. + * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT. */ - connp->conn_policy_cached = B_FALSE; + pmtu = ip_get_pmtu(ixa); + ixa->ixa_fragsize = pmtu; + /* Make sure ixa_fragsize and ixa_pmtu remain identical */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) + ixa->ixa_pmtu = pmtu; /* - * Set the conn addresses/ports immediately, so the IPsec policy calls - * can handle their passed-in conn's. + * Extract information useful for some transports. + * First we look for DCE metrics. 
Then we take what we have in + * the metrics in the route, where the offlink is used if we have + * one. */ + if (uinfo != NULL) { + bzero(uinfo, sizeof (*uinfo)); - IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); - connp->conn_lport = lport; - connp->conn_fport = fport; - *src_addrp = src_addr; + if (dce->dce_flags & DCEF_UINFO) + *uinfo = dce->dce_uinfo; - ASSERT(!(ipsec_policy_set && ire_requested)); - if (ire_requested) { - iulp_t *ulp_info = NULL; + rts_merge_metrics(uinfo, &ire->ire_metrics); - /* - * Note that sire will not be NULL if this is an off-link - * connection and there is not cache for that dest yet. - * - * XXX Because of an existing bug, if there are multiple - * default routes, the IRE returned now may not be the actual - * default route used (default routes are chosen in a - * round robin fashion). So if the metrics for different - * default routes are different, we may return the wrong - * metrics. This will not be a problem if the existing - * bug is fixed. - */ - if (sire != NULL) { - ulp_info = &(sire->ire_uinfo); - } - if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) { - error = -1; - goto bad_addr; - } - mp = *mpp; - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - goto bad_addr; - } + /* Allow ire_metrics to decrease the path MTU from above */ + if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu) + uinfo->iulp_mtu = pmtu; + + uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0; + uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0; + uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0; } - /* - * Cache IPsec policy in this conn. If we have per-socket policy, - * we'll cache that. If we don't, we'll inherit global policy. - * - * We can't insert until the conn reflects the policy. Note that - * conn_policy_cached is set by ipsec_conn_cache_policy() even for - * connections where we don't have a policy. 
This is to prevent - * global policy lookups in the inbound path. - * - * If we insert before we set conn_policy_cached, - * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true - * because global policy cound be non-empty. We normally call - * ipsec_check_policy() for conn_policy_cached connections only if - * ipc_in_enforce_policy is set. But in this case, - * conn_policy_cached can get set anytime since we made the - * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is - * called, which will make the above assumption false. Thus, we - * need to insert after we set conn_policy_cached. - */ - if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) - goto bad_addr; + if (ill != NULL) + ill_refrele(ill); - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. - */ - error = ipcl_conn_insert(connp, protocol, src_addr, - dst_addr, connp->conn_ports); - } + return (error); - if (error == 0) { - connp->conn_fully_bound = B_TRUE; - /* - * Our initial checks for LSO/MDT have passed; the IRE is not - * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to - * be supporting LSO/MDT. Pass the IRE, IPC and ILL into - * ip_xxinfo_return(), which performs further checks - * against them and upon success, returns the LSO/MDT info - * mblk which we will attach to the bind acknowledgment. 
- */ - if (lso_dst_ire != NULL) { - mblk_t *lsoinfo_mp; - - ASSERT(ill->ill_lso_capab != NULL); - if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp, - ill->ill_name, ill->ill_lso_capab)) != NULL) { - if (mp == NULL) { - *mpp = lsoinfo_mp; - } else { - linkb(mp, lsoinfo_mp); - } - } - } else if (md_dst_ire != NULL) { - mblk_t *mdinfo_mp; - - ASSERT(ill->ill_mdt_capab != NULL); - if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - ill->ill_name, ill->ill_mdt_capab)) != NULL) { - if (mp == NULL) { - *mpp = mdinfo_mp; - } else { - linkb(mp, mdinfo_mp); - } - } - } - } bad_addr: - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); - /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. - */ - *mpp = NULL; + if (ire != NULL) + ire_refrele(ire); + + if (ill != NULL) + ill_refrele(ill); + + /* + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. + */ + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } - if (src_ire != NULL) - IRE_REFRELE(src_ire); - if (dst_ire != NULL) - IRE_REFRELE(dst_ire); - if (sire != NULL) - IRE_REFRELE(sire); - if (md_dst_ire != NULL) - IRE_REFRELE(md_dst_ire); - if (lso_dst_ire != NULL) - IRE_REFRELE(lso_dst_ire); - if (effective_cred != NULL) - crfree(effective_cred); + return (error); } -int -ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, - ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, - boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr) + +/* + * Get the base MTU for the case when path MTU discovery is not used. + * Takes the MTU of the IRE into account. 
+ */ +uint_t +ip_get_base_mtu(ill_t *ill, ire_t *ire) { - int error; - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - connp->conn_ulp = protocol; - - /* For raw socket, the local port is not set. */ - if (lport == 0) - lport = connp->conn_lport; - error = ip_bind_connected_v4(connp, ire_mpp, protocol, - src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst, cr); - if (error < 0) - error = -TBADADDR; - return (error); + uint_t mtu = ill->ill_mtu; + uint_t iremtu = ire->ire_metrics.iulp_mtu; + + if (iremtu != 0 && iremtu < mtu) + mtu = iremtu; + + return (mtu); } /* - * Get the ire in *mpp. Returns false if it fails (due to lack of space). - * Prefers dst_ire over src_ire. + * Get the PMTU for the attributes. Handles both IPv4 and IPv6. + * Assumes that ixa_ire, dce, and nce have already been set up. + * + * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired. + * We avoid path MTU discovery if it is disabled with ndd. + * Furtermore, if the path MTU is too small, then we don't set DF for IPv4. + * + * NOTE: We also used to turn it off for source routed packets. That + * is no longer required since the dce is per final destination. */ -static boolean_t -ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) +uint_t +ip_get_pmtu(ip_xmit_attr_t *ixa) { - mblk_t *mp = *mpp; - ire_t *ret_ire; + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + nce_t *nce; + ire_t *ire; + uint_t pmtu; - ASSERT(mp != NULL); + ire = ixa->ixa_ire; + dce = ixa->ixa_dce; + nce = ixa->ixa_nce; - if (ire != NULL) { - /* - * mp initialized above to IRE_DB_REQ_TYPE - * appended mblk. Its <upper protocol>'s - * job to make sure there is room. - */ - if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) - return (B_FALSE); + /* + * If path MTU discovery has been turned off by ndd, then we ignore + * any dce_pmtu and for IPv4 we will not set DF. 
+ */ + if (!ipst->ips_ip_path_mtu_discovery) + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; - mp->b_datap->db_type = IRE_DB_TYPE; - mp->b_wptr = mp->b_rptr + sizeof (ire_t); - bcopy(ire, mp->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp->b_rptr; + pmtu = IP_MAXPACKET; + /* + * Decide whether whether IPv4 sets DF + * For IPv6 "no DF" means to use the 1280 mtu + */ + if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) { + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; + } else { + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) + pmtu = IPV6_MIN_MTU; + } + + /* Check if the PMTU is to old before we use it */ + if ((dce->dce_flags & DCEF_PMTU) && + TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { /* - * Pass the latest setting of the ip_path_mtu_discovery and - * copy the ulp info if any. + * Older than 20 minutes. Drop the path MTU information. */ - ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? - IPH_DF : 0; - if (ulp_info != NULL) { - bcopy(ulp_info, &(ret_ire->ire_uinfo), - sizeof (iulp_t)); + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + } + + /* The metrics on the route can lower the path MTU */ + if (ire->ire_metrics.iulp_mtu != 0 && + ire->ire_metrics.iulp_mtu < pmtu) + pmtu = ire->ire_metrics.iulp_mtu; + + /* + * If the path MTU is smaller than some minimum, we still use dce_pmtu + * above (would be 576 for IPv4 and 1280 for IPv6), but we clear + * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4. 
+ */ + if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) { + if (dce->dce_flags & DCEF_PMTU) { + if (dce->dce_pmtu < pmtu) + pmtu = dce->dce_pmtu; + + if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) { + ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } else { + ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; + } + } else { + ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; } - ret_ire->ire_mp = mp; - } else { + } + + /* + * If we have an IRE_LOCAL we use the loopback mtu instead of + * the ill for going out the wire i.e., IRE_LOCAL gets the same + * mtu as IRE_LOOPBACK. + */ + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { + uint_t loopback_mtu; + + loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ? + ip_loopback_mtu_v6plus : ip_loopback_mtuplus; + + if (loopback_mtu < pmtu) + pmtu = loopback_mtu; + } else if (nce != NULL) { /* - * No IRE was found. Remove IRE mblk. + * Make sure we don't exceed the interface MTU. + * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have + * an ill. We'd use the above IP_MAXPACKET in that case just + * to tell the transport something larger than zero. */ - *mpp = mp->b_cont; - freeb(mp); + if (nce->nce_common->ncec_ill->ill_mtu < pmtu) + pmtu = nce->nce_common->ncec_ill->ill_mtu; + if (nce->nce_common->ncec_ill != nce->nce_ill && + nce->nce_ill->ill_mtu < pmtu) { + /* + * for interfaces in an IPMP group, the mtu of + * the nce_ill (under_ill) could be different + * from the mtu of the ncec_ill, so we take the + * min of the two. + */ + pmtu = nce->nce_ill->ill_mtu; + } } - return (B_TRUE); + + /* + * Handle the IPV6_USE_MIN_MTU socket option or ancillary data. + * Only applies to IPv6. 
+ */ + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) { + if (ixa->ixa_flags & IXAF_USE_MIN_MTU) { + switch (ixa->ixa_use_min_mtu) { + case IPV6_USE_MIN_MTU_MULTICAST: + if (ire->ire_type & IRE_MULTICAST) + pmtu = IPV6_MIN_MTU; + break; + case IPV6_USE_MIN_MTU_ALWAYS: + pmtu = IPV6_MIN_MTU; + break; + case IPV6_USE_MIN_MTU_NEVER: + break; + } + } else { + /* Default is IPV6_USE_MIN_MTU_MULTICAST */ + if (ire->ire_type & IRE_MULTICAST) + pmtu = IPV6_MIN_MTU; + } + } + + /* + * After receiving an ICMPv6 "packet too big" message with a + * MTU < 1280, and for multirouted IPv6 packets, the IP layer + * will insert a 8-byte fragment header in every packet. We compensate + * for those cases by returning a smaller path MTU to the ULP. + * + * In the case of CGTP then ip_output will add a fragment header. + * Make sure there is room for it by telling a smaller number + * to the transport. + * + * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here + * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu() + * which is the size of the packets it can send. + */ + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) { + if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) || + (ire->ire_flags & RTF_MULTIRT) || + (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) { + pmtu -= sizeof (ip6_frag_t); + ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR; + } + } + + return (pmtu); } /* @@ -5386,6 +4128,7 @@ ip_modclose(ill_t *ill) queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; int i; + arl_ill_common_t *ai = ill->ill_common; /* * The punlink prior to this may have initiated a capability @@ -5452,6 +4195,7 @@ ip_modclose(ill_t *ill) mutex_enter(&ill->ill_lock); while (!ill_is_freeable(ill)) cv_wait(&ill->ill_cv, &ill->ill_lock); + while (ill->ill_waiters) cv_wait(&ill->ill_cv, &ill->ill_lock); @@ -5466,12 +4210,16 @@ ip_modclose(ill_t *ill) /* qprocsoff is done via ill_delete_tail */ ill_delete_tail(ill); + /* + * synchronously wait for arp stream to unbind. 
After this, we + * cannot get any data packets up from the driver. + */ + arp_unbind_complete(ill); ASSERT(ill->ill_ipst == NULL); /* - * Walk through all upper (conn) streams and qenable - * those that have queued data. - * close synchronization needs this to + * Walk through all conns and qenable those that have queued data. + * Close synchronization needs this to * be done to ensure that all upper layers blocked * due to flow control to the closing device * get unblocked. @@ -5481,6 +4229,25 @@ ip_modclose(ill_t *ill) conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]); } + /* + * ai can be null if this is an IPv6 ill, or if the IPv4 + * stream is being torn down before ARP was plumbed (e.g., + * /sbin/ifconfig plumbing a stream twice, and encountering + * an error + */ + if (ai != NULL) { + ASSERT(!ill->ill_isv6); + mutex_enter(&ai->ai_lock); + ai->ai_ill = NULL; + if (ai->ai_arl == NULL) { + mutex_destroy(&ai->ai_lock); + kmem_free(ai, sizeof (*ai)); + } else { + cv_signal(&ai->ai_ill_unplumb_done); + mutex_exit(&ai->ai_lock); + } + } + mutex_enter(&ipst->ips_ip_mi_lock); mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); mutex_exit(&ipst->ips_ip_mi_lock); @@ -5492,6 +4259,12 @@ ip_modclose(ill_t *ill) if (ill->ill_credp != NULL) crfree(ill->ill_credp); + mutex_destroy(&ill->ill_saved_ire_lock); + mutex_destroy(&ill->ill_lock); + rw_destroy(&ill->ill_mcast_lock); + mutex_destroy(&ill->ill_mcast_serializer); + list_destroy(&ill->ill_nce); + /* * Now we are done with the module close pieces that * need the netstack_t. @@ -5525,11 +4298,8 @@ ip_quiesce_conn(conn_t *connp) * Mark the conn as closing, and this conn must not be * inserted in future into any list. Eg. conn_drain_insert(), * won't insert this conn into the conn_drain_list. - * Similarly ill_pending_mp_add() will not add any mp to - * the pending mp list, after this conn has started closing. * - * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg - * cannot get set henceforth. 
+ * conn_idl, and conn_ilg cannot get set henceforth. */ mutex_enter(&connp->conn_lock); ASSERT(!(connp->conn_state_flags & CONN_QUIESCED)); @@ -5541,9 +4311,10 @@ ip_quiesce_conn(conn_t *connp) if (connp->conn_dhcpinit_ill != NULL) { ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0); atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit); + ill_set_inputfn(connp->conn_dhcpinit_ill); connp->conn_dhcpinit_ill = NULL; } - if (connp->conn_ilg_inuse != 0) + if (connp->conn_ilg != NULL) ilg_cleanup_reqd = B_TRUE; mutex_exit(&connp->conn_lock); @@ -5552,7 +4323,7 @@ ip_quiesce_conn(conn_t *connp) if (is_system_labeled() && connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, + connp->conn_mlp_type, connp->conn_proto, ntohs(connp->conn_lport), B_FALSE); connp->conn_anon_port = 0; } @@ -5568,21 +4339,22 @@ ip_quiesce_conn(conn_t *connp) /* * Remove this conn from the drain list, and do * any other cleanup that may be required. - * (Only non-tcp streams may have a non-null conn_idl. - * TCP streams are never flow controlled, and + * (Only non-tcp conns may have a non-null conn_idl. + * TCP conns are never flow controlled, and * conn_idl will be null) */ - if (drain_cleanup_reqd) + if (drain_cleanup_reqd && connp->conn_idl != NULL) { + mutex_enter(&connp->conn_idl->idl_lock); conn_drain_tail(connp, B_TRUE); + mutex_exit(&connp->conn_idl->idl_lock); + } if (connp == ipst->ips_ip_g_mrouter) - (void) ip_mrouter_done(NULL, ipst); + (void) ip_mrouter_done(ipst); if (ilg_cleanup_reqd) ilg_delete_all(connp); - conn_delete_ire(connp, NULL); - /* * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED. * callers from write side can't be there now because close @@ -5603,8 +4375,6 @@ ip_close(queue_t *q, int flags) { conn_t *connp; - TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q); - /* * Call the appropriate delete routine depending on whether this is * a module or device. 
@@ -5646,13 +4416,21 @@ ip_close(queue_t *q, int flags) */ /*ARGSUSED2*/ static void -ip_conn_input(void *arg1, mblk_t *mp, void *arg2) +ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg1; putnext(connp->conn_rq, mp); } +/* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */ +/* ARGSUSED */ +static void +ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + freemsg(mp); +} + /* * Called when the module is about to be unloaded */ @@ -5667,6 +4445,7 @@ ip_ddi_destroy(void) sctp_ddi_g_destroy(); tcp_ddi_g_destroy(); ilb_ddi_g_destroy(); + dce_g_destroy(); ipsec_policy_g_destroy(); ipcl_g_destroy(); ip_net_g_destroy(); @@ -5709,16 +4488,12 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) */ ipv4_hook_shutdown(ipst); ipv6_hook_shutdown(ipst); + arp_hook_shutdown(ipst); mutex_enter(&ipst->ips_capab_taskq_lock); ipst->ips_capab_taskq_quit = B_TRUE; cv_signal(&ipst->ips_capab_taskq_cv); mutex_exit(&ipst->ips_capab_taskq_lock); - - mutex_enter(&ipst->ips_mrt_lock); - ipst->ips_mrt_flags |= IP_MRT_STOP; - cv_signal(&ipst->ips_mrt_cv); - mutex_exit(&ipst->ips_mrt_lock); } /* @@ -5741,18 +4516,12 @@ ip_stack_fini(netstackid_t stackid, void *arg) ipobs_fini(ipst); ipv4_hook_destroy(ipst); ipv6_hook_destroy(ipst); + arp_hook_destroy(ipst); ip_net_destroy(ipst); mutex_destroy(&ipst->ips_capab_taskq_lock); cv_destroy(&ipst->ips_capab_taskq_cv); - mutex_enter(&ipst->ips_mrt_lock); - while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) - cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); - mutex_destroy(&ipst->ips_mrt_lock); - cv_destroy(&ipst->ips_mrt_cv); - cv_destroy(&ipst->ips_mrt_done_cv); - ipmp_destroy(ipst); rw_destroy(&ipst->ips_srcid_lock); @@ -5773,10 +4542,10 @@ ip_stack_fini(netstackid_t stackid, void *arg) kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); ipst->ips_ndp_arr = NULL; + dce_stack_destroy(ipst); ip_mrouter_stack_destroy(ipst); 
mutex_destroy(&ipst->ips_ip_mi_lock); - rw_destroy(&ipst->ips_ipsec_capab_ills_lock); rw_destroy(&ipst->ips_ill_g_usesrc_lock); rw_destroy(&ipst->ips_ip_g_nd_lock); @@ -5808,13 +4577,6 @@ ip_stack_fini(netstackid_t stackid, void *arg) ASSERT(ipst->ips_mld_slowtimeout_id != 0); ipst->ips_mld_slowtimeout_id = 0; } - ret = untimeout(ipst->ips_ip_ire_expire_id); - if (ret == -1) { - ASSERT(ipst->ips_ip_ire_expire_id == 0); - } else { - ASSERT(ipst->ips_ip_ire_expire_id != 0); - ipst->ips_ip_ire_expire_id = 0; - } mutex_destroy(&ipst->ips_igmp_timer_lock); mutex_destroy(&ipst->ips_mld_timer_lock); @@ -5915,6 +4677,10 @@ ip_ddi_init(void) list_create(&ip_thread_list, sizeof (th_hash_t), offsetof(th_hash_t, thh_link)); #endif + ipsec_policy_g_init(); + tcp_ddi_g_init(); + sctp_ddi_g_init(); + dce_g_init(); /* * We want to be informed each time a stack is created or @@ -5924,10 +4690,6 @@ ip_ddi_init(void) netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown, ip_stack_fini); - ipsec_policy_g_init(); - tcp_ddi_g_init(); - sctp_ddi_g_init(); - tnet_init(); udp_ddi_g_init(); @@ -5973,7 +4735,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL); - rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); ipcl_init(ipst); @@ -5982,6 +4743,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipif_init(ipst); conn_drain_init(ipst); ip_mrouter_stack_init(ipst); + dce_stack_init(ipst); ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT; ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; @@ -6026,9 +4788,12 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipst->ips_ip_src_id = 1; rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); + ipst->ips_src_generation = SRC_GENERATION_INITIAL; + ip_net_init(ipst, ns); 
ipv4_hook_init(ipst); ipv6_hook_init(ipst); + arp_hook_init(ipst); ipmp_init(ipst); ipobs_init(ipst); @@ -6040,15 +4805,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL); - /* - * Create the mcast_restart_timers_thread() worker thread. - */ - mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); - cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); - ipst->ips_mrt_thread = thread_create(NULL, 0, - mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); - major = mod_name_to_major(INET_NAME); (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); @@ -6161,37 +4917,26 @@ mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) } /* - * Send an ICMP error after patching up the packet appropriately. Returns - * non-zero if the appropriate MIB should be bumped; zero otherwise. + * Called when it is conceptually a ULP that would sent the packet + * e.g., port unreachable and protocol unreachable. Check that the packet + * would have passed the IPsec global policy before sending the error. + * + * Send an ICMP error after patching up the packet appropriately. + * Uses ip_drop_input and bumps the appropriate MIB. 
*/ -static boolean_t -ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, - uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, - zoneid_t zoneid, ip_stack_t *ipst) +void +ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code, + ip_recv_attr_t *ira) { - ipha_t *ipha; - mblk_t *first_mp; - boolean_t secure; - unsigned char db_type; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ipha_t *ipha; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - /* - * If this is an ICMP error being reported - which goes - * up as M_CTLs, we need to convert them to M_DATA till - * we finish checking with global policy because - * ipsec_check_global_policy() assumes M_DATA as clear - * and M_CTL as secure. - */ - db_type = DB_TYPE(mp); - DB_TYPE(mp) = M_DATA; - secure = B_FALSE; - } /* * We are generating an icmp error for some inbound packet. * Called from all ip_fanout_(udp, tcp, proto) functions. 
@@ -6201,47 +4946,52 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, */ ipha = (ipha_t *)mp->b_rptr; if (secure || ipss->ipsec_inbound_v4_policy_present) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return (B_FALSE); + mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns); + if (mp == NULL) + return; } - if (!mctl_present) - DB_TYPE(mp) = db_type; + /* We never send errors for protocols that we do implement */ + if (ira->ira_protocol == IPPROTO_ICMP || + ira->ira_protocol == IPPROTO_IGMP) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_fanout_send_icmp_v4", mp, ill); + freemsg(mp); + return; + } + /* + * Have to correct checksum since + * the packet might have been + * fragmented and the reassembly code in ip_rput + * does not restore the IP checksum. + */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - if (flags & IP_FF_SEND_ICMP) { - if (flags & IP_FF_HDR_COMPLETE) { - if (ip_hdr_complete(ipha, zoneid, ipst)) { - freemsg(first_mp); - return (B_TRUE); - } - } - if (flags & IP_FF_CKSUM) { - /* - * Have to correct checksum since - * the packet might have been - * fragmented and the reassembly code in ip_rput - * does not restore the IP checksum. 
- */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - } - switch (icmp_type) { - case ICMP_DEST_UNREACHABLE: - icmp_unreachable(WR(q), first_mp, icmp_code, zoneid, - ipst); + switch (icmp_type) { + case ICMP_DEST_UNREACHABLE: + switch (icmp_code) { + case ICMP_PROTOCOL_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); + ip_drop_input("ipIfStatsInUnknownProtos", mp, ill); break; - default: - freemsg(first_mp); + case ICMP_PORT_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); + ip_drop_input("ipIfStatsNoPorts", mp, ill); break; } - } else { - freemsg(first_mp); - return (B_FALSE); - } - return (B_TRUE); + icmp_unreachable(mp, icmp_code, ira); + break; + default: +#ifdef DEBUG + panic("ip_fanout_send_icmp_v4: wrong type"); + /*NOTREACHED*/ +#else + freemsg(mp); + break; +#endif + } } /* @@ -6250,66 +5000,86 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, * is consumed by this function. */ void -ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid, - ip_stack_t *ipst) +ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp; - ipha_t *ipha; - ill_t *ill; - ipsec_in_t *ii; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ipha_t *ipha; - mp = ipsec_mp->b_cont; - ipsec_mp->b_cont = NULL; ipha = (ipha_t *)mp->b_rptr; - /* Get ill from index in ipsec_in_t. 
*/ - ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, - (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL, - ipst); - if (ill != NULL) { - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - if (ip_fanout_send_icmp(q, mp, flags, - ICMP_DEST_UNREACHABLE, - ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInUnknownProtos); - } - } else { - if (ip_fanout_send_icmp_v6(q, mp, flags, - ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, - 0, B_FALSE, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInUnknownProtos); - } - } - ill_refrele(ill); - } else { /* re-link for the freemsg() below. */ - ipsec_mp->b_cont = mp; + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION); + ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE, + ICMP_PROTOCOL_UNREACHABLE, ira); + } else { + ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); + ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB, + ICMP6_PARAMPROB_NEXTHEADER, ira); } - - /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ - freemsg(ipsec_mp); } /* - * See if the inbound datagram has had IPsec processing applied to it. + * Deliver a rawip packet to the given conn, possibly applying ipsec policy. + * Handles IPv4 and IPv6. + * We are responsible for disposing of mp, such as by freemsg() or putnext() + * Caller is responsible for dropping references to the conn. */ -boolean_t -ipsec_in_is_secure(mblk_t *ipsec_mp) +void +ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + ip_recv_attr_t *ira) { - ipsec_in_t *ii; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + boolean_t secure; + uint_t protocol = ira->ira_protocol; + iaflags_t iraflags = ira->ira_flags; + queue_t *rq; + + secure = iraflags & IRAF_IPSEC_SECURE; + + rq = connp->conn_rq; + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : !canputnext(rq)) { + switch (protocol) { + case IPPROTO_ICMPV6: + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows); + break; + case IPPROTO_ICMP: + BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); + break; + default: + BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); + break; + } + freemsg(mp); + return; + } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ASSERT(!(IPCL_IS_IPTUN(connp))); - if (ii->ipsec_in_loopback) { - return (ii->ipsec_in_secure); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + return; + } + } + + if (iraflags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); } else { - return (ii->ipsec_in_ah_sa != NULL || - ii->ipsec_in_esp_sa != NULL || - ii->ipsec_in_decaps); + ill_t *rill = ira->ira_rill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + /* Send it upstream */ + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } } @@ -6336,65 +5106,33 @@ ipsec_in_is_secure(mblk_t *ipsec_mp) * is used to negotiate SAs as SAs will be added only after * verifying the policy. * - * IPQoS Notes: - * Once we have determined the client, invoke IPPF processing. - * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, - * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local - * ip_policy will be false. - * * Zones notes: - * Currently only applications in the global zone can create raw sockets for - * protocols other than ICMP. 
So unlike the broadcast / multicast case of - * ip_fanout_udp(), we only send a copy of the packet to streams in the - * specified zone. For ICMP, this is handled by the callers of icmp_inbound(). + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. + * This means that here we can match the zoneid with SO_ALLZONES being special. */ -static void -ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, - boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, - zoneid_t zoneid) +void +ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { - queue_t *rq; - mblk_t *mp1, *first_mp1; - uint_t protocol = ipha->ipha_protocol; - ipaddr_t dst; - mblk_t *first_mp = mp; - boolean_t secure; - uint32_t ill_index; - conn_t *connp, *first_connp, *next_connp; - connf_t *connfp; - boolean_t shared_addr; - mib2_ipIfStatsEntry_t *mibptr; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + mblk_t *mp1; + ipaddr_t laddr; + conn_t *connp, *first_connp, *next_connp; + connf_t *connfp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - dst = ipha->ipha_dst; - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * We don't allow multilevel ports for raw IP, so no need to - * check for that here. 
- */ - zoneid = tsol_packet_to_zoneid(mp); - } + laddr = ipha->ipha_dst; - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags, - zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) { + /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */ + if (IPCL_PROTO_MATCH(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) { break; } } @@ -6406,40 +5144,12 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, * unclaimed datagrams? */ mutex_exit(&connfp->connf_lock); - /* - * Check for IPPROTO_ENCAP... - */ - if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { - /* - * If an IPsec mblk is here on a multicast - * tunnel (using ip_mroute stuff), check policy here, - * THEN ship off to ip_mroute_decap(). - * - * BTW, If I match a configured IP-in-IP - * tunnel, this path will not be reached, and - * ip_mroute_decap will never be called. - */ - first_mp = ipsec_check_global_policy(first_mp, connp, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp != NULL) { - if (mctl_present) - freeb(first_mp); - ip_mroute_decap(q, mp, ill); - } /* Else we already freed everything! */ - } else { - /* - * Otherwise send an ICMP protocol unreachable. 
- */ - if (ip_fanout_send_icmp(q, first_mp, flags, - ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE, - mctl_present, zoneid, ipst)) { - BUMP_MIB(mibptr, ipIfStatsInUnknownProtos); - } - } + ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE, + ICMP_PROTOCOL_UNREACHABLE, ira); return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); CONN_INC_REF(connp); first_connp = connp; @@ -6447,111 +5157,35 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, for (;;) { while (connp != NULL) { - if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */ + if (IPCL_PROTO_MATCH(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) break; connp = connp->conn_next; } - /* - * Copy the packet. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ + if (connp == NULL) { + /* No more interested clients */ connp = first_connp; break; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + connp = first_connp; + break; + } + CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; - /* - * Check flow control - */ - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(mibptr, rawipIfStatsInOverflows); - } else { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); - } + ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL, + ira); - freemsg(first_mp1); - } else { - /* - * Enforce policy like any other conn_t. Note that - * IP-in-IP packets don't come through here, but - * through ip_iptun_input() or - * icmp_inbound_iptun_fanout(). IPsec policy for such - * packets is enforced in the iptun module. - */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || - secure) { - first_mp1 = ipsec_check_inbound_policy - (first_mp1, connp, ipha, NULL, - mctl_present); - } - if (first_mp1 != NULL) { - int in_flags = 0; - /* - * ip_fanout_proto also gets called from - * icmp_inbound_error_fanout, in which case - * the msg type is M_CTL. Don't add info - * in this case for the time being. In future - * when there is a need for knowing the - * inbound iface index for ICMP error msgs, - * then this can be changed. - */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; - /* - * The ULP may support IP_RECVPKTINFO for both - * IP v4 and v6 so pass the appropriate argument - * based on conn IP version. - */ - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + - * matching address. 
- */ - in_flags |= IPF_RECVADDR; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { - /* - * the actual data will be - * contained in b_cont upon - * successful return of the - * following call else - * original mblk is returned - */ - ASSERT(recv_ill != NULL); - mp1 = ip_add_info(mp1, recv_ill, - in_flags, IPCL_ZONEID(connp), ipst); - } - BUMP_MIB(mibptr, ipIfStatsHCInDelivers); - if (mctl_present) - freeb(first_mp1); - (connp->conn_recv)(connp, mp1, NULL); - } - } mutex_enter(&connfp->connf_lock); /* Follow the next pointer before releasing the conn. */ next_connp = connp->conn_next; @@ -6562,363 +5196,27 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, /* Last one. Send it upstream. */ mutex_exit(&connfp->connf_lock); - /* - * If this packet is coming from icmp_inbound_error_fanout ip_policy - * will be set to false. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; - } - } - - rq = connp->conn_rq; - /* - * Check flow control - */ - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(mibptr, rawipIfStatsInOverflows); - } else { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); - } - - freemsg(first_mp); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); - - if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - } - - if (first_mp != NULL) { - int in_flags = 0; - - /* - * ip_fanout_proto also gets called - * from icmp_inbound_error_fanout, in - * which case the msg type is M_CTL. - * Don't add info in this case for time - * being. 
In future when there is a - * need for knowing the inbound iface - * index for ICMP error msgs, then this - * can be changed - */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + - * matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { + ip_fanout_proto_conn(connp, mp, ipha, NULL, ira); - /* - * the actual data will be contained in - * b_cont upon successful return - * of the following call else original - * mblk is returned - */ - ASSERT(recv_ill != NULL); - mp = ip_add_info(mp, recv_ill, - in_flags, IPCL_ZONEID(connp), ipst); - } - BUMP_MIB(mibptr, ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp, NULL); - if (mctl_present) - freeb(first_mp); - } - } CONN_DEC_REF(connp); } /* - * Serialize tcp resets by calling tcp_xmit_reset_serialize through - * SQUEUE_ENTER_ONE(SQ_FILL). We do this to ensure the reset is handled on - * the correct squeue, in this case the same squeue as a valid listener with - * no current connection state for the packet we are processing. The function - * is called for synchronizing both IPv4 and IPv6. - */ -void -ip_xmit_reset_serialize(mblk_t *mp, int hdrlen, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) -{ - mblk_t *rst_mp; - tcp_xmit_reset_event_t *eventp; - - rst_mp = allocb(sizeof (tcp_xmit_reset_event_t), BPRI_HI); - - if (rst_mp == NULL) { - freemsg(mp); - return; - } - - rst_mp->b_datap->db_type = M_PROTO; - rst_mp->b_wptr += sizeof (tcp_xmit_reset_event_t); - - eventp = (tcp_xmit_reset_event_t *)rst_mp->b_rptr; - eventp->tcp_xre_event = TCP_XRE_EVENT_IP_FANOUT_TCP; - eventp->tcp_xre_iphdrlen = hdrlen; - eventp->tcp_xre_zoneid = zoneid; - eventp->tcp_xre_tcps = tcps; - - rst_mp->b_cont = mp; - mp = rst_mp; - - /* - * Increment the connref, this ref will be released by the squeue - * framework. 
- */ - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_xmit_reset, connp, - SQ_FILL, SQTAG_XMIT_EARLY_RESET); -} - -/* - * Fanout for TCP packets - * The caller puts <fport, lport> in the ports parameter. - * - * IPQoS Notes - * Before sending it to the client, invoke IPPF processing. - * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, - * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local - * ip_policy is false. - */ -static void -ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, - uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) -{ - mblk_t *first_mp; - boolean_t secure; - uint32_t ill_index; - int ip_hdr_len; - tcph_t *tcph; - boolean_t syn_present = B_FALSE; - conn_t *connp; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - ASSERT(recv_ill != NULL); - - first_mp = mp; - if (mctl_present) { - ASSERT(first_mp->b_datap->db_type == M_CTL); - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); - - if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, - zoneid, ipst)) == NULL) { - /* - * No connected connection or listener. Send a - * TH_RST via tcp_xmit_listeners_reset. - */ - - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) - return; - } - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", - zoneid)); - tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - return; - } - - /* - * Allocate the SYN for the TCP connection here itself - */ - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - squeue_t *sqp; - - /* - * If the queue belongs to a conn, and fused tcp - * loopback is enabled, assign the eager's squeue - * to be that of the active connect's. Note that - * we don't check for IP_FF_LOOPBACK here since this - * routine gets called only for loopback (unlike the - * IPv6 counterpart). - */ - if (do_tcp_fusion && - CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && - !secure && - !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy) { - ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); - sqp = Q_TO_CONN(q)->conn_sqp; - } else { - sqp = IP_SQUEUE_GET(lbolt); - } - - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - return; - } - - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = 
ipsec_check_inbound_policy(first_mp, connp, ipha, - NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= - ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. - */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } - - /* - * Initiate policy processing here if needed. If we get here from - * icmp_inbound_error_fanout, ip_policy is false. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - - /* Handle socket options. */ - if (!syn_present && - connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { - /* Add header */ - ASSERT(recv_ill != NULL); - /* - * Since tcp does not support IP_RECVPKTINFO for V4, only pass - * IPF_RECVIF. - */ - mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), - ipst); - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - /* do not drain, certain use cases can blow the stack */ - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, - connp, SQ_NODRAIN, SQTAG_IP_FANOUT_TCP); - } else { - /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); - } -} - -/* * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or - * pass it along to ESP if the SPI is non-zero. Returns TRUE if the mblk + * pass it along to ESP if the SPI is non-zero. Returns the mblk if the mblk * is not consumed. * - * One of four things can happen, all of which affect the passed-in mblk: - * - * 1.) ICMP messages that go through here just get returned TRUE. + * One of three things can happen, all of which affect the passed-in mblk: * - * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. + * 1.) The packet is stock UDP and gets its zero-SPI stripped. Return mblk.. * - * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent - * ESP packet, and is passed along to ESP for consumption. Return FALSE. + * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent + * ESP packet, and is passed along to ESP for consumption. Return NULL. * - * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. + * 3.) The packet is an ESP-in-UDP Keepalive. Drop it and return NULL. */ -static boolean_t -zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, - ipsec_stack_t *ipss) +mblk_t * +zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira) { int shift, plen, iph_len; ipha_t *ipha; @@ -6926,28 +5224,12 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, uint32_t *spi; uint32_t esp_ports; uint8_t *orptr; - boolean_t free_ire; - - if (DB_TYPE(mp) == M_CTL) { - /* - * ICMP message with UDP inside. 
Don't bother stripping, just - * send it up. - * - * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going - * to ignore errors set by ICMP anyway ('cause they might be - * forged), but that's the app's decision, not ours. - */ - - /* Bunch of reality checks for DEBUG kernels... */ - ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); - ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); - - return (B_TRUE); - } + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; ipha = (ipha_t *)mp->b_rptr; - iph_len = IPH_HDR_LENGTH(ipha); - plen = ntohs(ipha->ipha_length); + iph_len = ira->ira_ip_hdr_length; + plen = ira->ira_pktlen; if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { /* @@ -6958,18 +5240,18 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, * byte packets (keepalives are 1-byte), but we'll drop them * also. */ - ip_drop_packet(mp, B_TRUE, recv_ill, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); - return (B_FALSE); + return (NULL); } if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { /* might as well pull it all up - it might be ESP. */ if (!pullupmsg(mp, -1)) { - ip_drop_packet(mp, B_TRUE, recv_ill, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &ipss->ipsec_dropper); - return (B_FALSE); + return (NULL); } ipha = (ipha_t *)mp->b_rptr; @@ -6985,7 +5267,8 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, } /* Fix IP header */ - ipha->ipha_length = htons(plen - shift); + ira->ira_pktlen = (plen - shift); + ipha->ipha_length = htons(ira->ira_pktlen); ipha->ipha_hdr_checksum = 0; orptr = mp->b_rptr; @@ -7005,388 +5288,185 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, if (esp_ports != 0) /* Punt up for ESP processing. */ { ipha = (ipha_t *)(orptr + shift); - free_ire = (ire == NULL); - if (free_ire) { - /* Re-acquire ire. 
*/ - ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, - ipss->ipsec_netstack->netstack_ip); - if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { - if (ire != NULL) - ire_refrele(ire); - /* - * Do a regular freemsg(), as this is an IP - * error (no local route) not an IPsec one. - */ - freemsg(mp); - } - } - - ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports); - if (free_ire) - ire_refrele(ire); + ira->ira_flags |= IRAF_ESP_UDP_PORTS; + ira->ira_esp_udp_ports = esp_ports; + ip_fanout_v4(mp, ipha, ira); + return (NULL); } - - return (esp_ports == 0); + return (mp); } /* * Deliver a udp packet to the given conn, possibly applying ipsec policy. + * Handles IPv4 and IPv6. * We are responsible for disposing of mp, such as by freemsg() or putnext() - * Caller is responsible for dropping references to the conn, and freeing - * first_mp. - * - * IPQoS Notes - * Before sending it to the client, invoke IPPF processing. Policy processing - * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and - * ip_policy is true. If we get here from icmp_inbound_error_fanout or - * ip_wput_local, ip_policy is false. + * Caller is responsible for dropping references to the conn. 
*/ -static void -ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, - boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, - boolean_t ip_policy) +void +ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + ip_recv_attr_t *ira) { - boolean_t mctl_present = (first_mp != NULL); - uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ - uint32_t ill_index; - ip_stack_t *ipst = recv_ill->ill_ipst; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + boolean_t secure; + iaflags_t iraflags = ira->ira_flags; - ASSERT(ill != NULL); + secure = iraflags & IRAF_IPSEC_SECURE; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { + if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(first_mp); + freemsg(mp); return; } - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, - NULL, mctl_present); - /* Freed by ipsec_check_inbound_policy(). */ - if (first_mp == NULL) { + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); return; } } - if (mctl_present) - freeb(first_mp); - - /* Let's hope the compilers utter "branch, predict-not-taken..." ;) */ - if (connp->conn_udp->udp_nat_t_endpoint) { - if (mctl_present) { - /* mctl_present *shouldn't* happen. 
*/ - ip_drop_packet(mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_esp_nat_t_ipsec), - &ipss->ipsec_dropper); - return; - } - - if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) - return; - } - /* Handle options. */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; /* - * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag - * passed to ip_add_info is based on IP version of connp. + * Since this code is not used for UDP unicast we don't need a NAT_T + * check. Only ip_fanout_v4 has that check. */ - if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - - if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) - in_flags |= IPF_RECVSLLA; + if (ira->ira_flags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); + } else { + ill_t *rill = ira->ira_rill; - /* - * Initiate IPPF processing here, if needed. Note first_mp won't be - * freed if the packet is dropped. The caller will do so. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - return; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { - /* - * The actual data will be contained in b_cont - * upon successful return of the following call - * else original mblk is returned - */ - ASSERT(recv_ill != NULL); - mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), - ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + /* Send it upstream */ + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); } /* - * Fanout for UDP packets. 
- * The caller puts <fport, lport> in the ports parameter. + * Fanout for UDP packets that are multicast or broadcast, and ICMP errors. + * (Unicast fanout is handled in ip_input_v4.) * * If SO_REUSEADDR is set all multicast and broadcast packets - * will be delivered to all streams bound to the same port. + * will be delivered to all conns bound to the same port. * - * Zones notes: - * Multicast and broadcast packets will be distributed to streams in all zones. + * If there is at least one matching AF_INET receiver, then we will + * ignore any AF_INET6 receivers. * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 - * packets. To maintain this behavior with multiple zones, the conns are grouped - * by zone and the SO_REUSEADDR flag is checked for the first matching conn in - * each zone. If unset, all the following conns in the same zone are skipped. + * packets. + * + * Zones notes: + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. + * This means that here we can match the zoneid with SO_ALLZONES being special. 
*/ -static void -ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, - uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, - boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) +void +ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport, + ip_recv_attr_t *ira) { - uint32_t dstport, srcport; - ipaddr_t dst; - mblk_t *first_mp; - boolean_t secure; - in6_addr_t v6src; + ipaddr_t laddr; + in6_addr_t v6faddr; conn_t *connp; connf_t *connfp; - conn_t *first_connp; - conn_t *next_connp; - mblk_t *mp1, *first_mp1; - ipaddr_t src; - zoneid_t last_zoneid; - boolean_t reuseaddr; - boolean_t shared_addr; - boolean_t unlabeled; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - first_mp->b_cont = NULL; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - first_mp = NULL; - secure = B_FALSE; - } + ipaddr_t faddr; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); - dst = ipha->ipha_dst; - src = ipha->ipha_src; + ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR)); - unlabeled = B_FALSE; - if (is_system_labeled()) - /* Cred cannot be null on IPv4 */ - unlabeled = (msg_getlabel(mp)->tsl_flags & - TSLF_UNLABELED) != 0; - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. 
Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } + laddr = ipha->ipha_dst; + faddr = ipha->ipha_src; - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!broadcast && !CLASSD(dst)) { - /* - * Not broadcast or multicast. Send to the one (first) - * client we find. No need to check conn_wantpacket() - * since IP_BOUND_IF/conn_incoming_ill does not apply to - * IPv4 unicast packets. - */ - while ((connp != NULL) && - (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) || - (!IPCL_ZONE_MATCH(connp, zoneid) && - !(unlabeled && (connp->conn_mac_mode != CONN_MAC_DEFAULT) && - shared_addr)))) { - /* - * We keep searching since the conn did not match, - * or its zone did not match and it is not either - * an allzones conn or a mac exempt conn (if the - * sender is unlabeled.) - */ - connp = connp->conn_next; - } - - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) - goto notfound; - - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - - if (is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp)) - goto notfound; - - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, - flags, recv_ill, ip_policy); - IP_STAT(ipst, ip_udp_fannorm); - CONN_DEC_REF(connp); - return; - } /* - * Broadcast and multicast case - * - * Need to check conn_wantpacket(). * If SO_REUSEADDR has been set on the first we send the * packet to all clients that have joined the group and * match the port. 
*/ - while (connp != NULL) { - if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) + if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) + if (connp == NULL) goto notfound; - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + CONN_INC_REF(connp); - first_connp = connp; - /* - * When SO_REUSEADDR is not set, send the packet only to the first - * matching connection in its zone by keeping track of the zoneid. - */ - reuseaddr = first_connp->conn_reuseaddr; - last_zoneid = first_connp->conn_zoneid; + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && - (reuseaddr || connp->conn_zoneid != last_zoneid) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH(connp, lport, laddr, + fport, faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. 
- */ - if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && - ((mp1 = copymsg(mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ - connp = first_connp; - break; - } - if (connp->conn_zoneid != last_zoneid) { - /* - * Update the zoneid so that the packet isn't sent to - * any more conns in the same zone unless SO_REUSEADDR - * is set. - */ - reuseaddr = connp->conn_reuseaddr; - last_zoneid = connp->conn_zoneid; - } - if (first_mp != NULL) { - ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> - ipsec_info_type == IPSEC_IN); - first_mp1 = ipsec_in_tag(first_mp, NULL, - ipst->ips_netstack); - if (first_mp1 == NULL) { - freemsg(mp1); + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - } else { - first_mp1 = NULL; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); + + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr, + NULL, ira); + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn */ + next_connp = connp->conn_next; + CONN_DEC_REF(connp); + connp = next_connp; } - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * IPQoS notes: We don't send the packet for policy - * processing here, will do it for the last one (below). - * i.e. we do it per-packet now, but if we do policy - * processing per-conn, then we would need to do it - * here too. - */ - ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, - ipha, flags, recv_ill, B_FALSE); - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. */ - next_connp = connp->conn_next; - IP_STAT(ipst, ip_udp_fanmb); - CONN_DEC_REF(connp); - connp = next_connp; } /* Last one. Send it upstream. 
*/ mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, - recv_ill, ip_policy); IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp, ipha, NULL, ira); CONN_DEC_REF(connp); return; notfound: - mutex_exit(&connfp->connf_lock); - IP_STAT(ipst, ip_udp_fanothers); /* - * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses + * IPv6 endpoints bound to multicast IPv4-mapped addresses * have already been matched above, since they live in the IPv4 * fanout tables. This implies we only need to * check for IPv6 in6addr_any endpoints here. @@ -7394,85 +5474,28 @@ notfound: * address, except for the multicast group membership lookup which * uses the IPv4 destination. */ - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr); + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!broadcast && !CLASSD(dst)) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, - srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - !connp->conn_ipv6_v6only) - break; - connp = connp->conn_next; - } - - if (connp != NULL && is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp)) - connp = NULL; - - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { - /* - * No one bound to this port. Is - * there a client that wants all - * unclaimed datagrams? - */ - mutex_exit(&connfp->connf_lock); - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP]. 
- connf_head != NULL) { - ip_fanout_proto(q, first_mp, ill, ipha, - flags | IP_FF_RAWIP, mctl_present, - ip_policy, recv_ill, zoneid); - } else { - if (ip_fanout_send_icmp(q, first_mp, flags, - ICMP_DEST_UNREACHABLE, - ICMP_PORT_UNREACHABLE, - mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsNoPorts); - } - } - return; - } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, - flags, recv_ill, ip_policy); - CONN_DEC_REF(connp); - return; - } /* * IPv4 multicast packet being delivered to an AF_INET6 * in6addr_any endpoint. * Need to check conn_wantpacket(). Note that we use conn_wantpacket() * and not conn_wantpacket_v6() since any multicast membership is * for an IPv4-mapped multicast address. - * The packet is sent to all clients in all zones that have joined the - * group and match the port. */ while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, - srcport, v6src) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) + if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros, + fport, v6faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { + if (connp == NULL) { /* * No one bound to this port. 
Is * there a client that wants all @@ -7480,15 +5503,10 @@ notfound: */ mutex_exit(&connfp->connf_lock); - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head != + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head != NULL) { - ip_fanout_proto(q, first_mp, ill, ipha, - flags | IP_FF_RAWIP, mctl_present, ip_policy, - recv_ill, zoneid); + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v4(mp, ipha, ira); } else { /* * We used to attempt to send an icmp error here, but @@ -7497,102 +5515,263 @@ notfound: * multicast, just drop the packet and give up sooner. */ BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); - freemsg(first_mp); + freemsg(mp); } return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - first_connp = connp; + /* + * If SO_REUSEADDR has been set on the first we send the + * packet to all clients that have joined the group and + * match the port. + */ + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, - ipv6_all_zeros, srcport, v6src) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + CONN_INC_REF(connp); + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH_V6(connp, lport, + ipv6_all_zeros, fport, v6faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. 
The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && - ((mp1 = copymsg(mp)) == NULL))) { - /* - * No more intested clients or memory - * allocation failed - */ - connp = first_connp; - break; - } - if (first_mp != NULL) { - ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> - ipsec_info_type == IPSEC_IN); - first_mp1 = ipsec_in_tag(first_mp, NULL, - ipst->ips_netstack); - if (first_mp1 == NULL) { - freemsg(mp1); + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - } else { - first_mp1 = NULL; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); + + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr, + NULL, ira); + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn */ + next_connp = connp->conn_next; + CONN_DEC_REF(connp); + connp = next_connp; } - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * IPQoS notes: We don't send the packet for policy - * processing here, will do it for the last one (below). - * i.e. we do it per-packet now, but if we do policy - * processing per-conn, then we would need to do it - * here too. - */ - ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, - ipha, flags, recv_ill, B_FALSE); - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. */ - next_connp = connp->conn_next; - CONN_DEC_REF(connp); - connp = next_connp; } /* Last one. Send it upstream. 
*/ mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, - recv_ill, ip_policy); + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp, ipha, NULL, ira); CONN_DEC_REF(connp); } /* - * Complete the ip_wput header so that it - * is possible to generate ICMP - * errors. + * Split an incoming packet's IPv4 options into the label and the other options. + * If 'allocate' is set it does memory allocation for the ip_pkt_t, including + * clearing out any leftover label or options. + * Otherwise it just makes ipp point into the packet. + * + * Returns zero if ok; ENOMEM if the buffer couldn't be allocated. */ int -ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) +ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate) { - ire_t *ire; + uchar_t *opt; + uint32_t totallen; + uint32_t optval; + uint32_t optlen; - if (ipha->ipha_src == INADDR_ANY) { - ire = ire_lookup_local(zoneid, ipst); - if (ire == NULL) { - ip1dbg(("ip_hdr_complete: no source IRE\n")); - return (1); + ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR; + ipp->ipp_hoplimit = ipha->ipha_ttl; + ipp->ipp_type_of_service = ipha->ipha_type_of_service; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr); + + /* + * Get length (in 4 byte octets) of IP header options. 
+ */ + totallen = ipha->ipha_version_and_hdr_length - + (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); + + if (totallen == 0) { + if (!allocate) + return (0); + + /* Clear out anything from a previous packet */ + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + kmem_free(ipp->ipp_ipv4_options, + ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; } - ipha->ipha_src = ire->ire_addr; - ire_refrele(ire); + if (ipp->ipp_fields & IPPF_LABEL_V4) { + kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } + return (0); } - ipha->ipha_ttl = ipst->ips_ip_def_ttl; - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - return (0); + + totallen <<= 2; + opt = (uchar_t *)&ipha[1]; + if (!is_system_labeled()) { + + copyall: + if (!allocate) { + if (totallen != 0) { + ipp->ipp_ipv4_options = opt; + ipp->ipp_ipv4_options_len = totallen; + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + } + return (0); + } + /* Just copy all of options */ + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + if (totallen == ipp->ipp_ipv4_options_len) { + bcopy(opt, ipp->ipp_ipv4_options, totallen); + return (0); + } + kmem_free(ipp->ipp_ipv4_options, + ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; + } + if (totallen == 0) + return (0); + + ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP); + if (ipp->ipp_ipv4_options == NULL) + return (ENOMEM); + ipp->ipp_ipv4_options_len = totallen; + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + bcopy(opt, ipp->ipp_ipv4_options, totallen); + return (0); + } + + if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) { + kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } + + /* + * Search 
for CIPSO option. + * We assume CIPSO is first in options if it is present. + * If it isn't, then ipp_opt_ipv4_options will not include the options + * prior to the CIPSO option. + */ + while (totallen != 0) { + switch (optval = opt[IPOPT_OPTVAL]) { + case IPOPT_EOL: + return (0); + case IPOPT_NOP: + optlen = 1; + break; + default: + if (totallen <= IPOPT_OLEN) + return (EINVAL); + optlen = opt[IPOPT_OLEN]; + if (optlen < 2) + return (EINVAL); + } + if (optlen > totallen) + return (EINVAL); + + switch (optval) { + case IPOPT_COMSEC: + if (!allocate) { + ipp->ipp_label_v4 = opt; + ipp->ipp_label_len_v4 = optlen; + ipp->ipp_fields |= IPPF_LABEL_V4; + } else { + ipp->ipp_label_v4 = kmem_alloc(optlen, + KM_NOSLEEP); + if (ipp->ipp_label_v4 == NULL) + return (ENOMEM); + ipp->ipp_label_len_v4 = optlen; + ipp->ipp_fields |= IPPF_LABEL_V4; + bcopy(opt, ipp->ipp_label_v4, optlen); + } + totallen -= optlen; + opt += optlen; + + /* Skip padding bytes until we get to a multiple of 4 */ + while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) { + totallen--; + opt++; + } + /* Remaining as ipp_ipv4_options */ + goto copyall; + } + totallen -= optlen; + opt += optlen; + } + /* No CIPSO found; return everything as ipp_ipv4_options */ + totallen = ipha->ipha_version_and_hdr_length - + (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); + totallen <<= 2; + opt = (uchar_t *)&ipha[1]; + goto copyall; +} + +/* + * Efficient versions of lookup for an IRE when we only + * match the address. + * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE. + * Does not handle multicast addresses. 
+ */ +uint_t +ip_type_v4(ipaddr_t addr, ip_stack_t *ipst) +{ + ire_t *ire; + uint_t result; + + ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + result = IRE_NOROUTE; + else + result = ire->ire_type; + ire_refrele(ire); + return (result); +} + +/* + * Efficient versions of lookup for an IRE when we only + * match the address. + * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE. + * Does not handle multicast addresses. + */ +uint_t +ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst) +{ + ire_t *ire; + uint_t result; + + ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + result = IRE_NOROUTE; + else + result = ire->ire_type; + ire_refrele(ire); + return (result); } /* @@ -7602,8 +5781,6 @@ ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) static void ip_lrput(queue_t *q, mblk_t *mp) { - mblk_t *mp1; - switch (mp->b_datap->db_type) { case M_FLUSH: /* Turn around */ @@ -7614,9 +5791,6 @@ ip_lrput(queue_t *q, mblk_t *mp) } break; } - /* Could receive messages that passed through ar_rput */ - for (mp1 = mp; mp1; mp1 = mp1->b_cont) - mp1->b_prev = mp1->b_next = NULL; freemsg(mp); } @@ -7631,7 +5805,7 @@ ip_lwput(queue_t *q, mblk_t *mp) /* * Move the first hop in any source route to ipha_dst and remove that part of * the source route. Called by other protocols. Errors in option formatting - * are ignored - will be handled by ip_wput_options Return the final + * are ignored - will be handled by ip_output_options. Return the final * destination (either ipha_dst or the last entry in a source route.) 
*/ ipaddr_t @@ -7643,7 +5817,6 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns) uint8_t optlen; ipaddr_t dst; int i; - ire_t *ire; ip_stack_t *ipst = ns->netstack_ip; ip2dbg(("ip_massage_options\n")); @@ -7679,10 +5852,7 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns) * XXX verify per-interface ip_forwarding * for source route? */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; goto redo_srr; } @@ -7760,1843 +5930,41 @@ ip_net_mask(ipaddr_t addr) return ((ipaddr_t)0); } -/* - * Helper ill lookup function used by IPsec. - */ -ill_t * -ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) +/* Name/Value Table Lookup Routine */ +char * +ip_nv_lookup(nv_t *nv, int value) { - ill_t *ret_ill; - - ASSERT(ifindex != 0); - - ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); - if (ret_ill == NULL) { - if (isv6) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", - ifindex)); - } else { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", - ifindex)); - } - freemsg(first_mp); + if (!nv) return (NULL); + for (; nv->nv_name; nv++) { + if (nv->nv_value == value) + return (nv->nv_name); } - return (ret_ill); -} - -/* - * IPv4 - - * ip_newroute is called by ip_rput or ip_wput whenever we need to send - * out a packet to a destination address for which we do not have specific - * (or sufficient) routing information. - * - * NOTE : These are the scopes of some of the variables that point at IRE, - * which needs to be followed while making any future modifications - * to avoid memory leaks. - * - * - ire and sire are the entries looked up initially by - * ire_ftable_lookup. - * - ipif_ire is used to hold the interface ire associated with - * the new cache ire. 
But it's scope is limited, so we always REFRELE - * it before branching out to error paths. - * - save_ire is initialized before ire_create, so that ire returned - * by ire_create will not over-write the ire. We REFRELE save_ire - * before breaking out of the switch. - * - * Thus on failures, we have to REFRELE only ire and sire, if they - * are not NULL. - */ -void -ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, - zoneid_t zoneid, ip_stack_t *ipst) -{ - areq_t *areq; - ipaddr_t gw = 0; - ire_t *ire = NULL; - mblk_t *res_mp; - ipaddr_t *addrp; - ipaddr_t nexthop_addr; - ipif_t *src_ipif = NULL; - ill_t *dst_ill = NULL; - ipha_t *ipha; - ire_t *sire = NULL; - mblk_t *first_mp; - ire_t *save_ire; - ushort_t ire_marks = 0; - boolean_t mctl_present; - ipsec_out_t *io; - mblk_t *saved_mp; - mblk_t *copy_mp = NULL; - mblk_t *xmit_mp = NULL; - ipaddr_t save_dst; - uint32_t multirt_flags = - MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; - boolean_t multirt_is_resolvable; - boolean_t multirt_resolve_next; - boolean_t unspec_src; - boolean_t ip_nexthop = B_FALSE; - tsol_ire_gw_secattr_t *attrp = NULL; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - int multirt_res_failures = 0; - int multirt_res_attempts = 0; - int multirt_already_resolved = 0; - boolean_t multirt_no_icmp_error = B_FALSE; - - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); - } - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - } - - ipha = (ipha_t *)mp->b_rptr; - - /* All multicast lookups come through ip_newroute_ipif() */ - if (CLASSD(dst)) { - ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", - ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); - freemsg(first_mp); - return; - } - - if (mctl_present && io->ipsec_out_ip_nexthop) { - ip_nexthop 
= B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; - } - /* - * If this IRE is created for forwarding or it is not for - * traffic for congestion controlled protocols, mark it as temporary. - */ - if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) - ire_marks |= IRE_MARK_TEMPORARY; - - /* - * Get what we can from ire_ftable_lookup which will follow an IRE - * chain until it gets the most specific information available. - * For example, we know that there is no IRE_CACHE for this dest, - * but there may be an IRE_OFFSUBNET which specifies a gateway. - * ire_ftable_lookup will look up the gateway, etc. - * Otherwise, given ire_ftable_lookup algorithm, only one among routes - * to the destination, of equal netmask length in the forward table, - * will be recursively explored. If no information is available - * for the final gateway of that route, we force the returned ire - * to be equal to sire using MATCH_IRE_PARENT. - * At least, in this case we have a starting point (in the buckets) - * to look for other routes to the destination in the forward table. - * This is actually used only for multirouting, where a list - * of routes has to be processed in sequence. - * - * In the process of coming up with the most specific information, - * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry - * for the gateway (i.e., one for which the ire_nce->nce_state is - * not yet ND_REACHABLE, and is in the middle of arp resolution). - * Two caveats when handling incomplete ire's in ip_newroute: - * - we should be careful when accessing its ire_nce (specifically - * the nce_res_mp) ast it might change underneath our feet, and, - * - not all legacy code path callers are prepared to handle - * incomplete ire's, so we should not create/add incomplete - * ire_cache entries here. (See discussion about temporary solution - * further below). 
- * - * In order to minimize packet dropping, and to preserve existing - * behavior, we treat this case as if there were no IRE_CACHE for the - * gateway, and instead use the IF_RESOLVER ire to send out - * another request to ARP (this is achieved by passing the - * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the - * arp response comes back in ip_wput_nondata, we will create - * a per-dst ire_cache that has an ND_COMPLETE ire. - * - * Note that this is a temporary solution; the correct solution is - * to create an incomplete per-dst ire_cache entry, and send the - * packet out when the gw's nce is resolved. In order to achieve this, - * all packet processing must have been completed prior to calling - * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need - * to be modified to accomodate this solution. - */ - if (ip_nexthop) { - /* - * The first time we come here, we look for an IRE_INTERFACE - * entry for the specified nexthop, set the dst to be the - * nexthop address and create an IRE_CACHE entry for the - * nexthop. The next time around, we are able to find an - * IRE_CACHE entry for the nexthop, set the gateway to be the - * nexthop address and create an IRE_CACHE entry for the - * destination address via the specified nexthop. 
- */ - ire = ire_cache_lookup(nexthop_addr, zoneid, - msg_getlabel(mp), ipst); - if (ire != NULL) { - gw = nexthop_addr; - ire_marks |= IRE_MARK_PRIVATE_ADDR; - } else { - ire = ire_ftable_lookup(nexthop_addr, 0, 0, - IRE_INTERFACE, NULL, NULL, zoneid, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, - ipst); - if (ire != NULL) { - dst = nexthop_addr; - } - } - } else { - ire = ire_ftable_lookup(dst, 0, 0, 0, - NULL, &sire, zoneid, 0, msg_getlabel(mp), - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | - MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, - ipst); - } - - ip3dbg(("ip_newroute: ire_ftable_lookup() " - "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. - */ - do { - /* Clear the previous iteration's values */ - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - - multirt_resolve_next = B_FALSE; - /* - * We check if packets have to be multirouted. - * In this case, given the current <ire, sire> couple, - * we look for the next suitable <ire, sire>. - * This check is done in ire_multirt_lookup(), - * which applies various criteria to find the next route - * to resolve. ire_multirt_lookup() leaves <ire, sire> - * unchanged if it detects it has not been tried yet. 
- */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - ip3dbg(("ip_newroute: starting next_resolution " - "with first_mp %p, tag %d\n", - (void *)first_mp, - MULTIRT_DEBUG_TAGGED(first_mp))); - - ASSERT(sire != NULL); - multirt_is_resolvable = - ire_multirt_lookup(&ire, &sire, multirt_flags, - &multirt_already_resolved, msg_getlabel(mp), ipst); - - ip3dbg(("ip_newroute: multirt_is_resolvable %d, " - "multirt_already_resolved %d, " - "multirt_res_attempts %d, multirt_res_failures %d, " - "ire %p, sire %p\n", multirt_is_resolvable, - multirt_already_resolved, multirt_res_attempts, - multirt_res_failures, (void *)ire, (void *)sire)); - - if (!multirt_is_resolvable) { - /* - * No more multirt route to resolve; give up - * (all routes resolved or no more - * resolvable routes). - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - /* - * Generate ICMP error only if all attempts to - * resolve multirt route failed and there is no - * already resolved one. Don't generate ICMP - * error when: - * - * 1) there was no attempt to resolve - * 2) at least one attempt passed - * 3) a multirt route is already resolved - * - * Case 1) may occur due to multiple - * resolution attempts during single - * ip_multirt_resolution_interval. - * - * Case 2-3) means that CGTP destination is - * reachable via one link so we don't want to - * generate ICMP host unreachable error. 
- */ - if (multirt_res_attempts == 0 || - multirt_res_failures < - multirt_res_attempts || - multirt_already_resolved > 0) - multirt_no_icmp_error = B_TRUE; - } else { - ASSERT(sire != NULL); - ASSERT(ire != NULL); - - multirt_res_attempts++; - } - } - - if (ire == NULL) { - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute: " - "can't resolve %s\n", AF_INET, &dst); - } - ip3dbg(("ip_newroute: " - "ire %p, sire %p, multirt_no_icmp_error %d\n", - (void *)ire, (void *)sire, - (int)multirt_no_icmp_error)); - - if (sire != NULL) { - ire_refrele(sire); - sire = NULL; - } - - if (multirt_no_icmp_error) { - /* There is no need to report an ICMP error. */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, - RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - goto icmp_err_ret; - } - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) - */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & (IRE_CACHETABLE | - IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } - /* - * We have a route to reach the destination. Find the - * appropriate ill, then get a source address using - * ipif_select_source(). 
- * - * If we are here trying to create an IRE_CACHE for an offlink - * destination and have an IRE_CACHE entry for VNI, then use - * ire_stq instead since VNI's queue is a black hole. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) { - dst_ill = ire->ire_stq->q_ptr; - ill_refhold(dst_ill); - } else { - ill_t *ill = ire->ire_ipif->ipif_ill; - - if (IS_IPMP(ill)) { - dst_ill = - ipmp_illgrp_hold_next_ill(ill->ill_grp); - } else { - dst_ill = ill; - ill_refhold(dst_ill); - } - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: no dst " - "ill for dst %s\n", AF_INET, &dst); - } - goto icmp_err_ret; - } - ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); - - /* - * Pick the best source address from dst_ill. - * - * 1) Try to pick the source address from the destination - * route. Clustering assumes that when we have multiple - * prefixes hosted on an interface, the prefix of the - * source address matches the prefix of the destination - * route. We do this only if the address is not - * DEPRECATED. - * - * 2) If the conn is in a different zone than the ire, we - * need to pick a source address from the right zone. - */ - ASSERT(src_ipif == NULL); - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * The RTF_SETSRC flag is set in the parent ire (sire). - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - } - - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (src_ipif == NULL && - (!unspec_src || ipha->ipha_src != INADDR_ANY)) { - ire_marks |= IRE_MARK_USESRC_CHECK; - if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && - IS_IPMP(ire->ire_ipif->ipif_ill) || - (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (connp != NULL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) || - (dst_ill->ill_usesrc_ifindex != 0)) { - /* - * If the destination is reachable via a - * given gateway, the selected source address - * should be in the same subnet as the gateway. - * Otherwise, the destination is not reachable. - * - * If there are no interfaces on the same subnet - * as the destination, ipif_select_source gives - * first non-deprecated interface which might be - * on a different subnet than the gateway. - * This is not desirable. Hence pass the dst_ire - * source address to ipif_select_source. - * It is sure that the destination is reachable - * with the dst_ire source address subnet. - * So passing dst_ire source address to - * ipif_select_source will make sure that the - * selected source will be on the same subnet - * as dst_ire source address. - */ - ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; - - src_ipif = ipif_select_source(dst_ill, saddr, - zoneid); - if (src_ipif == NULL) { - /* - * In the case of multirouting, it may - * happen that ipif_select_source fails - * as DAD may disallow use of the - * particular source interface. Anyway, - * we need to continue and attempt to - * resolve other multirt routes. 
- */ - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - ire_refrele(ire); - ire = NULL; - multirt_resolve_next = B_TRUE; - multirt_res_failures++; - continue; - } - - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: " - "no src for dst %s ", - AF_INET, &dst); - printf("on interface %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - } else { - src_ipif = ire->ire_ipif; - ASSERT(src_ipif != NULL); - /* hold src_ipif for uniformity */ - ipif_refhold(src_ipif); - } - } - - /* - * Assign a source address while we have the conn. - * We can't have ip_wput_ire pick a source address when the - * packet returns from arp since we need to look at - * conn_unspec_src and conn_zoneid, and we lose the conn when - * going through arp. - * - * NOTE : ip_newroute_v6 does not have this piece of code as - * it uses ip6i to store this information. - */ - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ipif->ipif_src_addr; - - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute: first hop %s\n", - AF_INET, &gw); - } - ip2dbg(("\tire type %s (%d)\n", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - - /* - * The TTL of multirouted packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - /* Force TTL of multirouted packets */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_newroute: forcing multirt TTL " - "to %d (was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(sire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - } - /* - * At this point in ip_newroute(), ire is either the - * IRE_CACHE of the next-hop gateway for an off-subnet - * destination or an IRE_INTERFACE type that should be used - * to resolve an on-subnet destination or an on-subnet - * next-hop gateway. 
- * - * In the IRE_CACHE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using this IRE_CACHE will go out on - * dst_ill. - * - * 3) The IRE sire will point to the prefix that is the - * longest matching route for the destination. These - * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. - * - * The newly created IRE_CACHE entry for the off-subnet - * destination is tied to both the prefix route and the - * interface route used to resolve the next-hop gateway - * via the ire_phandle and ire_ihandle fields, - * respectively. - * - * In the IRE_INTERFACE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. Next time through ip_newroute, we will create - * the IRE_CACHE for the final destination as described - * above. - * - * In both cases, after the current resolution has been - * completed (or possibly initialised, in the IRE_INTERFACE - * case), the loop may be re-entered to attempt the resolution - * of another RTF_MULTIRT route. - * - * When an IRE_CACHE entry for the off-subnet destination is - * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, - * for further processing in emission loops. 
- */ - save_ire = ire; - switch (ire->ire_type) { - case IRE_CACHE: { - ire_t *ipif_ire; - - ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); - if (gw == 0) - gw = ire->ire_gateway_addr; - /* - * We need 3 ire's to create a new cache ire for an - * off-link destination from the cache ire of the - * gateway. - * - * 1. The prefix ire 'sire' (Note that this does - * not apply to the conn_nexthop_set case) - * 2. The cache ire of the gateway 'ire' - * 3. The interface ire 'ipif_ire' - * - * We have (1) and (2). We lookup (3) below. - * - * If there is no interface route to the gateway, - * it is a race condition, where we found the cache - * but the interface route has been deleted. - */ - if (ip_nexthop) { - ipif_ire = ire_ihandle_lookup_onlink(ire); - } else { - ipif_ire = - ire_ihandle_lookup_offlink(ire, sire); - } - if (ipif_ire == NULL) { - ip1dbg(("ip_newroute: " - "ire_ihandle_lookup_offlink failed\n")); - goto icmp_err_ret; - } - - /* - * Check cached gateway IRE for any security - * attributes; if found, associate the gateway - * credentials group to the destination IRE. - */ - if ((attrp = save_ire->ire_gw_secattr) != NULL) { - mutex_enter(&attrp->igsa_lock); - if ((gcgrp = attrp->igsa_gcgrp) != NULL) - GCGRP_REFHOLD(gcgrp); - mutex_exit(&attrp->igsa_lock); - } - - /* - * XXX For the source of the resolver mp, - * we are using the same DL_UNITDATA_REQ - * (from save_ire->ire_nce->nce_res_mp) - * though the save_ire is not pointing at the same ill. - * This is incorrect. We need to send it up to the - * resolver to get the right res_mp. For ethernets - * this may be okay (ill_type == DL_ETHER). 
- */ - - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - &save_ire->ire_max_frag, - save_ire->ire_nce, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - (sire != NULL) ? - sire->ire_mask : 0, /* Parent mask */ - (sire != NULL) ? - sire->ire_phandle : 0, /* Parent handle */ - ipif_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? (sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ - (sire != NULL) ? - &(sire->ire_uinfo) : &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* - * Prevent sire and ipif_ire from getting deleted. - * The newly created ire is tied to both of them via - * the phandle and ihandle respectively. - */ - if (sire != NULL) { - IRB_REFHOLD(sire->ire_bucket); - /* Has it been removed already ? */ - if (sire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - } - - IRB_REFHOLD(ipif_ire->ire_bucket); - /* Has it been removed already ? */ - if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(ipif_ire->ire_bucket); - if (sire != NULL) - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - /* - * In the case of multirouting, a copy - * of the packet is done before its sending. - * The copy is used to attempt another - * route resolution, in a next loop. 
- */ - if (ire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - - ire_add_then_send(q, ire, xmit_mp); - ire_refrele(save_ire); - - /* Assert that sire is not deleted yet. */ - if (sire != NULL) { - ASSERT(sire->ire_ptpn != NULL); - IRB_REFRELE(sire->ire_bucket); - } - - /* Assert that ipif_ire is not deleted yet. */ - ASSERT(ipif_ire->ire_ptpn != NULL); - IRB_REFRELE(ipif_ire->ire_bucket); - ire_refrele(ipif_ire); - - /* - * If copy_mp is not NULL, multirouting was - * requested. We loop to initiate a next - * route resolution attempt, starting from sire. - */ - if (copy_mp != NULL) { - /* - * Search for the next unresolved - * multirt route. - */ - copy_mp = NULL; - ipif_ire = NULL; - ire = NULL; - multirt_resolve_next = B_TRUE; - continue; - } - if (sire != NULL) - ire_refrele(sire); - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } - case IRE_IF_NORESOLVER: { - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ip_newroute: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - break; - } - - /* - * TSol note: We are creating the ire cache for the - * destination 'dst'. If 'dst' is offlink, going - * through the first hop 'gw', the security attributes - * of 'dst' must be set to point to the gateway - * credentials of gateway 'gw'. If 'dst' is onlink, it - * is possible that 'dst' is a potential gateway that is - * referenced by some route that has some security - * attributes. Thus in the former case, we need to do a - * gcgrp_lookup of 'gw' while in the latter case we - * need to do gcgrp_lookup of 'dst' itself. - */ - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, - &ga.ga_addr); - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - save_ire->ire_mask, /* Parent mask */ - (sire != NULL) ? /* Parent handle */ - sire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - /* - * In the case of multirouting, a copy - * of the packet is made before it is sent. - * The copy is used in the next - * loop to attempt another resolution. - */ - xmit_mp = first_mp; - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - - if (copy_mp != NULL) { - /* - * If we found a (no)resolver, we ignore any - * trailing top priority IRE_CACHE in further - * loops. This ensures that we do not omit any - * (no)resolver. - * This IRE_CACHE, if any, will be processed - * by another thread entering ip_newroute(). 
- * IRE_CACHE entries, if any, will be processed - * by another thread entering ip_newroute(), - * (upon resolver response, for instance). - * This aims to force parallel multirt - * resolutions as soon as a packet must be sent. - * In the best case, after the tx of only one - * packet, all reachable routes are resolved. - * Otherwise, the resolution of all RTF_MULTIRT - * routes would require several emissions. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - - /* - * Search for the next unresolved multirt - * route. - */ - copy_mp = NULL; - save_ire = NULL; - ire = NULL; - multirt_resolve_next = B_TRUE; - continue; - } - - /* - * Don't need sire anymore - */ - if (sire != NULL) - ire_refrele(sire); - - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least we - * found a resolver that can help. - */ - res_mp = dst_ill->ill_resolver_mp; - if (!OK_RESOLVER_MP(res_mp)) - break; - - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. - * When ire_add_then_send() tries to put the IP dg - * to dst, it will reenter ip_newroute() at which - * time we will find the IRE_CACHE for the gw and - * create another IRE_CACHE in case IRE_CACHE above. - */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - /* - * In the case of multirouting, it may - * happen that ipif_select_source fails - * as DAD may disallow use of the - * particular source interface. 
Anyway, - * we need to continue and attempt to - * resolve other multirt routes. - */ - if (src_ipif == NULL) { - if (sire != NULL && - (sire->ire_flags & - RTF_MULTIRT)) { - ire_refrele(ire); - ire = NULL; - multirt_resolve_next = - B_TRUE; - multirt_res_failures++; - continue; - } - if (ip_debug > 2) { - pr_addr_dbg( - "ip_newroute: no " - "src for gw %s ", - AF_INET, &gw); - printf("on " - "interface %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - } - save_dst = dst; - dst = gw; - gw = INADDR_ANY; - } - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - * The ire_max_frag is atomically set under the - * irebucket lock in ire_add_v[46]. - */ - - ire = ire_create_mp( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - NULL, /* ire_max_frag */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, /* Interface ipif */ - save_ire->ire_mask, /* Parent mask */ - 0, - save_ire->ire_ihandle, /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - - ire->ire_marks |= ire_marks; - - /* - * Construct message chain for the resolver - * of the form: - * ARP_REQ_MBLK-->IRE_MBLK-->Packet - * Packet could contain a IPSEC_OUT mp. - * - * NOTE : ire will be added later when the response - * comes back from ARP. If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. 
We may not be able to REFRELE the bucket - * if the response never comes back. Thus, before - * adding the ire, ire_add_v4 will make sure that the - * interface route does not get deleted. This is the - * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 - * where we can always prevent deletions because of - * the synchronous nature of adding IRES i.e - * ire_add_then_send is called after creating the IRE. - */ - ASSERT(ire->ire_mp != NULL); - ire->ire_mp->b_cont = first_mp; - /* Have saved_mp handy, for cleanup if canput fails */ - saved_mp = mp; - mp = copyb(res_mp); - if (mp == NULL) { - /* Prepare for cleanup */ - mp = saved_mp; /* pkt */ - ire_delete(ire); /* ire_mp */ - ire = NULL; - ire_refrele(save_ire); - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - linkb(mp, ire->ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. - * NOTE: this depends on memory layouts imposed by - * ill_init(). - */ - areq = (areq_t *)mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_sender_addr_offset); - *addrp = save_ire->ire_src_addr; - - ire_refrele(save_ire); - addrp = (ipaddr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - /* Up to the resolver. */ - if (canputnext(dst_ill->ill_rq) && - !(dst_ill->ill_arp_closing)) { - putnext(dst_ill->ill_rq, mp); - ire = NULL; - if (copy_mp != NULL) { - /* - * If we found a resolver, we ignore - * any trailing top priority IRE_CACHE - * in the further loops. This ensures - * that we do not omit any resolver. - * IRE_CACHE entries, if any, will be - * processed next time we enter - * ip_newroute(). - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - /* Prepare the next resolution loop. 
*/ - mp = first_mp; - EXTRACT_PKT_MP(mp, first_mp, - mctl_present); - if (mctl_present) - io = (ipsec_out_t *) - first_mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; - - ASSERT(sire != NULL); - - dst = save_dst; - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - - /* - * The response will come back in ip_wput - * with db_type IRE_DB_TYPE. - */ - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } else { - /* Prepare for cleanup */ - DTRACE_PROBE1(ip__newroute__drop, mblk_t *, - mp); - mp->b_cont = NULL; - freeb(mp); /* areq */ - /* - * this is an ire that is not added to the - * cache. ire_freemblk will handle the release - * of any resources associated with the ire. - */ - ire_delete(ire); /* ire_mp */ - mp = saved_mp; /* pkt */ - ire = NULL; - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - default: - break; - } - } while (multirt_resolve_next); - - ip1dbg(("ip_newroute: dropped\n")); - /* Did this packet originate externally? */ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - } else { - if (dst_ill != NULL) { - BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } - } - ASSERT(copy_mp == NULL); - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - return; - -icmp_err_ret: - ip1dbg(("ip_newroute: no route\n")); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (sire != NULL) - ire_refrele(sire); - /* Did this packet originate externally? 
*/ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes); - q = WR(q); - } else { - /* - * There is no outgoing ill, so just increment the - * system MIB. - */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. - */ - if (ip_hdr_complete(ipha, zoneid, ipst)) { - /* Failed */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - return; - } - } - - /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. - */ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED, - zoneid, ipst); - return; - } - icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); + return ("unknown"); } -ip_opt_info_t zero_info; - -/* - * IPv4 - - * ip_newroute_ipif is called by ip_wput_multicast and - * ip_rput_forward_multicast whenever we need to send - * out a packet to a destination address for which we do not have specific - * routing information. It is used when the packet will be sent out - * on a specific interface. It is also called by ip_wput() when IP_BOUND_IF - * socket option is set or icmp error message wants to go out on a particular - * interface for a unicast packet. - * - * In most cases, the destination address is resolved thanks to the ipif - * intrinsic resolver. However, there are some cases where the call to - * ip_newroute_ipif must take into account the potential presence of - * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire - * that uses the interface. 
This is specified through flags, - * which can be a combination of: - * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC - * flag, the resulting ire will inherit the IRE_OFFSUBNET source address - * and flags. Additionally, the packet source address has to be set to - * the specified address. The caller is thus expected to set this flag - * if the packet has no specific source address yet. - * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT - * flag, the resulting ire will inherit the flag. All unresolved routes - * to the destination must be explored in the same call to - * ip_newroute_ipif(). - */ -static void -ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, - conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop) +static int +ip_wait_for_info_ack(ill_t *ill) { - areq_t *areq; - ire_t *ire = NULL; - mblk_t *res_mp; - ipaddr_t *addrp; - mblk_t *first_mp; - ire_t *save_ire = NULL; - ipif_t *src_ipif = NULL; - ushort_t ire_marks = 0; - ill_t *dst_ill = NULL; - ipha_t *ipha; - mblk_t *saved_mp; - ire_t *fire = NULL; - mblk_t *copy_mp = NULL; - boolean_t multirt_resolve_next; - boolean_t unspec_src; - ipaddr_t ipha_dst; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* - * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold - * here for uniformity - */ - ipif_refhold(ipif); - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. 
- */ - do { - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - multirt_resolve_next = B_FALSE; - - ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), - ipif->ipif_ill->ill_name)); - - first_mp = mp; - if (DB_TYPE(mp) == M_CTL) - mp = mp->b_cont; - ipha = (ipha_t *)mp->b_rptr; - - /* - * Save the packet destination address, we may need it after - * the packet has been consumed. - */ - ipha_dst = ipha->ipha_dst; - - /* - * If the interface is a pt-pt interface we look for an - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the - * local_address and the pt-pt destination address. Otherwise - * we just match the local address. - * NOTE: dst could be different than ipha->ipha_dst in case - * of sending igmp multicast packets over a point-to-point - * connection. - * Thus we must be careful enough to check ipha_dst to be a - * multicast address, otherwise it will take xmit_if path for - * multicast packets resulting into kernel stack overflow by - * repeated calls to ip_newroute_ipif from ire_send(). - */ - if (CLASSD(ipha_dst) && - !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { - goto err_ret; - } - - /* - * We check if an IRE_OFFSUBNET for the addr that goes through - * ipif exists. We need it to determine if the RTF_SETSRC and/or - * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may - * propagate its flags to the new ire. - */ - if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { - fire = ipif_lookup_multi_ire(ipif, ipha_dst); - ip2dbg(("ip_newroute_ipif: " - "ipif_lookup_multi_ire(" - "ipif %p, dst %08x) = fire %p\n", - (void *)ipif, ntohl(dst), (void *)fire)); - } - - /* - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. The source - * ipif is determined by source address selection below. 
- */ - if (IS_IPMP(ipif->ipif_ill)) { - ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp; - - if (CLASSD(ipha_dst)) - dst_ill = ipmp_illgrp_hold_cast_ill(illg); - else - dst_ill = ipmp_illgrp_hold_next_ill(illg); - } else { - dst_ill = ipif->ipif_ill; - ill_refhold(dst_ill); - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif: no dst ill " - "for dst %s\n", AF_INET, &dst); - } - goto err_ret; - } - - /* - * Pick a source address preferring non-deprecated ones. - * Unlike ip_newroute, we don't do any source address - * selection here since for multicast it really does not help - * in inbound load spreading as in the unicast case. - */ - if ((flags & RTF_SETSRC) && (fire != NULL) && - (fire->ire_flags & RTF_SETSRC)) { - /* - * As requested by flags, an IRE_OFFSUBNET was looked up - * on that interface. This ire has RTF_SETSRC flag, so - * the source address of the packet must be changed. - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - } - - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (!IS_UNDER_IPMP(ipif->ipif_ill) && - (IS_IPMP(ipif->ipif_ill) || - (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || - (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || - (connp != NULL && ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES)) && - (src_ipif == NULL) && - (!unspec_src || ipha->ipha_src != INADDR_ANY)) { - src_ipif = ipif_select_source(dst_ill, dst, zoneid); - if (src_ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif: " - "no src for dst %s", - AF_INET, &dst); - } - ip1dbg((" on interface %s\n", - dst_ill->ill_name)); - goto err_ret; - } - ipif_refrele(ipif); - ipif = src_ipif; - ipif_refhold(ipif); - } - if (src_ipif == NULL) { - src_ipif = ipif; - ipif_refhold(src_ipif); - } - - /* - * Assign a source address while we have the conn. - * We can't have ip_wput_ire pick a source address when the - * packet returns from arp since conn_unspec_src might be set - * and we lose the conn when going through arp. - */ - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ipif->ipif_src_addr; - - /* - * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible - * that the outgoing interface does not have an interface ire. - */ - if (CLASSD(ipha_dst) && (connp == NULL || - connp->conn_outgoing_ill == NULL) && - infop->ip_opt_ill_index == 0) { - /* ipif_to_ire returns an held ire */ - ire = ipif_to_ire(ipif); - if (ire == NULL) - goto err_ret; - if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) - goto err_ret; - save_ire = ire; - - ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " - "flags %04x\n", - (void *)ire, (void *)ipif, flags)); - if ((flags & RTF_MULTIRT) && (fire != NULL) && - (fire->ire_flags & RTF_MULTIRT)) { - /* - * As requested by flags, an IRE_OFFSUBNET was - * looked up on that interface. 
This ire has - * RTF_MULTIRT flag, so the resolution loop will - * be re-entered to resolve additional routes on - * other interfaces. For that purpose, a copy of - * the packet is performed at this point. - */ - fire->ire_last_used_time = lbolt; - copy_mp = copymsg(first_mp); - if (copy_mp) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - if ((flags & RTF_SETSRC) && (fire != NULL) && - (fire->ire_flags & RTF_SETSRC)) { - /* - * As requested by flags, an IRE_OFFSUBET was - * looked up on that interface. This ire has - * RTF_SETSRC flag, so the source address of the - * packet must be changed. - */ - ipha->ipha_src = fire->ire_src_addr; - } - } else { - /* - * The only ways we can come here are: - * 1) IP_BOUND_IF socket option is set - * 2) SO_DONTROUTE socket option is set - * 3) IP_PKTINFO option is passed in as ancillary data. - * In all cases, the new ire will not be added - * into cache table. - */ - ASSERT(connp == NULL || connp->conn_dontroute || - connp->conn_outgoing_ill != NULL || - infop->ip_opt_ill_index != 0); - ire_marks |= IRE_MARK_NOADD; - } - - switch (ipif->ipif_net_type) { - case IRE_IF_NORESOLVER: { - /* We have what we need to build an IRE_CACHE. */ - - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ip_newroute_ipif: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - break; - } - - /* - * The new ire inherits the IRE_OFFSUBNET flags - * and source address, if this was requested. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* gateway address */ - &ipif->ipif_mtu, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - (save_ire != NULL ? save_ire->ire_mask : 0), - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - (save_ire != NULL) ? 
/* Interface handle */ - save_ire->ire_ihandle : 0, - (fire != NULL) ? - (fire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, - (save_ire == NULL ? &ire_uinfo_null : - &save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (ire == NULL) { - if (save_ire != NULL) - ire_refrele(save_ire); - break; - } - - ire->ire_marks |= ire_marks; - - /* - * If IRE_MARK_NOADD is set then we need to convert - * the max_fragp to a useable value now. This is - * normally done in ire_add_v[46]. We also need to - * associate the ire with an nce (normally would be - * done in ip_wput_nondata()). - * - * Note that IRE_MARK_NOADD packets created here - * do not have a non-null ire_mp pointer. The null - * value of ire_bucket indicates that they were - * never added. - */ - if (ire->ire_marks & IRE_MARK_NOADD) { - uint_t max_frag; - - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - - if ((ire->ire_nce = ndp_lookup_v4( - ire_to_ill(ire), - (ire->ire_gateway_addr != INADDR_ANY ? - &ire->ire_gateway_addr : &ire->ire_addr), - B_FALSE)) == NULL) { - if (save_ire != NULL) - ire_refrele(save_ire); - break; - } - ASSERT(ire->ire_nce->nce_state == - ND_REACHABLE); - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } - - /* Prevent save_ire from getting deleted */ - if (save_ire != NULL) { - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - } - - ire_add_then_send(q, ire, first_mp); - - /* Assert that save_ire is not deleted yet. */ - if (save_ire != NULL) { - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - save_ire = NULL; - } - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * the resolution loop is re-entered if this - * was requested through flags and if we - * actually are in a multirouting case. 
- */ - if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { - boolean_t need_resolve = - ire_multirt_need_resolve(ipha_dst, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group() calls - * ire_lookup_multi() that uses - * ire_ftable_lookup() to find - * an IRE_INTERFACE for the group. - * In the multirt case, - * ire_lookup_multi() then invokes - * ire_multirt_lookup() to find - * the next resolvable ire. - * As a result, we obtain an new - * interface, derived from the - * next ire. - */ - ipif_refrele(ipif); - ipif = ipif_lookup_group(ipha_dst, - zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, ipif %p\n", - htonl(dst), (void *)ipif)); - if (ipif != NULL) { - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - if (ipif != NULL) - ipif_refrele(ipif); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least - * we found a resolver that can help. - */ - res_mp = dst_ill->ill_resolver_mp; - if (!OK_RESOLVER_MP(res_mp)) - break; - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - * The new ire inherits the IRE_OFFSUBNET flags - * and source address, if this was requested. - * The ire_max_frag is atomically set under the - * irebucket lock in ire_add_v[46]. Only in the - * case of IRE_MARK_NOADD, we set it here itself. - */ - ire = ire_create_mp( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* gateway address */ - (ire_marks & IRE_MARK_NOADD) ? 
- ipif->ipif_mtu : 0, /* max_frag */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - (save_ire != NULL ? save_ire->ire_mask : 0), - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - (save_ire != NULL) ? /* Interface handle */ - save_ire->ire_ihandle : 0, - (fire != NULL) ? /* flags if any */ - (fire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, - (save_ire == NULL ? &ire_uinfo_null : - &save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (save_ire != NULL) { - ire_refrele(save_ire); - save_ire = NULL; - } - if (ire == NULL) - break; - - ire->ire_marks |= ire_marks; - /* - * Construct message chain for the resolver of the - * form: - * ARP_REQ_MBLK-->IRE_MBLK-->Packet - * - * NOTE : ire will be added later when the response - * comes back from ARP. If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. We may not be able to REFRELE the - * bucket if the response never comes back. - * Thus, before adding the ire, ire_add_v4 will make - * sure that the interface route does not get deleted. - * This is the only case unlike ip_newroute_v6, - * ip_newroute_ipif_v6 where we can always prevent - * deletions because ire_add_then_send is called after - * creating the IRE. - * If IRE_MARK_NOADD is set, then ire_add_then_send - * does not add this IRE into the IRE CACHE. - */ - ASSERT(ire->ire_mp != NULL); - ire->ire_mp->b_cont = first_mp; - /* Have saved_mp handy, for cleanup if canput fails */ - saved_mp = mp; - mp = copyb(res_mp); - if (mp == NULL) { - /* Prepare for cleanup */ - mp = saved_mp; /* pkt */ - ire_delete(ire); /* ire_mp */ - ire = NULL; - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - linkb(mp, ire->ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. 
- * NOTE: this depends on memory layouts imposed by - * ill_init(). There are corner cases above where we - * might've created the IRE with an INADDR_ANY source - * address (e.g., if the zeroth ipif on an underlying - * ill in an IPMP group is 0.0.0.0, but another ipif - * on the ill has a usable test address). If so, tell - * ARP to use ipha_src as its sender address. - */ - areq = (areq_t *)mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_sender_addr_offset); - if (ire->ire_src_addr != INADDR_ANY) - *addrp = ire->ire_src_addr; - else - *addrp = ipha->ipha_src; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - /* Up to the resolver. */ - if (canputnext(dst_ill->ill_rq) && - !(dst_ill->ill_arp_closing)) { - putnext(dst_ill->ill_rq, mp); - /* - * The response will come back in ip_wput - * with db_type IRE_DB_TYPE. - */ - } else { - mp->b_cont = NULL; - freeb(mp); /* areq */ - ire_delete(ire); /* ire_mp */ - saved_mp->b_next = NULL; - saved_mp->b_prev = NULL; - freemsg(first_mp); /* pkt */ - ip2dbg(("ip_newroute_ipif: dropped\n")); - } - - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } + int err; - /* - * The resolution loop is re-entered if this was - * requested through flags and we actually are - * in a multirouting case. - */ - if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { - boolean_t need_resolve = - ire_multirt_need_resolve(ipha_dst, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group() calls - * ire_lookup_multi() that uses - * ire_ftable_lookup() to find - * an IRE_INTERFACE for the group. - * In the multirt case, - * ire_lookup_multi() then invokes - * ire_multirt_lookup() to find - * the next resolvable ire. - * As a result, we obtain an new - * interface, derived from the - * next ire. 
- */ - ipif_refrele(ipif); - ipif = ipif_lookup_group(ipha_dst, - zoneid, ipst); - if (ipif != NULL) { - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - if (ipif != NULL) - ipif_refrele(ipif); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - break; - } - } while (multirt_resolve_next); - -err_ret: - ip2dbg(("ip_newroute_ipif: dropped\n")); - if (fire != NULL) - ire_refrele(fire); - ipif_refrele(ipif); - /* Did this packet originate externally? */ - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (mp->b_prev || mp->b_next) { - mp->b_next = NULL; - mp->b_prev = NULL; - } else { + mutex_enter(&ill->ill_lock); + while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. + * Return value of 0 indicates a pending signal. */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - return; + err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); + if (err == 0) { + mutex_exit(&ill->ill_lock); + return (EINTR); } } + mutex_exit(&ill->ill_lock); /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. + * ip_rput_other could have set an error in ill_error on + * receipt of M_ERROR. 
*/ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); -} - -/* Name/Value Table Lookup Routine */ -char * -ip_nv_lookup(nv_t *nv, int value) -{ - if (!nv) - return (NULL); - for (; nv->nv_name; nv++) { - if (nv->nv_value == value) - return (nv->nv_name); - } - return ("unknown"); + return (ill->ill_error); } /* @@ -9604,7 +5972,7 @@ ip_nv_lookup(nv_t *nv, int value) * to a DLPI device. We allocate an ill_t as the instance data in * this case. */ -int +static int ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { ill_t *ill; @@ -9644,6 +6012,7 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * down a DL_INFO_REQ after calling qprocson. */ err = ill_init(q, ill); + if (err != 0) { mi_free(ill); netstack_rele(ipst->ips_netstack); @@ -9652,41 +6021,26 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (err); } - /* ill_init initializes the ipsq marking this thread as writer */ - ipsq_exit(ill->ill_phyint->phyint_ipsq); - /* Wait for the DL_INFO_ACK */ - mutex_enter(&ill->ill_lock); - while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { - /* - * Return value of 0 indicates a pending signal. - */ - err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); - if (err == 0) { - mutex_exit(&ill->ill_lock); - (void) ip_close(q, 0); - return (EINTR); - } - } - mutex_exit(&ill->ill_lock); - /* - * ip_rput_other could have set an error in ill_error on - * receipt of M_ERROR. + * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent. 
+ * + * ill_init initializes the ipsq marking this thread as + * writer */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + err = ip_wait_for_info_ack(ill); + if (err == 0) + ill->ill_credp = credp; + else + goto fail; - err = ill->ill_error; - if (err != 0) { - (void) ip_close(q, 0); - return (err); - } - - ill->ill_credp = credp; crhold(credp); mutex_enter(&ipst->ips_ip_mi_lock); - err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, - credp); + err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag, + sflag, credp); mutex_exit(&ipst->ips_ip_mi_lock); +fail: if (err) { (void) ip_close(q, 0); return (err); @@ -9719,8 +6073,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, netstack_t *ns; ip_stack_t *ipst; - TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); - /* Allow reopen. */ if (q->q_ptr != NULL) return (0); @@ -9765,25 +6117,24 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, */ netstack_rele(ipst->ips_netstack); + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_zoneid = zoneid; - connp->conn_sqp = NULL; - connp->conn_initial_sqp = NULL; - connp->conn_final_sqp = NULL; - connp->conn_upq = q; + connp->conn_rq = q; q->q_ptr = WR(q)->q_ptr = connp; - if (flag & SO_SOCKSTR) - connp->conn_flags |= IPCL_SOCKET; - /* Minor tells us which /dev entry was opened */ if (isv6) { - connp->conn_af_isv6 = B_TRUE; - ip_setpktversion(connp, isv6, B_FALSE, ipst); - connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; } else { - connp->conn_af_isv6 = B_FALSE; - connp->conn_pkt_isv6 = B_FALSE; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + 
connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; } if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && @@ -9812,11 +6163,17 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, * connp->conn_cred is crfree()ed in ipcl_conn_destroy() */ connp->conn_cred = credp; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* - * Handle IP_RTS_REQUEST and other ioctls which use conn_recv + * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv */ connp->conn_recv = ip_conn_input; + connp->conn_recvicmp = ip_conn_input_icmp; crhold(connp->conn_cred); @@ -9827,11 +6184,13 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); + connp->conn_rq = q; connp->conn_wq = WR(q); /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP; /* * Make the conn globally visible to walkers @@ -9847,210 +6206,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } /* - * Change the output format (IPv4 vs. IPv6) for a conn_t. - * Note that there is no race since either ip_output function works - it - * is just an optimization to enter the best ip_output routine directly. 
- */ -void -ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, - ip_stack_t *ipst) -{ - if (isv6) { - if (bump_mib) { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutSwitchIPVersion); - } - connp->conn_send = ip_output_v6; - connp->conn_pkt_isv6 = B_TRUE; - } else { - if (bump_mib) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutSwitchIPVersion); - } - connp->conn_send = ip_output; - connp->conn_pkt_isv6 = B_FALSE; - } - -} - -/* - * See if IPsec needs loading because of the options in mp. - */ -static boolean_t -ipsec_opt_present(mblk_t *mp) -{ - uint8_t *optcp, *next_optcp, *opt_endcp; - struct opthdr *opt; - struct T_opthdr *topt; - int opthdr_len; - t_uscalar_t optname, optlevel; - struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; - ipsec_req_t *ipsr; - - /* - * Walk through the mess, and find IP_SEC_OPT. If it's there, - * return TRUE. - */ - - optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); - opt_endcp = optcp + tor->OPT_length; - if (tor->PRIM_type == T_OPTMGMT_REQ) { - opthdr_len = sizeof (struct T_opthdr); - } else { /* O_OPTMGMT_REQ */ - ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); - opthdr_len = sizeof (struct opthdr); - } - for (; optcp < opt_endcp; optcp = next_optcp) { - if (optcp + opthdr_len > opt_endcp) - return (B_FALSE); /* Not enough option header. 
*/ - if (tor->PRIM_type == T_OPTMGMT_REQ) { - topt = (struct T_opthdr *)optcp; - optlevel = topt->level; - optname = topt->name; - next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); - } else { - opt = (struct opthdr *)optcp; - optlevel = opt->level; - optname = opt->name; - next_optcp = optcp + opthdr_len + - _TPI_ALIGN_OPT(opt->len); - } - if ((next_optcp < optcp) || /* wraparound pointer space */ - ((next_optcp >= opt_endcp) && /* last option bad len */ - ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) - return (B_FALSE); /* bad option buffer */ - if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || - (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { - /* - * Check to see if it's an all-bypass or all-zeroes - * IPsec request. Don't bother loading IPsec if - * the socket doesn't want to use it. (A good example - * is a bypass request.) - * - * Basically, if any of the non-NEVER bits are set, - * load IPsec. - */ - ipsr = (ipsec_req_t *)(optcp + opthdr_len); - if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || - (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || - (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) - != 0) - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * If conn is is waiting for ipsec to finish loading, kick it. - */ -/* ARGSUSED */ -static void -conn_restart_ipsec_waiter(conn_t *connp, void *arg) -{ - t_scalar_t optreq_prim; - mblk_t *mp; - cred_t *cr; - int err = 0; - - /* - * This function is called, after ipsec loading is complete. - * Since IP checks exclusively and atomically (i.e it prevents - * ipsec load from completing until ip_optcom_req completes) - * whether ipsec load is complete, there cannot be a race with IP - * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 
- */ - mutex_enter(&connp->conn_lock); - if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { - ASSERT(connp->conn_ipsec_opt_mp != NULL); - mp = connp->conn_ipsec_opt_mp; - connp->conn_ipsec_opt_mp = NULL; - connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; - mutex_exit(&connp->conn_lock); - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); - if (mp != NULL) - qreply(connp->conn_wq, mp); - return; - } - - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - - optreq_prim = ((union T_primitives *)mp->b_rptr)->type; - if (optreq_prim == T_OPTMGMT_REQ) { - err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, - &ip_opt_obj, B_FALSE); - } else { - ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); - err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, - &ip_opt_obj, B_FALSE); - } - if (err != EINPROGRESS) - CONN_OPER_PENDING_DONE(connp); - return; - } - mutex_exit(&connp->conn_lock); -} - -/* - * Called from the ipsec_loader thread, outside any perimeter, to tell - * ip qenable any of the queues waiting for the ipsec loader to - * complete. - */ -void -ip_ipsec_load_complete(ipsec_stack_t *ipss) -{ - netstack_t *ns = ipss->ipsec_netstack; - - ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); -} - -/* - * Can't be used. Need to call svr4* -> optset directly. the leaf routine - * determines the grp on which it has to become exclusive, queues the mp - * and IPSQ draining restarts the optmgmt - */ -static boolean_t -ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) -{ - conn_t *connp = Q_TO_CONN(q); - ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; - - /* - * Take IPsec requests and treat them special. 
- */ - if (ipsec_opt_present(mp)) { - /* First check if IPsec is loaded. */ - mutex_enter(&ipss->ipsec_loader_lock); - if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { - mutex_exit(&ipss->ipsec_loader_lock); - return (B_FALSE); - } - mutex_enter(&connp->conn_lock); - connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; - - ASSERT(connp->conn_ipsec_opt_mp == NULL); - connp->conn_ipsec_opt_mp = mp; - mutex_exit(&connp->conn_lock); - mutex_exit(&ipss->ipsec_loader_lock); - - ipsec_loader_loadnow(ipss); - return (B_TRUE); - } - return (B_FALSE); -} - -/* * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, * all of them are copied to the conn_t. If the req is "zero", the policy is * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req @@ -10149,15 +6304,14 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) } } - mutex_enter(&connp->conn_lock); + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * If we have already cached policies in ip_bind_connected*(), don't + * If we have already cached policies in conn_connect(), don't * let them change now. We cache policies for connections * whose src,dst [addr, port] is known. */ if (connp->conn_policy_cached) { - mutex_exit(&connp->conn_lock); return (EINVAL); } @@ -10171,10 +6325,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); connp->conn_policy = NULL; } - connp->conn_flags &= ~IPCL_CHECK_POLICY; connp->conn_in_enforce_policy = B_FALSE; connp->conn_out_enforce_policy = B_FALSE; - mutex_exit(&connp->conn_lock); return (0); } @@ -10203,7 +6355,7 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) * We're looking at a v6 socket, also insert the v6-specific * entries. 
*/ - if (connp->conn_af_isv6) { + if (connp->conn_family == AF_INET6) { if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6, IPSEC_TYPE_INBOUND, ns)) goto enomem; @@ -10217,10 +6369,10 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) /* * If the requests need security, set enforce_policy. * If the requests are IPSEC_PREF_NEVER, one should - * still set conn_out_enforce_policy so that an ipsec_out - * gets attached in ip_wput. This is needed so that - * for connections that we don't cache policy in ip_bind, - * if global policy matches in ip_wput_attach_policy, we + * still set conn_out_enforce_policy so that ip_set_destination + * marks the ip_xmit_attr_t appropriatly. This is needed so that + * for connections that we don't cache policy in at connect time, + * if global policy matches in ip_output_attach_policy, we * don't wrongly inherit global policy. Similarly, we need * to set conn_in_enforce_policy also so that we don't verify * policy wrongly. @@ -10230,10 +6382,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) (se_req & REQ_MASK) != 0) { connp->conn_in_enforce_policy = B_TRUE; connp->conn_out_enforce_policy = B_TRUE; - connp->conn_flags |= IPCL_CHECK_POLICY; } - mutex_exit(&connp->conn_lock); return (error); #undef REQ_MASK @@ -10241,7 +6391,6 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) * Common memory-allocation-failure exit path. */ enomem: - mutex_exit(&connp->conn_lock); if (actp != NULL) ipsec_actvec_free(actp, nact); if (is_pol_inserted) @@ -10250,1250 +6399,283 @@ enomem: } /* - * Only for options that pass in an IP addr. Currently only V4 options - * pass in an ipif. V6 options always pass an ifindex specifying the ill. - * So this function assumes level is IPPROTO_IP + * Set socket options for joining and leaving multicast groups. + * Common to IPv4 and IPv6; inet6 indicates the type of socket. 
+ * The caller has already check that the option name is consistent with + * the address family of the socket. */ int -ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, - mblk_t *first_mp) +ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name, + uchar_t *invalp, boolean_t inet6, boolean_t checkonly) { - ipif_t *ipif = NULL; - int error; - ill_t *ill; - int zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); - - if (addr != INADDR_ANY || checkonly) { - ASSERT(connp != NULL); - zoneid = IPCL_ZONEID(connp); - if (option == IP_NEXTHOP) { - ipif = ipif_lookup_onlink_addr(addr, - connp->conn_zoneid, ipst); - } else { - ipif = ipif_lookup_addr(addr, NULL, zoneid, - CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, - &error, ipst); - } - if (ipif == NULL) { - if (error == EINPROGRESS) - return (error); - if ((option == IP_MULTICAST_IF) || - (option == IP_NEXTHOP)) - return (EHOSTUNREACH); - else - return (EINVAL); - } else if (checkonly) { - if (option == IP_MULTICAST_IF) { - ill = ipif->ipif_ill; - /* not supported by the virtual network iface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } - ipif_refrele(ipif); - return (0); - } - ill = ipif->ipif_ill; - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if ((ill->ill_state_flags & ILL_CONDEMNED) || - (ipif->ipif_state_flags & IPIF_CONDEMNED)) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (option == IP_MULTICAST_IF ? 
- EHOSTUNREACH : EINVAL); - } - } else { - mutex_enter(&connp->conn_lock); - } - - /* None of the options below are supported on the VNI */ - if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (EINVAL); - } - - switch (option) { - case IP_MULTICAST_IF: - connp->conn_multicast_ipif = ipif; + int *i1 = (int *)invalp; + int error = 0; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct ip_mreq *v4_mreqp; + struct ipv6_mreq *v6_mreqp; + struct group_req *greqp; + ire_t *ire; + boolean_t done = B_FALSE; + ipaddr_t ifaddr; + in6_addr_t v6group; + uint_t ifindex; + boolean_t mcast_opt = B_TRUE; + mcast_record_t fmode; + int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, + ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); + + switch (name) { + case IP_ADD_MEMBERSHIP: + case IPV6_JOIN_GROUP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_JOIN_GROUP: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_add_group; break; - case IP_NEXTHOP: - connp->conn_nexthop_v4 = addr; - connp->conn_nexthop_set = B_TRUE; + + case IP_DROP_MEMBERSHIP: + case IPV6_LEAVE_GROUP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_LEAVE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_delete_group; break; + default: + ASSERT(0); } - if (ipif != NULL) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (0); - } - mutex_exit(&connp->conn_lock); - /* We succeded in cleared the option */ - return (0); -} + if (mcast_opt) { + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; -/* - * For options that pass in an ifindex specifying the ill. V6 options always - * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
- */ -int -ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, - int level, int option, mblk_t *first_mp) -{ - ill_t *ill = NULL; - int error = 0; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); - if (ifindex != 0) { - ASSERT(connp != NULL); - ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), - first_mp, ip_restart_optmgmt, &error, ipst); - if (ill != NULL) { - if (checkonly) { - /* not supported by the virtual network iface */ - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - ill_refrele(ill); - return (0); - } - if (!ipif_lookup_zoneid(ill, connp->conn_zoneid, - 0, NULL)) { - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - goto setit; - } - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - } - goto setit; - } else if (error == EINPROGRESS) { - return (error); + greqp = (struct group_req *)i1; + if (greqp->gr_group.ss_family == AF_INET) { + sin = (struct sockaddr_in *)&(greqp->gr_group); + IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group); } else { - error = 0; - } + if (!inet6) + return (EINVAL); /* Not on INET socket */ + + sin6 = (struct sockaddr_in6 *)&(greqp->gr_group); + v6group = sin6->sin6_addr; + } + ifaddr = INADDR_ANY; + ifindex = greqp->gr_interface; + } else if (inet6) { + v6_mreqp = (struct ipv6_mreq *)i1; + v6group = v6_mreqp->ipv6mr_multiaddr; + ifaddr = INADDR_ANY; + ifindex = v6_mreqp->ipv6mr_interface; + } else { + v4_mreqp = (struct ip_mreq *)i1; + IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group); + ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr; + ifindex = 0; } - mutex_enter(&connp->conn_lock); -setit: - ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); /* - * The options below assume that the ILL 
(if any) transmits and/or - * receives traffic. Neither of which is true for the virtual network - * interface, so fail setting these on a VNI. + * In the multirouting case, we need to replicate + * the request on all interfaces that will take part + * in replication. We do so because multirouting is + * reflective, thus we will probably receive multi- + * casts on those interfaces. + * The ip_multirt_apply_membership() succeeds if + * the operation succeeds on at least one interface. */ - if (IS_VNI(ill)) { - ASSERT(ill != NULL); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - return (EINVAL); - } - - if (level == IPPROTO_IP) { - switch (option) { - case IP_BOUND_IF: - connp->conn_incoming_ill = ill; - connp->conn_outgoing_ill = ill; - break; - - case IP_MULTICAST_IF: - /* - * This option is an internal special. The socket - * level IP_MULTICAST_IF specifies an 'ipaddr' and - * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF - * specifies an ifindex and we try first on V6 ill's. - * If we don't find one, we they try using on v4 ill's - * intenally and we come here. - */ - if (!checkonly && ill != NULL) { - ipif_t *ipif; - ipif = ill->ill_ipif; - - if (ipif->ipif_state_flags & IPIF_CONDEMNED) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - } else { - connp->conn_multicast_ipif = ipif; - } - } - break; + if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + ipaddr_t group; - case IP_DHCPINIT_IF: - if (connp->conn_dhcpinit_ill != NULL) { - /* - * We've locked the conn so conn_cleanup_ill() - * cannot clear conn_dhcpinit_ill -- so it's - * safe to access the ill. 
- */ - ill_t *oill = connp->conn_dhcpinit_ill; + IN6_V4MAPPED_TO_IPADDR(&v6group, group); - ASSERT(oill->ill_dhcpinit != 0); - atomic_dec_32(&oill->ill_dhcpinit); - connp->conn_dhcpinit_ill = NULL; - } - - if (ill != NULL) { - connp->conn_dhcpinit_ill = ill; - atomic_inc_32(&ill->ill_dhcpinit); - } - break; - } + ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); } else { - switch (option) { - case IPV6_BOUND_IF: - connp->conn_incoming_ill = ill; - connp->conn_outgoing_ill = ill; - break; - - case IPV6_MULTICAST_IF: - /* - * Set conn_multicast_ill to be the IPv6 ill. - * Set conn_multicast_ipif to be an IPv4 ipif - * for ifindex to make IPv4 mapped addresses - * on PF_INET6 sockets honor IPV6_MULTICAST_IF. - * Even if no IPv6 ill exists for the ifindex - * we need to check for an IPv4 ifindex in order - * for this to work with mapped addresses. In that - * case only set conn_multicast_ipif. - */ - if (!checkonly) { - if (ifindex == 0) { - connp->conn_multicast_ill = NULL; - connp->conn_multicast_ipif = NULL; - } else if (ill != NULL) { - connp->conn_multicast_ill = ill; - } - } - break; + ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } + if (ire != NULL) { + if (ire->ire_flags & RTF_MULTIRT) { + error = ip_multirt_apply_membership(optfn, ire, connp, + checkonly, &v6group, fmode, &ipv6_all_zeros); + done = B_TRUE; } + ire_refrele(ire); } - if (ill != NULL) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - return (0); + if (!done) { + error = optfn(connp, checkonly, &v6group, ifaddr, ifindex, + fmode, &ipv6_all_zeros); } - mutex_exit(&connp->conn_lock); - /* - * We succeeded in clearing the option (ifindex == 0) or failed to - * locate the ill and could not set the option (ifindex != 0) - */ - return (ifindex == 0 ? 
0 : EINVAL); + return (error); } -/* This routine sets socket options. */ -/* ARGSUSED */ +/* + * Set socket options for joining and leaving multicast groups + * for specific sources. + * Common to IPv4 and IPv6; inet6 indicates the type of socket. + * The caller has already check that the option name is consistent with + * the address family of the socket. + */ int -ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *dummy, cred_t *cr, mblk_t *first_mp) +ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name, + uchar_t *invalp, boolean_t inet6, boolean_t checkonly) { int *i1 = (int *)invalp; - conn_t *connp = Q_TO_CONN(q); int error = 0; - boolean_t checkonly; - ire_t *ire; - boolean_t found; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct ip_mreq_source *imreqp; + struct group_source_req *gsreqp; + in6_addr_t v6group, v6src; + uint32_t ifindex; + ipaddr_t ifaddr; + boolean_t mcast_opt = B_TRUE; + mcast_record_t fmode; + ire_t *ire; + boolean_t done = B_FALSE; + int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, + ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); - switch (optset_context) { - - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. 
- */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - case SETFN_UD_NEGOTIATE: - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; + switch (name) { + case IP_BLOCK_SOURCE: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_BLOCK_SOURCE: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_add_group; break; - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - /* - * For fixed length options, no sanity check - * of passed in length is done. It is assumed *_optcom_req() - * routines do the right thing. - */ - - switch (level) { - case SOL_SOCKET: - /* - * conn_lock protects the bitfields, and is used to - * set the fields atomically. - */ - switch (name) { - case SO_BROADCAST: - if (!checkonly) { - /* TODO: use value someplace? */ - mutex_enter(&connp->conn_lock); - connp->conn_broadcast = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - if (!checkonly) { - /* TODO: use value someplace? */ - mutex_enter(&connp->conn_lock); - connp->conn_loopback = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_DONTROUTE: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_dontroute = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_REUSEADDR: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_reuseaddr = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_PROTOTYPE: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_proto = *i1; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_ALLZONES: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - if (IPCL_IS_BOUND(connp)) { - mutex_exit(&connp->conn_lock); - return (EINVAL); - } - connp->conn_allzones = *i1 != 0 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_ANON_MLP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_MAC_EXEMPT: - if (secpolicy_net_mac_aware(cr) != 0 || - IPCL_IS_BOUND(connp)) - return (EACCES); - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_mac_mode = *i1 != 0 ? - CONN_MAC_AWARE : CONN_MAC_DEFAULT; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_MAC_IMPLICIT: - if (secpolicy_net_mac_implicit(cr) != 0) - return (EACCES); - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_mac_mode = *i1 != 0 ? 
- CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } + case IP_UNBLOCK_SOURCE: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_UNBLOCK_SOURCE: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_delete_group; break; - case IPPROTO_IP: - switch (name) { - case IP_NEXTHOP: - if (secpolicy_ip_config(cr, B_FALSE) != 0) - return (EPERM); - /* FALLTHRU */ - case IP_MULTICAST_IF: { - ipaddr_t addr = *i1; - - error = ip_opt_set_ipif(connp, addr, checkonly, name, - first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - } - - case IP_MULTICAST_TTL: - /* Recorded in transport above IP */ - *outvalp = *invalp; - *outlenp = sizeof (uchar_t); - return (0); - case IP_MULTICAST_LOOP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_multicast_loop = *invalp ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - *outvalp = *invalp; - *outlenp = sizeof (uchar_t); - return (0); - case IP_ADD_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case IP_DROP_MEMBERSHIP: - case MCAST_LEAVE_GROUP: { - struct ip_mreq *mreqp; - struct group_req *greqp; - ire_t *ire; - boolean_t done = B_FALSE; - ipaddr_t group, ifaddr; - struct sockaddr_in *sin; - uint32_t *ifindexp; - boolean_t mcast_opt = B_TRUE; - mcast_record_t fmode; - int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *); - - switch (name) { - case IP_ADD_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_GROUP: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_DROP_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group; - break; - } - - if (mcast_opt) { - greqp = (struct group_req *)i1; - sin = (struct sockaddr_in *)&greqp->gr_group; - if (sin->sin_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - group = (ipaddr_t)sin->sin_addr.s_addr; - ifaddr = INADDR_ANY; - ifindexp = &greqp->gr_interface; - } else { - mreqp = (struct ip_mreq *)i1; - group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; - ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; - ifindexp = NULL; - } - - /* - * In the multirouting case, we need to replicate - * the request on all interfaces that will take part - * in replication. We do so because multirouting is - * reflective, thus we will probably receive multi- - * casts on those interfaces. - * The ip_multirt_apply_membership() succeeds if the - * operation succeeds on at least one interface. 
- */ - ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership( - optfn, ire, connp, checkonly, group, - fmode, INADDR_ANY, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, group, ifaddr, - ifindexp, fmode, INADDR_ANY, first_mp); - } - if (error) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. - */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: { - struct ip_mreq_source *imreqp; - struct group_source_req *gsreqp; - in_addr_t grp, src, ifaddr = INADDR_ANY; - uint32_t ifindex = 0; - mcast_record_t fmode; - struct sockaddr_in *sin; - ire_t *ire; - boolean_t mcast_opt = B_TRUE, done = B_FALSE; - int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *); - - switch (name) { - case IP_BLOCK_SOURCE: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_BLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_UNBLOCK_SOURCE: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_UNBLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_delete_group; - break; - - case IP_ADD_SOURCE_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_DROP_SOURCE_MEMBERSHIP: - mcast_opt = 
B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group; - break; - } - - if (mcast_opt) { - gsreqp = (struct group_source_req *)i1; - if (gsreqp->gsr_group.ss_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - sin = (struct sockaddr_in *)&gsreqp->gsr_group; - grp = (ipaddr_t)sin->sin_addr.s_addr; - sin = (struct sockaddr_in *)&gsreqp->gsr_source; - src = (ipaddr_t)sin->sin_addr.s_addr; - ifindex = gsreqp->gsr_interface; - } else { - imreqp = (struct ip_mreq_source *)i1; - grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; - src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; - ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; - } - /* - * In the multirouting case, we need to replicate - * the request as noted in the mcast cases above. - */ - ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership( - optfn, ire, connp, checkonly, grp, - fmode, src, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, grp, ifaddr, - &ifindex, fmode, src, first_mp); - } - if (error != 0) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. 
- */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case IP_SEC_OPT: - error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IP_HDRINCL: - case IP_OPTIONS: - case T_IP_OPTIONS: - case IP_TOS: - case T_IP_TOS: - case IP_TTL: - case IP_RECVDSTADDR: - case IP_RECVOPTS: - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_RECVIF: - /* Retrieve the inbound interface index */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_recvif = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IP_RECVPKTINFO: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IP_RECVSLLA: - /* Retrieve the source link layer address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_recvslla = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case MRT_INIT: - case MRT_DONE: - case MRT_ADD_VIF: - case MRT_DEL_VIF: - case MRT_ADD_MFC: - case MRT_DEL_MFC: - case MRT_ASSERT: - if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { - *outlenp = 0; - return (error); - } - error = ip_mrouter_set((int)name, q, checkonly, - (uchar_t *)invalp, inlen, first_mp); - if (error) { - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_BOUND_IF: - case IP_DHCPINIT_IF: - error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, - level, name, first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - - case IP_UNSPEC_SRC: - /* Allow sending with a zero source address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_unspec_src = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } + case IP_ADD_SOURCE_MEMBERSHIP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_JOIN_SOURCE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_add_group; break; - case IPPROTO_IPV6: - switch (name) { - case IPV6_BOUND_IF: - error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, - level, name, first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_IF: - /* - * The only possible errors are EINPROGRESS and - * EINVAL. EINPROGRESS will be restarted and is not - * a hard error. We call this option on both V4 and V6 - * If both return EINVAL, then this call returns - * EINVAL. If at least one of them succeeds we - * return success. 
- */ - found = B_FALSE; - error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, - level, name, first_mp); - if (error == EINPROGRESS) - return (error); - if (error == 0) - found = B_TRUE; - error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, - IPPROTO_IP, IP_MULTICAST_IF, first_mp); - if (error == 0) - found = B_TRUE; - if (!found) - return (error); - break; /* goto sizeof (int) option return */ - - case IPV6_MULTICAST_HOPS: - /* Recorded in transport above IP */ - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_LOOP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_multicast_loop = *i1; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_JOIN_GROUP: - case MCAST_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_LEAVE_GROUP: { - struct ipv6_mreq *ip_mreqp; - struct group_req *greqp; - ire_t *ire; - boolean_t done = B_FALSE; - in6_addr_t groupv6; - uint32_t ifindex; - boolean_t mcast_opt = B_TRUE; - mcast_record_t fmode; - int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, - int, mcast_record_t, const in6_addr_t *, mblk_t *); - - switch (name) { - case IPV6_JOIN_GROUP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_GROUP: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group_v6; - break; - - case IPV6_LEAVE_GROUP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group_v6; - break; - } + case IP_DROP_SOURCE_MEMBERSHIP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_LEAVE_SOURCE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_delete_group; + break; + default: + ASSERT(0); + } - if (mcast_opt) { - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - greqp = (struct group_req *)i1; - if (greqp->gr_group.ss_family == AF_INET) { - sin = (struct sockaddr_in *) - &(greqp->gr_group); - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, - &groupv6); - } else { - sin6 = (struct sockaddr_in6 *) - 
&(greqp->gr_group); - groupv6 = sin6->sin6_addr; - } - ifindex = greqp->gr_interface; - } else { - ip_mreqp = (struct ipv6_mreq *)i1; - groupv6 = ip_mreqp->ipv6mr_multiaddr; - ifindex = ip_mreqp->ipv6mr_interface; - } - /* - * In the multirouting case, we need to replicate - * the request on all interfaces that will take part - * in replication. We do so because multirouting is - * reflective, thus we will probably receive multi- - * casts on those interfaces. - * The ip_multirt_apply_membership_v6() succeeds if - * the operation succeeds on at least one interface. - */ - ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership_v6( - optfn, ire, connp, checkonly, - &groupv6, fmode, &ipv6_all_zeros, - first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, &groupv6, - ifindex, fmode, &ipv6_all_zeros, first_mp); - } - if (error) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. 
- */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: { - struct group_source_req *gsreqp; - in6_addr_t v6grp, v6src; - uint32_t ifindex; - mcast_record_t fmode; - ire_t *ire; - boolean_t done = B_FALSE; - int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, - int, mcast_record_t, const in6_addr_t *, mblk_t *); - - switch (name) { - case MCAST_BLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group_v6; - break; - case MCAST_UNBLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_delete_group_v6; - break; - case MCAST_JOIN_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_add_group_v6; - break; - case MCAST_LEAVE_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group_v6; - break; - } + if (mcast_opt) { + gsreqp = (struct group_source_req *)i1; + ifindex = gsreqp->gsr_interface; + if (gsreqp->gsr_group.ss_family == AF_INET) { + struct sockaddr_in *s; + s = (struct sockaddr_in *)&gsreqp->gsr_group; + IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group); + s = (struct sockaddr_in *)&gsreqp->gsr_source; + IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); + } else { + struct sockaddr_in6 *s6; - gsreqp = (struct group_source_req *)i1; - ifindex = gsreqp->gsr_interface; - if (gsreqp->gsr_group.ss_family == AF_INET) { - struct sockaddr_in *s; - s = (struct sockaddr_in *)&gsreqp->gsr_group; - IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); - s = (struct sockaddr_in *)&gsreqp->gsr_source; - IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); - } else { - struct sockaddr_in6 *s6; - s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; - v6grp = s6->sin6_addr; - s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; - v6src = 
s6->sin6_addr; - } + if (!inet6) + return (EINVAL); /* Not on INET socket */ - /* - * In the multirouting case, we need to replicate - * the request as noted in the mcast cases above. - */ - ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership_v6( - optfn, ire, connp, checkonly, - &v6grp, fmode, &v6src, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, &v6grp, - ifindex, fmode, &v6src, first_mp); - } - if (error != 0) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. - */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); + s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; + v6group = s6->sin6_addr; + s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; + v6src = s6->sin6_addr; } - case IPV6_UNICAST_HOPS: - /* Recorded in transport above IP */ - break; /* goto sizeof (int) option return */ - case IPV6_UNSPEC_SRC: - /* Allow sending with a zero source address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_unspec_src = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ip_recvpktinfo = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - if (!checkonly) { - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvtclass = *i1; - mutex_exit(&connp->conn_lock); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvpathmtu = *i1; - mutex_exit(&connp->conn_lock); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: - if (inlen == 0) - return (-EINVAL); /* clearing option */ - error = ip6_set_pktinfo(cr, connp, - (struct in6_pktinfo *)invalp); - if (error != 0) - *outlenp = 0; - else - *outlenp = inlen; - return (error); - case IPV6_NEXTHOP: { - struct sockaddr_in6 *sin6; - - /* Verify that the nexthop is reachable */ - if (inlen == 0) - return (-EINVAL); /* clearing option */ + ifaddr = INADDR_ANY; + } else { + imreqp = (struct ip_mreq_source *)i1; + IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group); + IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src); + ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; + ifindex = 0; + } - sin6 = (struct sockaddr_in6 *)invalp; - ire = ire_route_lookup_v6(&sin6->sin6_addr, - 0, 0, 0, NULL, NULL, connp->conn_zoneid, - NULL, MATCH_IRE_DEFAULT, ipst); + /* + * Handle src being mapped INADDR_ANY by changing it to unspecified. + */ + if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src)) + v6src = ipv6_all_zeros; - if (ire == NULL) { - *outlenp = 0; - return (EHOSTUNREACH); - } - ire_refrele(ire); - return (-EINVAL); - } - case IPV6_SEC_OPT: - error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IPV6_SRC_PREFERENCES: { - /* - * This is implemented strictly in the ip module - * (here and in tcp_opt_*() to accomodate tcp - * sockets). Modules above ip pass this option - * down here since ip is the only one that needs to - * be aware of source address preferences. - * - * This socket option only affects connected - * sockets that haven't already bound to a specific - * IPv6 address. In other words, sockets that - * don't call bind() with an address other than the - * unspecified address and that call connect(). - * ip_bind_connected_v6() passes these preferences - * to the ipif_select_source_v6() function. 
- */ - if (inlen != sizeof (uint32_t)) - return (EINVAL); - error = ip6_set_src_preferences(connp, - *(uint32_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } else { - *outlenp = sizeof (uint32_t); - } - break; - } - case IPV6_V6ONLY: - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_v6only = *i1; - mutex_exit(&connp->conn_lock); - break; - default: - return (-EINVAL); - } - break; - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } /* - * Common case of return from an option that is sizeof (int) + * In the multirouting case, we need to replicate + * the request as noted in the mcast cases above. */ - *(int *)outvalp = *i1; - *outlenp = sizeof (int); - return (0); -} + if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + ipaddr_t group; -/* - * This routine gets default values of certain options whose default - * values are maintained by protocol specific code - */ -/* ARGSUSED */ -int -ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) -{ - int *i1 = (int *)ptr; - ip_stack_t *ipst = CONNQ_TO_IPST(q); + IN6_V4MAPPED_TO_IPADDR(&v6group, group); - switch (level) { - case IPPROTO_IP: - switch (name) { - case IP_MULTICAST_TTL: - *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; - return (sizeof (uchar_t)); - case IP_MULTICAST_LOOP: - *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; - return (sizeof (uchar_t)); - default: - return (-1); - } - case IPPROTO_IPV6: - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = ipst->ips_ipv6_def_hops; - return (sizeof (int)); - case IPV6_MULTICAST_HOPS: - *i1 = IP_DEFAULT_MULTICAST_TTL; - return (sizeof (int)); - case IPV6_MULTICAST_LOOP: - *i1 = IP_DEFAULT_MULTICAST_LOOP; - return (sizeof (int)); - case IPV6_V6ONLY: - *i1 = 1; - return (sizeof (int)); - default: - return (-1); + ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + 
MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } else { + ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } + if (ire != NULL) { + if (ire->ire_flags & RTF_MULTIRT) { + error = ip_multirt_apply_membership(optfn, ire, connp, + checkonly, &v6group, fmode, &v6src); + done = B_TRUE; } - default: - return (-1); + ire_refrele(ire); } - /* NOTREACHED */ + if (!done) { + error = optfn(connp, checkonly, &v6group, ifaddr, ifindex, + fmode, &v6src); + } + return (error); } /* * Given a destination address and a pointer to where to put the information * this routine fills in the mtuinfo. + * The socket must be connected. + * For sctp conn_faddr is the primary address. */ int -ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, - struct ip6_mtuinfo *mtuinfo, netstack_t *ns) +ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo) { - ire_t *ire; - ip_stack_t *ipst = ns->netstack_ip; + uint32_t pmtu = IP_MAXPACKET; + uint_t scopeid; - if (IN6_IS_ADDR_UNSPECIFIED(in6)) + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) return (-1); + /* In case we never sent or called ip_set_destination_v4/v6 */ + if (ixa->ixa_ire != NULL) + pmtu = ip_get_pmtu(ixa); + + if (ixa->ixa_flags & IXAF_SCOPEID_SET) + scopeid = ixa->ixa_scopeid; + else + scopeid = 0; + bzero(mtuinfo, sizeof (*mtuinfo)); mtuinfo->ip6m_addr.sin6_family = AF_INET6; - mtuinfo->ip6m_addr.sin6_port = port; - mtuinfo->ip6m_addr.sin6_addr = *in6; + mtuinfo->ip6m_addr.sin6_port = connp->conn_fport; + mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6; + mtuinfo->ip6m_addr.sin6_scope_id = scopeid; + mtuinfo->ip6m_mtu = pmtu; - ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); - if (ire != NULL) { - mtuinfo->ip6m_mtu = ire->ire_max_frag; - ire_refrele(ire); - } else { - mtuinfo->ip6m_mtu = IPV6_MIN_MTU; - } return (sizeof (struct ip6_mtuinfo)); } -/* - * This routine gets socket 
options. For MRT_VERSION and MRT_ASSERT, error - * checking of cred and that ip_g_mrouter is set should be done and - * isn't. This doesn't matter as the error checking is done properly for the - * other MRT options coming in through ip_opt_set. - */ -int -ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) -{ - conn_t *connp = Q_TO_CONN(q); - ipsec_req_t *req = (ipsec_req_t *)ptr; - - switch (level) { - case IPPROTO_IP: - switch (name) { - case MRT_VERSION: - case MRT_ASSERT: - (void) ip_mrouter_get(name, q, ptr); - return (sizeof (int)); - case IP_SEC_OPT: - return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); - case IP_NEXTHOP: - if (connp->conn_nexthop_set) { - *(ipaddr_t *)ptr = connp->conn_nexthop_v4; - return (sizeof (ipaddr_t)); - } else - return (0); - case IP_RECVPKTINFO: - *(int *)ptr = connp->conn_ip_recvpktinfo ? 1: 0; - return (sizeof (int)); - default: - break; - } - break; - case IPPROTO_IPV6: - switch (name) { - case IPV6_SEC_OPT: - return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); - case IPV6_SRC_PREFERENCES: { - return (ip6_get_src_preferences(connp, - (uint32_t *)ptr)); - } - case IPV6_V6ONLY: - *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; - return (sizeof (int)); - case IPV6_PATHMTU: - return (ip_fill_mtuinfo(&connp->conn_remv6, 0, - (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); - default: - break; - } - break; - default: - break; - } - return (-1); -} /* Named Dispatch routine to get a current value out of our parameter table. */ /* ARGSUSED */ static int @@ -11955,130 +7137,18 @@ ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill, } /* - * ipsec processing for the fast path, used for input UDP Packets - * Returns true if ready for passup to UDP. - * Return false if packet is not passable to UDP (e.g. it failed IPsec policy, - * was an ESP-in-UDP packet, etc.). 
- */ -static boolean_t -ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, - mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) -{ - uint32_t ill_index; - uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - udp_t *udp = connp->conn_udp; - - ASSERT(ipha->ipha_protocol == IPPROTO_UDP); - /* The ill_index of the incoming ILL */ - ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; - - /* pass packet up to the transport */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, - NULL, mctl_present); - if (*first_mpp == NULL) { - return (B_FALSE); - } - } - - /* Initiate IPPF processing for fastpath UDP */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, mpp, ill_index); - if (*mpp == NULL) { - ip2dbg(("ip_input_ipsec_process: UDP pkt " - "deferred/dropped during IPPF processing\n")); - return (B_FALSE); - } - } - /* - * Remove 0-spi if it's 0, or move everything behind - * the UDP header over it and forward to ESP via - * ip_proto_input(). - */ - if (udp->udp_nat_t_endpoint) { - if (mctl_present) { - /* mctl_present *shouldn't* happen. */ - ip_drop_packet(*first_mpp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), - &ipss->ipsec_dropper); - *first_mpp = NULL; - return (B_FALSE); - } - - /* "ill" is "recv_ill" in actuality. */ - if (!zero_spi_check(q, *mpp, ire, ill, ipss)) - return (B_FALSE); - - /* Else continue like a normal UDP packet. 
*/ - } - - /* - * We make the checks as below since we are in the fast path - * and want to minimize the number of checks if the IP_RECVIF and/or - * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set - */ - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - if (connp->conn_recvif) { - in_flags = IPF_RECVIF; - } - /* - * UDP supports IP_RECVPKTINFO option for both v4 and v6 - * so the flag passed to ip_add_info is based on IP version - * of connp. - */ - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - /* - * since in_flags are being set ill will be - * referenced in ip_add_info, so it better not - * be NULL. - */ - /* - * the actual data will be contained in b_cont - * upon successful return of the following call. - * If the call fails then the original mblk is - * returned. - */ - *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), - ipst); - } - - return (B_TRUE); -} - -/* * Fragmentation reassembly. Each ILL has a hash table for * queuing packets undergoing reassembly for all IPIFs * associated with the ILL. The hash is based on the packet * IP ident field. The ILL frag hash table was allocated * as a timer block at the time the ILL was created. Whenever * there is anything on the reassembly queue, the timer will - * be running. Returns B_TRUE if successful else B_FALSE; - * frees mp on failure. + * be running. Returns the reassembled packet if reassembly completes. 
*/ -static boolean_t -ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, - uint32_t *cksum_val, uint16_t *cksum_flags) +mblk_t * +ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { uint32_t frag_offset_flags; - mblk_t *mp = *mpp; mblk_t *t_mp; ipaddr_t dst; uint8_t proto = ipha->ipha_protocol; @@ -12099,12 +7169,8 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, uint8_t ecn_info = 0; uint32_t packet_size; boolean_t pruned = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - if (cksum_val != NULL) - *cksum_val = 0; - if (cksum_flags != NULL) - *cksum_flags = 0; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; /* * Drop the fragmented as early as possible, if @@ -12112,13 +7178,13 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, */ if (ipst->ips_ip_reass_queue_bytes == 0) { freemsg(mp); - return (B_FALSE); + return (NULL); } /* Check for fragmentation offset; return if there's none */ if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & (IPH_MF | IPH_OFFSET)) == 0) - return (B_TRUE); + return (mp); /* * We utilize hardware computed checksum info only for UDP since @@ -12126,8 +7192,9 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(recv_ill != NULL); - if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && + ASSERT(ira->ira_rill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && + ILL_HCKSUM_CAPABLE(ira->ira_rill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -12178,7 +7245,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, /* If end == 0 then we have a packet with no data, so just free it */ if (end == 0) { freemsg(mp); - return (B_FALSE); + return (NULL); } /* Record the ECN field info. */ @@ -12192,16 +7259,25 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, end += offset; } - msg_len = MBLKSIZE(mp); + /* Handle vnic loopback of fragments */ + if (mp->b_datap->db_ref > 2) + msg_len = 0; + else + msg_len = MBLKSIZE(mp); + tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; - msg_len += MBLKSIZE(tail_mp); + if (tail_mp->b_datap->db_ref <= 2) + msg_len += MBLKSIZE(tail_mp); } /* If the reassembly list for this ILL will get too big, prune it */ if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= ipst->ips_ip_reass_queue_bytes) { + DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len, + uint_t, ill->ill_frag_count, + uint_t, ipst->ips_ip_reass_queue_bytes); ill_frag_prune(ill, (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : (ipst->ips_ip_reass_queue_bytes - msg_len)); @@ -12232,7 +7308,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, ill_frag_free_pkts(ill, ipfb, ipf, 1); freemsg(mp); mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); + return (NULL); } /* Found it. 
*/ break; @@ -12254,7 +7330,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, if (pruned && offset != 0) { mutex_exit(&ipfb->ipfb_lock); freemsg(mp); - return (B_FALSE); + return (NULL); } if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { @@ -12269,10 +7345,11 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, mp1 = allocb(sizeof (*ipf), BPRI_MED); if (mp1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); reass_done: mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); + return (NULL); } BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); @@ -12478,19 +7555,22 @@ reass_done: /* Restore original IP length in header. */ packet_size = (uint32_t)msgdsize(mp); if (packet_size > IP_MAXPACKET) { - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - return (B_FALSE); + ip_drop_input("Reassembled packet too large", mp, ill); + freemsg(mp); + return (NULL); } if (DB_REF(mp) > 1) { mblk_t *mp2 = copymsg(mp); - freemsg(mp); if (mp2 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (B_FALSE); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return (NULL); } + freemsg(mp); mp = mp2; } ipha = (ipha_t *)mp->b_rptr; @@ -12501,1187 +7581,239 @@ reass_done: /* Record the ECN info. 
*/ ipha->ipha_type_of_service &= 0xFC; ipha->ipha_type_of_service |= ecn_info; - *mpp = mp; - /* Reassembly is successful; return checksum information if needed */ - if (cksum_val != NULL) - *cksum_val = sum_val; - if (cksum_flags != NULL) - *cksum_flags = sum_flags; + /* Update the receive attributes */ + ira->ira_pktlen = packet_size; + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); - return (B_TRUE); + /* Reassembly is successful; set checksum information in packet */ + DB_CKSUM16(mp) = (uint16_t)sum_val; + DB_CKSUMFLAGS(mp) = sum_flags; + DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length; + + return (mp); } /* - * Perform ip header check sum update local options. - * return B_TRUE if all is well, else return B_FALSE and release - * the mp. caller is responsible for decrementing ire ref cnt. + * Pullup function that should be used for IP input in order to + * ensure we do not loose the L2 source address; we need the l2 source + * address for IP_RECVSLLA and for ndp_input. + * + * We return either NULL or b_rptr. */ -static boolean_t -ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ip_stack_t *ipst) +void * +ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira) { - mblk_t *first_mp; - boolean_t mctl_present; - uint16_t sum; + ill_t *ill = ira->ira_ill; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - /* - * Don't do the checksum if it has gone through AH/ESP - * processing. 
- */ - if (!mctl_present) { - sum = ip_csum_hdr(ipha); - if (sum != 0) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsInCksumErrs); - } - freemsg(first_mp); - return (B_FALSE); - } + if (ip_rput_pullups++ == 0) { + (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE, + "ip_pullup: %s forced us to " + " pullup pkt, hdr len %ld, hdr addr %p", + ill->ill_name, len, (void *)mp->b_rptr); } - - if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) { - if (mctl_present) - freeb(first_mp); - return (B_FALSE); - } - - return (B_TRUE); + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + if (!pullupmsg(mp, len)) + return (NULL); + else + return (mp->b_rptr); } /* - * All udp packet are delivered to the local host via this routine. + * Make sure ira_l2src has an address. If we don't have one fill with zeros. + * When called from the ULP ira_rill will be NULL hence the caller has to + * pass in the ill. 
*/ +/* ARGSUSED */ void -ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ill_t *recv_ill) +ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill) { - uint32_t sum; - uint32_t u1; - boolean_t mctl_present; - conn_t *connp; - mblk_t *first_mp; - uint16_t *up; - ill_t *ill = (ill_t *)q->q_ptr; - uint16_t reass_hck_flags = 0; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; + const uchar_t *addr; + int alen; -#define rptr ((uchar_t *)ipha) + if (ira->ira_flags & IRAF_L2SRC_SET) + return; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); - ASSERT(ipha->ipha_protocol == IPPROTO_UDP); ASSERT(ill != NULL); - - /* - * FAST PATH for udp packets - */ - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1 != 0) - goto ipoptions; - - /* Check the IP header checksum. */ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { - /* Clear the IP header h/w cksum flag */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - } else if (!mctl_present) { - /* - * Don't verify header checksum if this packet is coming - * back from AH/ESP as we already did it. - */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + - uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - freemsg(first_mp); - return; - } - } - - /* - * Count for SNMP of inbound packets for ire. - * if mctl is present this might be a secure packet and - * has already been counted for in ip_proto_input(). 
- */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; + alen = ill->ill_phys_addr_length; + ASSERT(alen <= sizeof (ira->ira_l2src)); + if (ira->ira_mhip != NULL && + (addr = ira->ira_mhip->mhi_saddr) != NULL) { + bcopy(addr, ira->ira_l2src, alen); + } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) && + (addr = ill->ill_phys_addr) != NULL) { + bcopy(addr, ira->ira_l2src, alen); + } else { + bzero(ira->ira_l2src, alen); } + ira->ira_flags |= IRAF_L2SRC_SET; +} - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { - goto fragmented; - } +/* + * check ip header length and align it. + */ +mblk_t * +ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ssize_t len; - /* u1 = IP header length (20 bytes) */ - u1 = IP_SIMPLE_HDR_LENGTH; + len = MBLKL(mp); - /* packet does not contain complete IP & UDP headers */ - if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) - goto udppullup; + if (!OK_32PTR(mp->b_rptr)) + IP_STAT(ill->ill_ipst, ip_notaligned); + else + IP_STAT(ill->ill_ipst, ip_recv_pullup); - /* up points to UDP header */ - up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); -#define iphs ((uint16_t *)ipha) + /* Guard against bogus device drivers */ + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + return (NULL); + } - /* if udp hdr cksum != 0, then need to checksum udp packet */ - if (up[3] != 0) { + if (len == 0) { + /* GLD sometimes sends up mblk with b_rptr == b_wptr! 
*/ mblk_t *mp1 = mp->b_cont; - boolean_t cksum_err; - uint16_t hck_flags = 0; - /* Pseudo-header checksum */ - u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + - iphs[9] + up[2]; + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); - /* - * Revert to software checksum calculation if the interface - * isn't capable of checksum offload or if IPsec is present. - */ - if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, u1, - (uchar_t *)(rptr + DB_CKSUMSTART(mp)), - (int32_t)((uchar_t *)up - rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); - if (hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_udp_in_sw_cksum_err); + freeb(mp); + mp = mp1; + if (mp == NULL) + return (NULL); - freemsg(first_mp); - return; - } + if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size) + return (mp); } - - /* Non-fragmented broadcast or multicast packet? 
*/ - if (ire->ire_type == IRE_BROADCAST) - goto udpslowpath; - - if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, - ire->ire_zoneid, ipst)) != NULL) { - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - IP_STAT(ipst, ip_udp_fast_path); - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + if (ip_pullup(mp, min_size, ira) == NULL) { + if (msgdsize(mp) < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); } else { - if (!mctl_present) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - } - /* - * mp and first_mp can change. - */ - if (ip_udp_check(q, connp, recv_ill, - ipha, &mp, &first_mp, mctl_present, ire)) { - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - } - } - /* - * freeb() cannot deal with null mblk being passed - * in and first_mp can be set to null in the call - * ipsec_input_fast_proc()->ipsec_check_inbound_policy. - */ - if (mctl_present && first_mp != NULL) { - freeb(first_mp); - } - CONN_DEC_REF(connp); - return; - } - - /* - * if we got here we know the packet is not fragmented and - * has no options. The classifier could not find a conn_t and - * most likely its an icmp packet so send it through slow path. - */ - - goto udpslowpath; - -ipoptions: - if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { - goto slow_done; - } - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - /* - * "sum" and "reass_hck_flags" are non-zero if the - * reassembled packet has a valid hardware computed - * checksum information associated with it. 
- */ - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, - &reass_hck_flags)) { - goto slow_done; - } - - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - /* Pull up the UDP header, if necessary. */ - if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { -udppullup: - if (!pullupmsg(mp, u1 + UDPH_SIZE)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - goto slow_done; - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * Validate the checksum for the reassembled packet; for the - * pullup case we calculate the payload checksum in software. - */ - up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); - if (up[3] != 0) { - boolean_t cksum_err; - - if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV_REASS(reass_hck_flags, - (int32_t)((uchar_t *)up - (uchar_t *)ipha), - IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + - iphs[9] + up[2], sum, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); - - if (reass_hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); - else if (reass_hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_udp_in_sw_cksum_err); - - freemsg(first_mp); - goto slow_done; + ip_drop_input("ipIfStatsInDiscards", mp, ill); } + freemsg(mp); + return (NULL); } -udpslowpath: - - /* Clear hardware checksum flag to be safe */ - DB_CKSUMFLAGS(mp) = 0; - - ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, - (ire->ire_type == IRE_BROADCAST), - IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO, - mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); - -slow_done: - IP_STAT(ipst, ip_udp_slow_path); - return; - -#undef iphs -#undef rptr -} - 
-static boolean_t -ip_iptun_input(mblk_t *ipsec_mp, mblk_t *data_mp, ipha_t *ipha, ill_t *ill, - ire_t *ire, ip_stack_t *ipst) -{ - conn_t *connp; - - ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp); - - if ((connp = ipcl_classify_v4(data_mp, ipha->ipha_protocol, - IP_SIMPLE_HDR_LENGTH, ire->ire_zoneid, ipst)) != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp, - NULL); - CONN_DEC_REF(connp); - return (B_TRUE); - } - return (B_FALSE); + return (mp); } -/* ARGSUSED */ -static mblk_t * -ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, - ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, - ill_rx_ring_t *ill_ring) +/* + * Common code for IPv4 and IPv6 to check and pullup multi-mblks + */ +mblk_t * +ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len, + uint_t min_size, ip_recv_attr_t *ira) { - conn_t *connp; - uint32_t sum; - uint32_t u1; - uint16_t *up; - int offset; - ssize_t len; - mblk_t *mp1; - boolean_t syn_present = B_FALSE; - tcph_t *tcph; - uint_t tcph_flags; - uint_t ip_hdr_len; - ill_t *ill = (ill_t *)q->q_ptr; - zoneid_t zoneid = ire->ire_zoneid; - boolean_t cksum_err; - uint16_t hck_flags = 0; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - -#define rptr ((uchar_t *)ipha) - - ASSERT(ipha->ipha_protocol == IPPROTO_TCP); - ASSERT(ill != NULL); - - /* - * FAST PATH for tcp packets - */ - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1) { - goto ipoptions; - } else if (!mctl_present) { - /* Check the IP header checksum. 
*/ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { - /* Clear the IP header h/w cksum flag */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - } else if (!mctl_present) { - /* - * Don't verify header checksum if this packet - * is coming back from AH/ESP as we already did it. - */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInCksumErrs); - goto error; - } - } - } - - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { - goto fragmented; - } - - /* u1 = IP header length (20 bytes) */ - u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; - - /* does packet contain IP+TCP headers? */ - len = mp->b_wptr - rptr; - if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { - IP_STAT(ipst, ip_tcppullup); - goto tcppullup; - } - - /* TCP options present? */ - offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; - - /* - * If options need to be pulled up, then goto tcpoptions. - * otherwise we are still in the fast path - */ - if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { - IP_STAT(ipst, ip_tcpoptions); - goto tcpoptions; - } - - /* multiple mblks of tcp data? 
*/ - if ((mp1 = mp->b_cont) != NULL) { - IP_STAT(ipst, ip_multipkttcp); - len += msgdsize(mp1); - } - - up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); - - /* part of pseudo checksum */ - - /* TCP datagram length */ - u1 = len - IP_SIMPLE_HDR_LENGTH; - -#define iphs ((uint16_t *)ipha) - -#ifdef _BIG_ENDIAN - u1 += IPPROTO_TCP; -#else - u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); -#endif - u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; - - /* - * Revert to software checksum calculation if the interface - * isn't capable of checksum offload or if IPsec is present. - */ - if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, u1, - (uchar_t *)(rptr + DB_CKSUMSTART(mp)), - (int32_t)((uchar_t *)up - rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - - if (hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_tcp_in_sw_cksum_err); - - goto error; - } - -try_again: - - if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, - zoneid, ipst)) == NULL) { - /* Send the TH_RST */ - goto no_conn; - } - - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG); + ill_t *ill = ira->ira_ill; /* - * TCP FAST PATH for AF_INET socket. - * - * TCP fast path to avoid extra work. An AF_INET socket type - * does not have facility to receive extra information via - * ip_process or ip_add_info. Also, when the connection was - * established, we made a check if this connection is impacted - * by any global IPsec policy or per connection policy (a - * policy that comes in effect later will not apply to this - * connection). 
Since all this can be determined at the - * connection establishment time, a quick check of flags - * can avoid extra work. + * Make sure we have data length consistent + * with the IP header. */ - if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && - !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ASSERT(first_mp == mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (tcph_flags != (TH_SYN | TH_ACK)) { - SET_SQUEUE(mp, tcp_rput_data, connp); - return (mp); - } - mp->b_datap->db_struioflag |= STRUIO_CONNECT; - DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); - SET_SQUEUE(mp, tcp_input, connp); - return (mp); - } - - if (tcph_flags == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = - (intptr_t)ip_squeue_get(ill_ring); - if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - SET_SQUEUE(mp, connp->conn_recv, connp); - return (mp); - } else if (IPCL_IS_BOUND(connp) && !mctl_present && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - ip_squeue_enter_unbound++; - SET_SQUEUE(mp, tcp_conn_request_unbound, - connp); - return (mp); - } - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* No need to send this packet to TCP */ - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return (NULL); - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - return (NULL); - } - - CONN_DEC_REF(connp); - freemsg(first_mp); - return (NULL); - } - - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - 
ipha, NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); + if (mp->b_cont == NULL) { + /* pkt_len is based on ipha_len, not the mblk length */ + if (pkt_len < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); return (NULL); } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. - */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_input_ipsec_process: TCP pkt " - "deferred/dropped during IPPF processing\n")); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); return (NULL); - } else if (mctl_present) { - /* - * ip_process might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } - - } - - if (!syn_present && connp->conn_ip_recvpktinfo) { - /* - * TCP does not support IP_RECVPKTINFO for v4 so lets - * make sure IPF_RECVIF is passed to ip_add_info. 
- */ - mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, - IPCL_ZONEID(connp), ipst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); + /* Drop any pad */ + mp->b_wptr = rptr + pkt_len; + } else if ((len += msgdsize(mp->b_cont)) != 0) { + ASSERT(pkt_len >= min_size); + if (pkt_len < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); return (NULL); - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } - } - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - SET_SQUEUE(first_mp, connp->conn_recv, connp); - return (first_mp); - } else { - /* SOCK_RAW, IPPROTO_TCP case */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); - return (NULL); - } - -no_conn: - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) { + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); return (NULL); } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - return (NULL); -ipoptions: - if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { - goto slow_done; - } - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { - if (mctl_present) - freeb(first_mp); - goto slow_done; - } - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); - - len = mp->b_wptr - mp->b_rptr; - /* Pull up a minimal TCP header, if necessary. */ - if (len < (u1 + 20)) { -tcppullup: - if (!pullupmsg(mp, u1 + 20)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - mp->b_rptr; - } - - /* - * Extract the offset field from the TCP header. As usual, we - * try to help the compiler more than the reader. - */ - offset = ((uchar_t *)ipha)[u1 + 12] >> 4; - if (offset != 5) { -tcpoptions: - if (offset < 5) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - /* - * There must be TCP options. - * Make sure we can grab them. 
- */ - offset <<= 2; - offset += u1; - if (len < offset) { - if (!pullupmsg(mp, offset)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - } - } - - /* Get the total packet length in len, including headers. */ - if (mp->b_cont) - len = msgdsize(mp); - - /* - * Check the TCP checksum by pulling together the pseudo- - * header checksum, and passing it to ip_csum to be added in - * with the TCP datagram. - * - * Since we are not using the hwcksum if available we must - * clear the flag. We may come here via tcppullup or tcpoptions. - * If either of these fails along the way the mblk is freed. - * If this logic ever changes and mblk is reused to say send - * ICMP's back, then this flag may need to be cleared in - * other places as well. - */ - DB_CKSUMFLAGS(mp) = 0; - - up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); - - u1 = (uint32_t)(len - u1); /* TCP datagram length. */ -#ifdef _BIG_ENDIAN - u1 += IPPROTO_TCP; -#else - u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); -#endif - u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; - /* - * Not M_DATA mblk or its a dup, so do the checksum now. 
- */ - IP_STAT(ipst, ip_in_sw_cksum); - if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - goto error; - } - - IP_STAT(ipst, ip_tcp_slow_path); - goto try_again; -#undef iphs -#undef rptr - -error: - freemsg(first_mp); -slow_done: - return (NULL); -} - -/* ARGSUSED */ -static void -ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, - ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) -{ - conn_t *connp; - uint32_t sum; - uint32_t u1; - ssize_t len; - sctp_hdr_t *sctph; - zoneid_t zoneid = ire->ire_zoneid; - uint32_t pktsum; - uint32_t calcsum; - uint32_t ports; - in6_addr_t map_src, map_dst; - ill_t *ill = (ill_t *)q->q_ptr; - ip_stack_t *ipst; - sctp_stack_t *sctps; - boolean_t sctp_csum_err = B_FALSE; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - sctps = ipst->ips_netstack->netstack_sctp; - -#define rptr ((uchar_t *)ipha) - - ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); - ASSERT(ill != NULL); - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1 > 0) { - goto ipoptions; - } else { - /* Check the IP header checksum. */ - if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) && - !mctl_present) { -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - /* - * Don't verify header checksum if this packet - * is coming back from AH/ESP as we already did it. - */ - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - goto error; - } - } + /* Drop any pad */ + (void) adjmsg(mp, -len); /* - * Since there is no SCTP h/w cksum support yet, just - * clear the flag. 
+ * adjmsg may have freed an mblk from the chain, hence + * invalidate any hw checksum here. This will force IP to + * calculate the checksum in sw, but only for this packet. */ DB_CKSUMFLAGS(mp) = 0; + IP_STAT(ill->ill_ipst, ip_multimblk); } - - /* - * Don't verify header checksum if this packet is coming - * back from AH/ESP as we already did it. - */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) - goto fragmented; - - /* u1 = IP header length (20 bytes) */ - u1 = IP_SIMPLE_HDR_LENGTH; - -find_sctp_client: - /* Pullup if we don't have the sctp common header. */ - len = MBLKL(mp); - if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { - if (mp->b_cont == NULL || - !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = MBLKL(mp); - } - - sctph = (sctp_hdr_t *)(rptr + u1); -#ifdef DEBUG - if (!skip_sctp_cksum) { -#endif - pktsum = sctph->sh_chksum; - sctph->sh_chksum = 0; - calcsum = sctp_cksum(mp, u1); - sctph->sh_chksum = pktsum; - if (calcsum != pktsum) - sctp_csum_err = B_TRUE; -#ifdef DEBUG /* skip_sctp_cksum */ - } -#endif - /* get the ports */ - ports = *(uint32_t *)&sctph->sh_sport; - - IRE_REFRELE(ire); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); - if (sctp_csum_err) { - /* - * No potential sctp checksum errors go to the Sun - * sctp stack however they might be Adler-32 summed - * packets a userland stack bound to a raw IP socket - * could reasonably use. Note though that Adler-32 is - * a long deprecated algorithm and customer sctp - * networks should eventually migrate to CRC-32 at - * which time this facility should be removed. 
- */ - flags |= IP_FF_SCTP_CSUM_ERR; - goto no_conn; - } - if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, - sctps)) == NULL) { - /* Check for raw socket or OOTB handling */ - goto no_conn; - } - - /* Found a client; up it goes */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); - return; - -no_conn: - ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, - ports, mctl_present, flags, B_TRUE, zoneid); - return; - -ipoptions: - DB_CKSUMFLAGS(mp) = 0; - if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) - goto slow_done; - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) - goto slow_done; - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - goto find_sctp_client; -#undef iphs -#undef rptr - -error: - freemsg(first_mp); -slow_done: - IRE_REFRELE(ire); + return (mp); } -#define VER_BITS 0xF0 -#define VERSION_6 0x60 - -static boolean_t -ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, - ipaddr_t *dstp, ip_stack_t *ipst) +/* + * Check that the IPv4 opt_len is consistent with the packet and pullup + * the options. 
+ */ +mblk_t * +ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len, + ip_recv_attr_t *ira) { - uint_t opt_len; - ipha_t *ipha; + ill_t *ill = ira->ira_ill; ssize_t len; - uint_t pkt_len; - ASSERT(ill != NULL); - IP_STAT(ipst, ip_ipoptions); - ipha = *iphapp; - -#define rptr ((uchar_t *)ipha) /* Assume no IPv6 packets arrive over the IPv4 queue */ - if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); - freemsg(mp); - return (B_FALSE); - } - - /* multiple mblk or too short */ - pkt_len = ntohs(ipha->ipha_length); - - /* Get the number of words of IP options in the IP header. */ - opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; - if (opt_len) { - /* IP Options present! Validate and process. */ - if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto done; - } - /* - * Recompute complete header length and make sure we - * have access to all of it. - */ - len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; - if (len > (mp->b_wptr - rptr)) { - if (len > pkt_len) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto done; - } - if (!pullupmsg(mp, len)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto done; - } - ipha = (ipha_t *)mp->b_rptr; - } - /* - * Go off to ip_rput_options which returns the next hop - * destination address, which may have been affected - * by source routing. - */ - IP_STAT(ipst, ip_opt); - if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (B_FALSE); - } - } - *iphapp = ipha; - return (B_TRUE); -done: - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - freemsg(mp); - return (B_FALSE); -#undef rptr -} - -/* - * Deal with the fact that there is no ire for the destination. 
- */ -static ire_t * -ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) -{ - ipha_t *ipha; - ill_t *ill; - ire_t *ire; - ip_stack_t *ipst; - enum ire_forward_action ret_action; - - ipha = (ipha_t *)mp->b_rptr; - ill = (ill_t *)q->q_ptr; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - /* - * No IRE for this destination, so it can't be for us. - * Unless we are forwarding, drop the packet. - * We have to let source routed packets through - * since we don't yet know if they are 'ping -l' - * packets i.e. if they will go out over the - * same interface as they came in on. - */ - if (ll_multicast) { + ip_drop_input("IPvN packet on IPv4 ill", mp, ill); freemsg(mp); return (NULL); } - if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + + if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); return (NULL); } - /* - * Mark this packet as having originated externally. - * - * For non-forwarding code path, ire_send later double - * checks this interface to see if it is still exists - * post-ARP resolution. - * - * Also, IPQOS uses this to differentiate between - * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP - * QOS packet processing in ip_wput_attach_llhdr(). - * The QoS module can mark the b_band for a fastpath message - * or the dl_priority field in a unitdata_req header for - * CoS marking. This info can only be found in - * ip_wput_attach_llhdr(). + * Recompute complete header length and make sure we + * have access to all of it. 
*/ - mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; - /* - * Clear the indication that this may have a hardware checksum - * as we are not using it - */ - DB_CKSUMFLAGS(mp) = 0; - - ire = ire_forward(dst, &ret_action, NULL, NULL, - msg_getlabel(mp), ipst); - - if (ire == NULL && ret_action == Forward_check_multirt) { - /* Let ip_newroute handle CGTP */ - ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); - return (NULL); - } - - if (ire != NULL) - return (ire); - - mp->b_prev = mp->b_next = 0; - - if (ret_action == Forward_blackhole) { - freemsg(mp); - return (NULL); - } - /* send icmp unreachable */ - q = WR(q); - /* Sent by forwarding path, and router is global zone */ - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, - ipst); - } - - return (NULL); - -} - -/* - * check ip header length and align it. - */ -static boolean_t -ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) -{ - ssize_t len; - ill_t *ill; - ipha_t *ipha; - - len = MBLKL(mp); - - if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { - ill = (ill_t *)q->q_ptr; - - if (!OK_32PTR(mp->b_rptr)) - IP_STAT(ipst, ip_notaligned1); - else - IP_STAT(ipst, ip_notaligned2); - /* Guard against bogus device drivers */ - if (len < 0) { - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; + len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; + if (len > (mp->b_wptr - mp->b_rptr)) { + if (len > pkt_len) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); - return (B_FALSE); - } - - if (ip_rput_pullups++ == 0) { - ipha = (ipha_t *)mp->b_rptr; - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_check_and_align_header: %s forced us to " - " pullup pkt, hdr len %ld, hdr addr %p", - ill->ill_name, len, (void *)ipha); + return (NULL); } - if (!pullupmsg(mp, 
IP_SIMPLE_HDR_LENGTH)) { - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; + if (ip_pullup(mp, len, ira) == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); - return (B_FALSE); + return (NULL); } } - return (B_TRUE); + return (mp); } /* - * Handle the situation where a packet came in on `ill' but matched an IRE - * whose ire_rfq doesn't match `ill'. We return the IRE that should be used - * for interface statistics. + * Returns a new ire, or the same ire, or NULL. + * If a different IRE is returned, then it is held; the caller + * needs to release it. + * In no case is there any hold/release on the ire argument. */ ire_t * ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) @@ -13697,10 +7829,9 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * issue (e.g. packet received on an underlying interface matched an * IRE_LOCAL on its associated group interface). */ - if (ire->ire_rfq != NULL && - IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { + ASSERT(ire->ire_ill != NULL); + if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill)) return (ire); - } /* * Do another ire lookup here, using the ingress ill, to see if the @@ -13711,25 +7842,24 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * ip*_strict_dst_multihoming switch is on. * We also need to check for IPIF_UNNUMBERED point2point interfaces * where the local address may not be unique. In this case we were - * at the mercy of the initial ire cache lookup and the IRE_LOCAL it + * at the mercy of the initial ire lookup and the IRE_LOCAL it * actually returned. The new lookup, which is more specific, should * only find the IRE_LOCAL associated with the ingress ill if one * exists. 
*/ - if (ire->ire_ipversion == IPV4_VERSION) { if (ipst->ips_ip_strict_dst_multihoming) strict_check = B_TRUE; - new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, - ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); + new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0, + IRE_LOCAL, ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL); } else { ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); if (ipst->ips_ipv6_strict_dst_multihoming) strict_check = B_TRUE; - new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, - IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); + new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, + IRE_LOCAL, ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL); } /* * If the same ire that was returned in ip_input() is found then this @@ -13741,38 +7871,27 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * order to have accurate interface statistics. */ if (new_ire != NULL) { - if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { - ire_refrele(ire); - ire = new_ire; - } else { - ire_refrele(new_ire); - } - return (ire); - } else if ((ire->ire_rfq == NULL) && - (ire->ire_ipversion == IPV4_VERSION)) { - /* - * The best match could have been the original ire which - * was created against an IRE_LOCAL on lo0. In the IPv4 case - * the strict multihoming checks are irrelevant as we consider - * local addresses hosted on lo0 to be interface agnostic. We - * only expect a null ire_rfq on IREs which are associated with - * lo0 hence we can return now. - */ + /* Note: held in one case but not the other? Caller handles */ + if (new_ire != ire) + return (new_ire); + /* Unchanged */ + ire_refrele(new_ire); return (ire); } /* * Chase pointers once and store locally. */ - ire_ill = (ire->ire_rfq == NULL) ? 
NULL : - (ill_t *)(ire->ire_rfq->q_ptr); + ASSERT(ire->ire_ill != NULL); + ire_ill = ire->ire_ill; ifindex = ill->ill_usesrc_ifindex; /* * Check if it's a legal address on the 'usesrc' interface. + * For IPMP data addresses the IRE_LOCAL is the upper, hence we + * can just check phyint_ifindex. */ - if ((ifindex != 0) && (ire_ill != NULL) && - (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { + if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) { return (ire); } @@ -13783,905 +7902,234 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) if (!(strict_check)) return (ire); - if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & - ILLF_ROUTER) != 0) { + if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) { return (ire); } - - ire_refrele(ire); return (NULL); } /* + * This function is used to construct a mac_header_info_s from a + * DL_UNITDATA_IND message. + * The address fields in the mhi structure points into the message, + * thus the caller can't use those fields after freeing the message. * - * This is the fast forward path. If we are here, we dont need to - * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup - * needed to find the nexthop in this case is much simpler + * We determine whether the packet received is a non-unicast packet + * and in doing so, determine whether or not it is broadcast vs multicast. + * For it to be a broadcast packet, we must have the appropriate mblk_t + * hanging off the ill_t. If this is either not present or doesn't match + * the destination mac address in the DL_UNITDATA_IND, the packet is deemed + * to be multicast. Thus NICs that have no broadcast address (or no + * capability for one, such as point to point links) cannot return as + * the packet being broadcast. 
*/ -ire_t * -ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) +void +ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip) { - ipha_t *ipha; - ire_t *src_ire; - ill_t *stq_ill; - uint_t hlen; - uint_t pkt_len; - uint32_t sum; - queue_t *dev_q; - ip_stack_t *ipst = ill->ill_ipst; - mblk_t *fpmp; - enum ire_forward_action ret_action; - - ipha = (ipha_t *)mp->b_rptr; - - if (ire != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible to the global - * zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); - /* - * ire_cache_lookup() can return ire of IRE_LOCAL in - * transient cases. In such case, just drop the packet - */ - if (ire != NULL && ire->ire_type != IRE_CACHE) - goto indiscard; - } - - /* - * Martian Address Filtering [RFC 1812, Section 5.3.7] - * The loopback address check for both src and dst has already - * been checked in ip_input - */ - - if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - goto drop; - } - src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (src_ire != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - ire_refrele(src_ire); - goto drop; - } - - /* No ire cache of nexthop. So first create one */ - if (ire == NULL) { - - ire = ire_forward_simple(dst, &ret_action, ipst); + dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; + mblk_t *bmp; + uint_t extra_offset; - /* - * We only come to ip_fast_forward if ip_cgtp_filter - * is not set. So ire_forward() should not return with - * Forward_check_multirt as the next action. 
- */ - ASSERT(ret_action != Forward_check_multirt); - if (ire == NULL) { - /* An attempt was made to forward the packet */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - mp->b_prev = mp->b_next = 0; - /* send icmp unreachable */ - /* Sent by forwarding path, and router is global zone */ - if (ret_action == Forward_ret_icmp_err) { - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(ill->ill_wq, mp, - ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - icmp_unreachable(ill->ill_wq, mp, - ICMP_HOST_UNREACHABLE, - GLOBAL_ZONEID, ipst); - } - } else { - freemsg(mp); - } - return (NULL); - } - } + bzero(mhip, sizeof (struct mac_header_info_s)); - /* - * Forwarding fastpath exception case: - * If any of the following are true, we take the slowpath: - * o forwarding is not enabled - * o incoming and outgoing interface are the same, or in the same - * IPMP group. - * o corresponding ire is in incomplete state - * o packet needs fragmentation - * o ARP cache is not resolved - * - * The codeflow from here on is thus: - * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4 - */ - pkt_len = ntohs(ipha->ipha_length); - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - if (!(stq_ill->ill_flags & ILLF_ROUTER) || - (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || - (ire->ire_nce == NULL) || - (pkt_len > ire->ire_max_frag) || - ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || - ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || - ipha->ipha_ttl <= 1) { - ip_rput_process_forward(ill->ill_rq, mp, ire, - ipha, ill, B_FALSE, B_TRUE); - return (ire); - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST; - DTRACE_PROBE4(ip4__forwarding__start, - ill_t *, ill, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); + if (ill->ill_sap_length < 0) + extra_offset = 0; + else + extra_offset = ill->ill_sap_length; - FW_HOOKS(ipst->ips_ip4_forwarding_event, - 
ipst->ips_ipv4firewall_forwarding, - ill, stq_ill, ipha, mp, mp, 0, ipst); + mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset + + extra_offset; + mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset + + extra_offset; - DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); + if (!ind->dl_group_address) + return; - if (mp == NULL) - goto drop; + /* Multicast or broadcast */ + mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST; - mp->b_datap->db_struioun.cksum.flags = 0; - /* Adjust the checksum to reflect the ttl decrement. */ - sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; - ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); - ipha->ipha_ttl--; + if (ind->dl_dest_addr_offset > sizeof (*ind) && + ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) && + (bmp = ill->ill_bcast_mp) != NULL) { + dl_unitdata_req_t *dlur; + uint8_t *bphys_addr; - /* - * Write the link layer header. We can do this safely here, - * because we have already tested to make sure that the IP - * policy is not set, and that we have a fast path destination - * header. 
- */ - mp->b_rptr -= hlen; - bcopy(fpmp->b_rptr, mp->b_rptr, hlen); - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); - - if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) { - dev_q = ire->ire_stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) - goto indiscard; - } - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, stq_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, - ip6_t *, NULL, int, 0); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; + dlur = (dl_unitdata_req_t *)bmp->b_rptr; + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + extra_offset; - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. - */ - mp->b_rptr += hlen; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= hlen; - } - ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL); + if (bcmp(mhip->mhi_daddr, bphys_addr, + ind->dl_dest_addr_length) == 0) + mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST; } - return (ire); - -indiscard: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); -drop: - if (mp != NULL) - freemsg(mp); - return (ire); - } /* - * This function is called in the forwarding slowpath, when - * either the ire lacks the link-layer address, or the packet needs - * further processing(eg. fragmentation), before transmission. 
+ * This function is used to construct a mac_header_info_s from a + * M_DATA fastpath message from a DLPI driver. + * The address fields in the mhi structure points into the message, + * thus the caller can't use those fields after freeing the message. + * + * We determine whether the packet received is a non-unicast packet + * and in doing so, determine whether or not it is broadcast vs multicast. + * For it to be a broadcast packet, we must have the appropriate mblk_t + * hanging off the ill_t. If this is either not present or doesn't match + * the destination mac address in the DL_UNITDATA_IND, the packet is deemed + * to be multicast. Thus NICs that have no broadcast address (or no + * capability for one, such as point to point links) cannot return as + * the packet being broadcast. */ - -static void -ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) +void +ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip) { - queue_t *dev_q; - ire_t *src_ire; - ip_stack_t *ipst = ill->ill_ipst; - boolean_t same_illgrp = B_FALSE; - - ASSERT(ire->ire_stq != NULL); - - mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ - mp->b_next = NULL; /* ip_rput_noire sets dst here */ + mblk_t *bmp; + struct ether_header *pether; - /* - * If the caller of this function is ip_fast_forward() skip the - * next three checks as it does not apply. - */ - if (from_ip_fast_forward) - goto skip; + bzero(mhip, sizeof (struct mac_header_info_s)); - if (ll_multicast != 0) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto drop_pkt; - } + mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST; - /* - * check if ipha_src is a broadcast address. Note that this - * check is redundant when we get here from ip_fast_forward() - * which has already done this check. 
However, since we can - * also get here from ip_rput_process_broadcast() or, for - * for the slow path through ip_fast_forward(), we perform - * the check again for code-reusability - */ - src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) { - if (src_ire != NULL) - ire_refrele(src_ire); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - ip2dbg(("ip_rput_process_forward: Received packet with" - " bad src/dst address on %s\n", ill->ill_name)); - goto drop_pkt; - } + pether = (struct ether_header *)((char *)mp->b_rptr + - sizeof (struct ether_header)); /* - * Check if we want to forward this one at this time. - * We allow source routed packets on a host provided that - * they go out the same ill or illgrp as they came in on. - * - * XXX To be quicker, we may wish to not chase pointers to - * get the ILLF_ROUTER flag and instead store the - * forwarding policy in the ire. An unfortunate - * side-effect of that would be requiring an ire flush - * whenever the ILLF_ROUTER flag changes. + * Make sure the interface is an ethernet type, since we don't + * know the header format for anything but Ethernet. Also make + * sure we are pointing correctly above db_base. */ -skip: - same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr); - - if (((ill->ill_flags & - ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && - !(ip_source_routed(ipha, ipst) && - (ire->ire_rfq == q || same_illgrp))) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (ip_source_routed(ipha, ipst)) { - q = WR(q); - /* - * Clear the indication that this may have - * hardware checksum as we are not using it. 
- */ - DB_CKSUMFLAGS(mp) = 0; - /* Sent by forwarding path, and router is global zone */ - icmp_unreachable(q, mp, - ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst); - return; - } - goto drop_pkt; - } - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - - /* Packet is being forwarded. Turning off hwcksum flag. */ - DB_CKSUMFLAGS(mp) = 0; - if (ipst->ips_ip_g_send_redirects) { - /* - * Check whether the incoming interface and outgoing - * interface is part of the same group. If so, - * send redirects. - * - * Check the source address to see if it originated - * on the same logical subnet it is going back out on. - * If so, we should be able to send it a redirect. - * Avoid sending a redirect if the destination - * is directly connected (i.e., ipha_dst is the same - * as ire_gateway_addr or the ire_addr of the - * nexthop IRE_CACHE ), or if the packet was source - * routed out this interface. - */ - ipaddr_t src, nhop; - mblk_t *mp1; - ire_t *nhop_ire = NULL; - - /* - * Check whether ire_rfq and q are from the same ill or illgrp. - * If so, send redirects. - */ - if ((ire->ire_rfq == q || same_illgrp) && - !ip_source_routed(ipha, ipst)) { - - nhop = (ire->ire_gateway_addr != 0 ? - ire->ire_gateway_addr : ire->ire_addr); - - if (ipha->ipha_dst == nhop) { - /* - * We avoid sending a redirect if the - * destination is directly connected - * because it is possible that multiple - * IP subnets may have been configured on - * the link, and the source may not - * be on the same subnet as ip destination, - * even though they are on the same - * physical link. - */ - goto sendit; - } - - src = ipha->ipha_src; - - /* - * We look up the interface ire for the nexthop, - * to see if ipha_src is in the same subnet - * as the nexthop. - * - * Note that, if, in the future, IRE_CACHE entries - * are obsoleted, this lookup will not be needed, - * as the ire passed to this function will be the - * same as the nhop_ire computed below. 
- */ - nhop_ire = ire_ftable_lookup(nhop, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, - 0, NULL, MATCH_IRE_TYPE, ipst); - - if (nhop_ire != NULL) { - if ((src & nhop_ire->ire_mask) == - (nhop & nhop_ire->ire_mask)) { - /* - * The source is directly connected. - * Just copy the ip header (which is - * in the first mblk) - */ - mp1 = copyb(mp); - if (mp1 != NULL) { - icmp_send_redirect(WR(q), mp1, - nhop, ipst); - } - } - ire_refrele(nhop_ire); - } - } - } -sendit: - dev_q = ire->ire_stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); + if (ill->ill_type != IFT_ETHER) return; - } - - ip_rput_forward(ire, ipha, mp, ill); - return; - -drop_pkt: - ip2dbg(("ip_rput_process_forward: drop pkt\n")); - freemsg(mp); -} - -ire_t * -ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast) -{ - queue_t *q; - uint16_t hcksumflags; - ip_stack_t *ipst = ill->ill_ipst; - - q = *qp; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); - - /* - * Clear the indication that this may have hardware - * checksum as we are not using it for forwarding. - */ - hcksumflags = DB_CKSUMFLAGS(mp); - DB_CKSUMFLAGS(mp) = 0; - - /* - * Directed broadcast forwarding: if the packet came in over a - * different interface then it is routed out over we can forward it. - */ - if (ipha->ipha_protocol == IPPROTO_TCP) { - ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - /* - * For multicast we have set dst to be INADDR_BROADCAST - * for delivering to all STREAMS. 
- */ - if (!CLASSD(ipha->ipha_dst)) { - ire_t *new_ire; - ipif_t *ipif; - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { -discard: ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - new_ire = ire_ctable_lookup(dst, 0, 0, - ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst); - ipif_refrele(ipif); - if (new_ire != NULL) { - /* - * If the matching IRE_BROADCAST is part of an IPMP - * group, then drop the packet unless our ill has been - * nominated to receive for the group. - */ - if (IS_IPMP(new_ire->ire_ipif->ipif_ill) && - new_ire->ire_rfq != q) { - ire_refrele(new_ire); - goto discard; - } - - /* - * In the special case of multirouted broadcast - * packets, we unconditionally need to "gateway" - * them to the appropriate interface here. - * In the normal case, this cannot happen, because - * there is no broadcast IRE tagged with the - * RTF_MULTIRT flag. - */ - if (new_ire->ire_flags & RTF_MULTIRT) { - ire_refrele(new_ire); - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - *qp = q; - } - } else { - ire_refrele(ire); - ire = new_ire; - } - } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { - if (!ipst->ips_ip_g_forward_directed_bcast) { - /* - * Free the message if - * ip_g_forward_directed_bcast is turned - * off for non-local broadcast. - */ - ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - } else { - /* - * This CGTP packet successfully passed the - * CGTP filter, but the related CGTP - * broadcast IRE has not been found, - * meaning that the redundant ipif is - * probably down. However, if we discarded - * this packet, its duplicate would be - * filtered out by the CGTP filter so none - * of them would get through. So we keep - * going with this one. 
- */ - ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - *qp = q; - } - } - } - if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) { - /* - * Verify that there are not more then one - * IRE_BROADCAST with this broadcast address which - * has ire_stq set. - * TODO: simplify, loop over all IRE's - */ - ire_t *ire1; - int num_stq = 0; - mblk_t *mp1; - - /* Find the first one with ire_stq set */ - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - for (ire1 = ire; ire1 && - !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; - ire1 = ire1->ire_next) - ; - if (ire1) { - ire_refrele(ire); - ire = ire1; - IRE_REFHOLD(ire); - } +retry: + if ((uchar_t *)pether < mp->b_datap->db_base) + return; - /* Check if there are additional ones with stq set */ - for (ire1 = ire; ire1; ire1 = ire1->ire_next) { - if (ire->ire_addr != ire1->ire_addr) - break; - if (ire1->ire_stq) { - num_stq++; - break; - } + /* Is there a VLAN tag? */ + if (ill->ill_isv6) { + if (pether->ether_type != htons(ETHERTYPE_IPV6)) { + pether = (struct ether_header *)((char *)pether - 4); + goto retry; } - rw_exit(&ire->ire_bucket->irb_lock); - if (num_stq == 1 && ire->ire_stq != NULL) { - ip1dbg(("ip_rput_process_broadcast: directed " - "broadcast to 0x%x\n", - ntohl(ire->ire_addr))); - mp1 = copymsg(mp); - if (mp1) { - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(q, mp1, ipha, ire, ill); - break; - default: - ip_proto_input(q, mp1, ipha, ire, ill, - 0); - break; - } - } - /* - * Adjust ttl to 2 (1+1 - the forward engine - * will decrement it by one. 
- */ - if (ip_csum_hdr(ipha)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - ip2dbg(("ip_rput_broadcast:drop pkt\n")); - freemsg(mp); - ire_refrele(ire); - return (NULL); - } - ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - ip_rput_process_forward(q, mp, ire, ipha, - ill, ll_multicast, B_FALSE); - ire_refrele(ire); - return (NULL); + } else { + if (pether->ether_type != htons(ETHERTYPE_IP)) { + pether = (struct ether_header *)((char *)pether - 4); + goto retry; } - ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", - ntohl(ire->ire_addr))); } + mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost; + mhip->mhi_saddr = (uchar_t *)&pether->ether_shost; - /* Restore any hardware checksum flags */ - DB_CKSUMFLAGS(mp) = hcksumflags; - return (ire); -} - -/* ARGSUSED */ -static boolean_t -ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, - int *ll_multicast, ipaddr_t *dstp) -{ - ip_stack_t *ipst = ill->ill_ipst; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, - ntohs(ipha->ipha_length)); + if (!(mhip->mhi_daddr[0] & 0x01)) + return; - /* - * So that we don't end up with dups, only one ill in an IPMP group is - * nominated to receive multicast traffic. - */ - if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast) - goto drop_pkt; + /* Multicast or broadcast */ + mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST; - /* - * Forward packets only if we have joined the allmulti - * group on this interface. - */ - if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) { - int retval; + if ((bmp = ill->ill_bcast_mp) != NULL) { + dl_unitdata_req_t *dlur; + uint8_t *bphys_addr; + uint_t addrlen; - /* - * Clear the indication that this may have hardware - * checksum as we are not using it. 
- */ - DB_CKSUMFLAGS(mp) = 0; - retval = ip_mforward(ill, ipha, mp); - /* ip_mforward updates mib variables if needed */ - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - - switch (retval) { - case 0: - /* - * pkt is okay and arrived on phyint. - * - * If we are running as a multicast router - * we need to see all IGMP and/or PIM packets. - */ - if ((ipha->ipha_protocol == IPPROTO_IGMP) || - (ipha->ipha_protocol == IPPROTO_PIM)) { - goto done; - } - break; - case -1: - /* pkt is mal-formed, toss it */ - goto drop_pkt; - case 1: - /* pkt is okay and arrived on a tunnel */ - /* - * If we are running a multicast router - * we need to see all igmp packets. - */ - if (ipha->ipha_protocol == IPPROTO_IGMP) { - *dstp = INADDR_BROADCAST; - *ll_multicast = 1; - return (B_FALSE); - } - - goto drop_pkt; + dlur = (dl_unitdata_req_t *)bmp->b_rptr; + addrlen = dlur->dl_dest_addr_length; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset; + addrlen += ill->ill_sap_length; + } else { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + addrlen -= ill->ill_sap_length; } + if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0) + mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST; } - - if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { - /* - * This might just be caused by the fact that - * multiple IP Multicast addresses map to the same - * link layer multicast - no need to increment counter! - */ - freemsg(mp); - return (B_TRUE); - } -done: - ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); - /* - * This assumes the we deliver to all streams for multicast - * and broadcast packets. 
- */ - *dstp = INADDR_BROADCAST; - *ll_multicast = 1; - return (B_FALSE); -drop_pkt: - ip2dbg(("ip_rput: drop pkt\n")); - freemsg(mp); - return (B_TRUE); } /* - * This function is used to both return an indication of whether or not - * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND) - * and in doing so, determine whether or not it is broadcast vs multicast. - * For it to be a broadcast packet, we must have the appropriate mblk_t - * hanging off the ill_t. If this is either not present or doesn't match - * the destination mac address in the DL_UNITDATA_IND, the packet is deemed - * to be multicast. Thus NICs that have no broadcast address (or no - * capability for one, such as point to point links) cannot return as - * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as - * the return values simplifies the current use of the return value of this - * function, which is to pass through the multicast/broadcast characteristic - * to consumers of the netinfo/pfhooks API. While this is not cast in stone, - * changing the return value to some other symbol demands the appropriate - * "translation" when hpe_flags is set prior to calling hook_run() for - * packet events. + * Handle anything but M_DATA messages + * We see the DL_UNITDATA_IND which are part + * of the data path, and also the other messages from the driver. 
*/ -int -ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) -{ - dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; - mblk_t *bmp; - - if (ind->dl_group_address) { - if (ind->dl_dest_addr_offset > sizeof (*ind) && - ind->dl_dest_addr_offset + ind->dl_dest_addr_length < - MBLKL(mb) && - (bmp = ill->ill_bcast_mp) != NULL) { - dl_unitdata_req_t *dlur; - uint8_t *bphys_addr; - - dlur = (dl_unitdata_req_t *)bmp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + - ill->ill_sap_length; - - if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, - bphys_addr, ind->dl_dest_addr_length) == 0) { - return (HPE_BROADCAST); - } - return (HPE_MULTICAST); - } - return (HPE_MULTICAST); - } - return (0); -} - -static boolean_t -ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, - int *ll_multicast, mblk_t **mpp) +void +ip_rput_notdata(ill_t *ill, mblk_t *mp) { - mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; - boolean_t must_copy = B_FALSE; + mblk_t *first_mp; struct iocblk *iocp; - ipha_t *ipha; - ip_stack_t *ipst = ill->ill_ipst; - -#define rptr ((uchar_t *)ipha) - - first_mp = *first_mpp; - mp = *mpp; + struct mac_header_info_s mhi; - ASSERT(first_mp == mp); - - /* - * if db_ref > 1 then copymsg and free original. Packet may be - * changed and do not want other entity who has a reference to this - * message to trip over the changes. This is a blind change because - * trying to catch all places that might change packet is too - * difficult (since it may be a module above this one) - * - * This corresponds to the non-fast path case. We walk down the full - * chain in this case, and check the db_ref count of all the dblks, - * and do a copymsg if required. It is possible that the db_ref counts - * of the data blocks in the mblk chain can be different. 
- * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref - * count of 1, followed by a M_DATA block with a ref count of 2, if - * 'snoop' is running. - */ - for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { - if (mp1->b_datap->db_ref > 1) { - must_copy = B_TRUE; - break; - } - } - - if (must_copy) { - mp1 = copymsg(mp); - if (mp1 == NULL) { - for (mp1 = mp; mp1 != NULL; - mp1 = mp1->b_cont) { - mp1->b_next = NULL; - mp1->b_prev = NULL; - } - freemsg(mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsInDiscards); - } - return (B_TRUE); - } - for (from_mp = mp, to_mp = mp1; from_mp != NULL; - from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { - /* Copy b_prev - used by ip_mroute_decap */ - to_mp->b_prev = from_mp->b_prev; - from_mp->b_prev = NULL; - } - *first_mpp = first_mp = mp1; - freemsg(mp); - mp = mp1; - *mpp = mp1; - } - - ipha = (ipha_t *)mp->b_rptr; - - /* - * previous code has a case for M_DATA. - * We want to check how that happens. - */ - ASSERT(first_mp->b_datap->db_type != M_DATA); - switch (first_mp->b_datap->db_type) { + switch (DB_TYPE(mp)) { case M_PROTO: - case M_PCPROTO: - if (((dl_unitdata_ind_t *)rptr)->dl_primitive != + case M_PCPROTO: { + if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive != DL_UNITDATA_IND) { /* Go handle anything other than data elsewhere. */ - ip_rput_dlpi(q, mp); - return (B_TRUE); + ip_rput_dlpi(ill, mp); + return; } - *ll_multicast = ip_get_dlpi_mbcast(ill, mp); + first_mp = mp; + mp = first_mp->b_cont; + first_mp->b_cont = NULL; + + if (mp == NULL) { + freeb(first_mp); + return; + } + ip_dlur_to_mhi(ill, first_mp, &mhi); + if (ill->ill_isv6) + ip_input_v6(ill, NULL, mp, &mhi); + else + ip_input(ill, NULL, mp, &mhi); + /* Ditch the DLPI header. 
*/ - mp1 = mp->b_cont; - ASSERT(first_mp == mp); - *first_mpp = mp1; - freeb(mp); - *mpp = mp1; - return (B_FALSE); + freeb(first_mp); + return; + } case M_IOCACK: - ip1dbg(("got iocack ")); iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { case DL_IOC_HDR_INFO: - ill = (ill_t *)q->q_ptr; ill_fastpath_ack(ill, mp); - return (B_TRUE); + return; default: - putnext(q, mp); - return (B_TRUE); + putnext(ill->ill_rq, mp); + return; } /* FALLTHRU */ case M_ERROR: case M_HANGUP: - /* - * Since this is on the ill stream we unconditionally - * bump up the refcount - */ - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); - return (B_TRUE); - case M_CTL: - if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && - (((da_ipsec_t *)first_mp->b_rptr)->da_type == - IPHADA_M_CTL)) { - /* - * It's an IPsec accelerated packet. - * Make sure that the ill from which we received the - * packet has enabled IPsec hardware acceleration. - */ - if (!(ill->ill_capabilities & - (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { - /* IPsec kstats: bean counter */ - freemsg(mp); - return (B_TRUE); - } - - /* - * Make mp point to the mblk following the M_CTL, - * then process according to type of mp. - * After this processing, first_mp will point to - * the data-attributes and mp to the pkt following - * the M_CTL. - */ - mp = first_mp->b_cont; - if (mp == NULL) { - freemsg(first_mp); - return (B_TRUE); - } - /* - * A Hardware Accelerated packet can only be M_DATA - * ESP or AH packet. 
- */ - if (mp->b_datap->db_type != M_DATA) { - /* non-M_DATA IPsec accelerated packet */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("non-M_DATA IPsec accelerated pkt\n")); - freemsg(first_mp); - return (B_TRUE); - } - ipha = (ipha_t *)mp->b_rptr; - if (ipha->ipha_protocol != IPPROTO_AH && - ipha->ipha_protocol != IPPROTO_ESP) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("non-M_DATA IPsec accelerated pkt\n")); - freemsg(first_mp); - return (B_TRUE); - } - *mpp = mp; - return (B_FALSE); + mutex_enter(&ill->ill_lock); + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + freemsg(mp); + return; } - putnext(q, mp); - return (B_TRUE); + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP, + B_FALSE); + return; + case M_CTL: + putnext(ill->ill_rq, mp); + return; case M_IOCNAK: ip1dbg(("got iocnak ")); iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { case DL_IOC_HDR_INFO: - ip_rput_other(NULL, q, mp, NULL); - return (B_TRUE); + ip_rput_other(NULL, ill->ill_rq, mp, NULL); + return; default: break; } /* FALLTHRU */ default: - putnext(q, mp); - return (B_TRUE); + putnext(ill->ill_rq, mp); + return; } } @@ -14692,8 +8140,6 @@ ip_rput(queue_t *q, mblk_t *mp) ill_t *ill; union DL_primitives *dl; - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); - ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { @@ -14707,70 +8153,42 @@ ip_rput(queue_t *q, mblk_t *mp) if (DB_TYPE(mp) != M_PCPROTO || dl->dl_primitive == DL_UNITDATA_IND) { inet_freemsg(mp); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_rput_end: q %p (%S)", q, "uninit"); return; } } + if (DB_TYPE(mp) == M_DATA) { + struct mac_header_info_s mhi; - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_rput_end: q %p (%S)", q, "end"); - - ip_input(ill, NULL, mp, NULL); + ip_mdata_to_mhi(ill, mp, &mhi); + ip_input(ill, NULL, mp, &mhi); + } else { + ip_rput_notdata(ill, mp); + } } -static mblk_t * -ip_fix_dbref(ill_t 
*ill, mblk_t *mp) +/* + * Move the information to a copy. + */ +mblk_t * +ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp1; - boolean_t adjusted = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + mblk_t *mp1; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; IP_STAT(ipst, ip_db_ref); - /* - * The IP_RECVSLLA option depends on having the - * link layer header. First check that: - * a> the underlying device is of type ether, - * since this option is currently supported only - * over ethernet. - * b> there is enough room to copy over the link - * layer header. - * - * Once the checks are done, adjust rptr so that - * the link layer header will be copied via - * copymsg. Note that, IFT_ETHER may be returned - * by some non-ethernet drivers but in this case - * the second check will fail. - */ - if (ill->ill_type == IFT_ETHER && - (mp->b_rptr - mp->b_datap->db_base) >= - sizeof (struct ether_header)) { - mp->b_rptr -= sizeof (struct ether_header); - adjusted = B_TRUE; - } - mp1 = copymsg(mp); + /* Make sure we have ira_l2src before we loose the original mblk */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp1 = copymsg(mp); if (mp1 == NULL) { - mp->b_next = NULL; - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); return (NULL); } - - if (adjusted) { - /* - * Copy is done. 
Restore the pointer in - * the _new_ mblk - */ - mp1->b_rptr += sizeof (struct ether_header); - } - - /* Copy b_prev - used by ip_mroute_decap */ - mp1->b_prev = mp->b_prev; - mp->b_prev = NULL; - /* preserve the hardware checksum flags and data, if present */ if (DB_CKSUMFLAGS(mp) != 0) { DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); @@ -14779,888 +8197,10 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp) DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); DB_CKSUM16(mp1) = DB_CKSUM16(mp); } - freemsg(mp); return (mp1); } -#define ADD_TO_CHAIN(head, tail, cnt, mp) { \ - if (tail != NULL) \ - tail->b_next = mp; \ - else \ - head = mp; \ - tail = mp; \ - cnt++; \ -} - -/* - * Direct read side procedure capable of dealing with chains. GLDv3 based - * drivers call this function directly with mblk chains while STREAMS - * read side procedure ip_rput() calls this for single packet with ip_ring - * set to NULL to process one packet at a time. - * - * The ill will always be valid if this function is called directly from - * the driver. - * - * If ip_input() is called from GLDv3: - * - * - This must be a non-VLAN IP stream. - * - 'mp' is either an untagged or a special priority-tagged packet. - * - Any VLAN tag that was in the MAC header has been stripped. - * - * If the IP header in packet is not 32-bit aligned, every message in the - * chain will be aligned before further operations. This is required on SPARC - * platform. 
- */ -/* ARGSUSED */ -void -ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, - struct mac_header_info_s *mhip) -{ - ipaddr_t dst = NULL; - ipaddr_t prev_dst; - ire_t *ire = NULL; - ipha_t *ipha; - uint_t pkt_len; - ssize_t len; - uint_t opt_len; - int ll_multicast; - int cgtp_flt_pkt; - queue_t *q = ill->ill_rq; - squeue_t *curr_sqp = NULL; - mblk_t *head = NULL; - mblk_t *tail = NULL; - mblk_t *first_mp; - int cnt = 0; - ip_stack_t *ipst = ill->ill_ipst; - mblk_t *mp; - mblk_t *dmp; - uint8_t tag; - ilb_stack_t *ilbs; - ipaddr_t lb_dst; - - ASSERT(mp_chain != NULL); - ASSERT(ill != NULL); - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); - - tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT; - -#define rptr ((uchar_t *)ipha) - - ilbs = ipst->ips_netstack->netstack_ilb; - while (mp_chain != NULL) { - mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - ll_multicast = 0; - - /* - * We do ire caching from one iteration to - * another. In the event the packet chain contains - * all packets from the same dst, this caching saves - * an ire_cache_lookup for each of the succeeding - * packets in a packet chain. - */ - prev_dst = dst; - - /* - * if db_ref > 1 then copymsg and free original. Packet - * may be changed and we do not want the other entity - * who has a reference to this message to trip over the - * changes. This is a blind change because trying to - * catch all places that might change the packet is too - * difficult. - * - * This corresponds to the fast path case, where we have - * a chain of M_DATA mblks. We check the db_ref count - * of only the 1st data block in the mblk chain. There - * doesn't seem to be a reason why a device driver would - * send up data with varying db_ref counts in the mblk - * chain. In any case the Fast path is a private - * interface, and our drivers don't do such a thing. 
- * Given the above assumption, there is no need to walk - * down the entire mblk chain (which could have a - * potential performance problem) - * - * The "(DB_REF(mp) > 1)" check was moved from ip_rput() - * to here because of exclusive ip stacks and vnics. - * Packets transmitted from exclusive stack over vnic - * can have db_ref > 1 and when it gets looped back to - * another vnic in a different zone, you have ip_input() - * getting dblks with db_ref > 1. So if someone - * complains of TCP performance under this scenario, - * take a serious look here on the impact of copymsg(). - */ - - if (DB_REF(mp) > 1) { - if ((mp = ip_fix_dbref(ill, mp)) == NULL) - continue; - } - - /* - * Check and align the IP header. - */ - first_mp = mp; - if (DB_TYPE(mp) == M_DATA) { - dmp = mp; - } else if (DB_TYPE(mp) == M_PROTO && - *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { - dmp = mp->b_cont; - } else { - dmp = NULL; - } - if (dmp != NULL) { - /* - * IP header ptr not aligned? - * OR IP header not complete in first mblk - */ - if (!OK_32PTR(dmp->b_rptr) || - MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { - if (!ip_check_and_align_header(q, dmp, ipst)) - continue; - } - } - - /* - * ip_input fast path - */ - - /* mblk type is not M_DATA */ - if (DB_TYPE(mp) != M_DATA) { - if (ip_rput_process_notdata(q, &first_mp, ill, - &ll_multicast, &mp)) - continue; - - /* - * The only way we can get here is if we had a - * packet that was either a DL_UNITDATA_IND or - * an M_CTL for an IPsec accelerated packet. - * - * In either case, the first_mp will point to - * the leading M_PROTO or M_CTL. - */ - ASSERT(first_mp != NULL); - } else if (mhip != NULL) { - /* - * ll_multicast is set here so that it is ready - * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast - * manipulates ll_multicast in the same fashion when - * called from ip_rput_process_notdata. 
- */ - switch (mhip->mhi_dsttype) { - case MAC_ADDRTYPE_MULTICAST : - ll_multicast = HPE_MULTICAST; - break; - case MAC_ADDRTYPE_BROADCAST : - ll_multicast = HPE_BROADCAST; - break; - default : - break; - } - } - - /* Only M_DATA can come here and it is always aligned */ - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); - - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - pkt_len = ntohs(ipha->ipha_length); - - /* - * We must count all incoming packets, even if they end - * up being dropped later on. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); - - /* multiple mblk or too short */ - len -= pkt_len; - if (len != 0) { - /* - * Make sure we have data length consistent - * with the IP header. - */ - if (mp->b_cont == NULL) { - if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInHdrErrors); - ip2dbg(("ip_input: drop pkt\n")); - freemsg(mp); - continue; - } - mp->b_wptr = rptr + pkt_len; - } else if ((len += msgdsize(mp->b_cont)) != 0) { - if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInHdrErrors); - ip2dbg(("ip_input: drop pkt\n")); - freemsg(mp); - continue; - } - (void) adjmsg(mp, -len); - /* - * adjmsg may have freed an mblk from the chain, - * hence invalidate any hw checksum here. This - * will force IP to calculate the checksum in - * sw, but only for this packet. - */ - DB_CKSUMFLAGS(mp) = 0; - IP_STAT(ipst, ip_multimblk3); - } - } - - /* Obtain the dst of the current packet */ - dst = ipha->ipha_dst; - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, - ipha, ip6_t *, NULL, int, 0); - - /* - * The following test for loopback is faster than - * IP_LOOPBACK_ADDR(), because it avoids any bitwise - * operations. 
- * Note that these addresses are always in network byte order - */ - if (((*(uchar_t *)&ipha->ipha_dst) == 127) || - ((*(uchar_t *)&ipha->ipha_src) == 127)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); - freemsg(mp); - continue; - } - - /* - * The event for packets being received from a 'physical' - * interface is placed after validation of the source and/or - * destination address as being local so that packets can be - * redirected to loopback addresses using ipnat. - */ - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) { - continue; - } - dst = ipha->ipha_dst; - /* - * Attach any necessary label information to - * this packet - */ - if (is_system_labeled() && - !tsol_get_pkt_label(mp, IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - continue; - } - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t dzone; - - /* - * On the inbound path the src zone will be unknown as - * this packet has come from the wire. - */ - dzone = ip_get_zoneid_v4(dst, mp, ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, - ill, ipst); - } - - /* - * Here we check to see if we machine is setup as - * L3 loadbalancer and if the incoming packet is for a VIP - * - * Check the following: - * - there is at least a rule - * - protocol of the packet is supported - */ - if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { - int lb_ret; - - /* For convenience, we pull up the mblk. 
*/ - if (mp->b_cont != NULL) { - if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * We just drop all fragments going to any VIP, at - * least for now.... - */ - if (ntohs(ipha->ipha_fragment_offset_and_flags) & - (IPH_MF | IPH_OFFSET)) { - if (!ilb_rule_match_vip_v4(ilbs, - ipha->ipha_dst, NULL)) { - goto after_ilb; - } - - ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); - ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } - lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, - ipha->ipha_protocol, (uint8_t *)ipha + - IPH_HDR_LENGTH(ipha), &lb_dst); - - if (lb_ret == ILB_DROPPED) { - /* Is this the right counter to increase? */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } else if (lb_ret == ILB_BALANCED) { - /* Set the dst to that of the chosen server */ - dst = lb_dst; - DB_CKSUMFLAGS(mp) = 0; - } - } - -after_ilb: - /* - * Reuse the cached ire only if the ipha_dst of the previous - * packet is the same as the current packet AND it is not - * INADDR_ANY. - */ - if (!(dst == prev_dst && dst != INADDR_ANY) && - (ire != NULL)) { - ire_refrele(ire); - ire = NULL; - } - - opt_len = ipha->ipha_version_and_hdr_length - - IP_SIMPLE_HDR_VERSION; - - /* - * Check to see if we can take the fastpath. 
- * That is possible if the following conditions are met - * o Tsol disabled - * o CGTP disabled - * o ipp_action_count is 0 - * o no options in the packet - * o not a RSVP packet - * o not a multicast packet - * o ill not in IP_DHCPINIT_IF mode - */ - if (!is_system_labeled() && - !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 && - opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && - !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { - if (ire == NULL) - ire = ire_cache_lookup_simple(dst, ipst); - /* - * Unless forwarding is enabled, dont call - * ip_fast_forward(). Incoming packet is for forwarding - */ - if ((ill->ill_flags & ILLF_ROUTER) && - (ire == NULL || (ire->ire_type & IRE_CACHE))) { - ire = ip_fast_forward(ire, dst, ill, mp); - continue; - } - /* incoming packet is for local consumption */ - if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) - goto local; - } - - /* - * Disable ire caching for anything more complex - * than the simple fast path case we checked for above. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - - /* - * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP - * server to unicast DHCP packets to a DHCP client using the - * IP address it is offering to the client. This can be - * disabled through the "broadcast bit", but not all DHCP - * servers honor that bit. Therefore, to interoperate with as - * many DHCP servers as possible, the DHCP client allows the - * server to unicast, but we treat those packets as broadcast - * here. Note that we don't rewrite the packet itself since - * (a) that would mess up the checksums and (b) the DHCP - * client conn is bound to INADDR_ANY so ip_fanout_udp() will - * hand it the packet regardless. - */ - if (ill->ill_dhcpinit != 0 && - IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP && - pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) { - udpha_t *udpha; - - /* - * Reload ipha since pullupmsg() can change b_rptr. 
- */ - ipha = (ipha_t *)mp->b_rptr; - udpha = (udpha_t *)&ipha[1]; - - if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { - DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, - mblk_t *, mp); - dst = INADDR_BROADCAST; - } - } - - /* Full-blown slow path */ - if (opt_len != 0) { - if (len != 0) - IP_STAT(ipst, ip_multimblk4); - else - IP_STAT(ipst, ip_ipoptions); - if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha, - &dst, ipst)) - continue; - } - - /* - * Invoke the CGTP (multirouting) filtering module to process - * the incoming packet. Packets identified as duplicates - * must be discarded. Filtering is active only if the - * the ip_cgtp_filter ndd variable is non-zero. - */ - cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; - if (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL) { - netstackid_t stackid; - - stackid = ipst->ips_netstack->netstack_stackid; - cgtp_flt_pkt = - ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, - ill->ill_phyint->phyint_ifindex, mp); - if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { - freemsg(first_mp); - continue; - } - } - - /* - * If rsvpd is running, let RSVP daemon handle its processing - * and forwarding of RSVP multicast/unicast packets. - * If rsvpd is not running but mrouted is running, RSVP - * multicast packets are forwarded as multicast traffic - * and RSVP unicast packets are forwarded by unicast router. - * If neither rsvpd nor mrouted is running, RSVP multicast - * packets are not forwarded, but the unicast packets are - * forwarded like unicast traffic. - */ - if (ipha->ipha_protocol == IPPROTO_RSVP && - ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != - NULL) { - /* RSVP packet and rsvpd running. Treat as ours */ - ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); - /* - * This assumes that we deliver to all streams for - * multicast and broadcast packets. - * We have to force ll_multicast to 1 to handle the - * M_DATA messages passed in from ip_mroute_decap. 
- */ - dst = INADDR_BROADCAST; - ll_multicast = 1; - } else if (CLASSD(dst)) { - /* packet is multicast */ - mp->b_next = NULL; - if (ip_rput_process_multicast(q, mp, ill, ipha, - &ll_multicast, &dst)) - continue; - } - - if (ire == NULL) { - ire = ire_cache_lookup(dst, ALL_ZONES, - msg_getlabel(mp), ipst); - } - - if (ire != NULL && ire->ire_stq != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible from the - * global zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup(dst, GLOBAL_ZONEID, - msg_getlabel(mp), ipst); - } - - if (ire == NULL) { - /* - * No IRE for this destination, so it can't be for us. - * Unless we are forwarding, drop the packet. - * We have to let source routed packets through - * since we don't yet know if they are 'ping -l' - * packets i.e. if they will go out over the - * same interface as they came in on. - */ - ire = ip_rput_noire(q, mp, ll_multicast, dst); - if (ire == NULL) - continue; - } - - /* - * Broadcast IRE may indicate either broadcast or - * multicast packet - */ - if (ire->ire_type == IRE_BROADCAST) { - /* - * Skip broadcast checks if packet is UDP multicast; - * we'd rather not enter ip_rput_process_broadcast() - * unless the packet is broadcast for real, since - * that routine is a no-op for multicast. - */ - if (ipha->ipha_protocol != IPPROTO_UDP || - !CLASSD(ipha->ipha_dst)) { - ire = ip_rput_process_broadcast(&q, mp, - ire, ipha, ill, dst, cgtp_flt_pkt, - ll_multicast); - if (ire == NULL) - continue; - } - } else if (ire->ire_stq != NULL) { - /* fowarding? */ - ip_rput_process_forward(q, mp, ire, ipha, ill, - ll_multicast, B_FALSE); - /* ip_rput_process_forward consumed the packet */ - continue; - } - -local: - /* - * If the queue in the ire is different to the ingress queue - * then we need to check to see if we can accept the packet. 
- * Note that for multicast packets and broadcast packets sent - * to a broadcast address which is shared between multiple - * interfaces we should not do this since we just got a random - * broadcast ire. - */ - if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { - ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); - if (ire == NULL) { - /* Drop packet */ - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(mp); - continue; - } - if (ire->ire_rfq != NULL) - q = ire->ire_rfq; - } - - switch (ipha->ipha_protocol) { - case IPPROTO_TCP: - ASSERT(first_mp == mp); - if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, - mp, 0, q, ip_ring)) != NULL) { - if (curr_sqp == NULL) { - curr_sqp = GET_SQUEUE(mp); - ASSERT(cnt == 0); - cnt++; - head = tail = mp; - } else if (curr_sqp == GET_SQUEUE(mp)) { - ASSERT(tail != NULL); - cnt++; - tail->b_next = mp; - tail = mp; - } else { - /* - * A different squeue. Send the - * chain for the previous squeue on - * its way. This shouldn't happen - * often unless interrupt binding - * changes. - */ - IP_STAT(ipst, ip_input_multi_squeue); - SQUEUE_ENTER(curr_sqp, head, - tail, cnt, SQ_PROCESS, tag); - curr_sqp = GET_SQUEUE(mp); - head = mp; - tail = mp; - cnt = 1; - } - } - continue; - case IPPROTO_UDP: - ASSERT(first_mp == mp); - ip_udp_input(q, mp, ipha, ire, ill); - continue; - case IPPROTO_SCTP: - ASSERT(first_mp == mp); - ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, - q, dst); - /* ire has been released by ip_sctp_input */ - ire = NULL; - continue; - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - ASSERT(first_mp == mp); - if (ip_iptun_input(NULL, mp, ipha, ill, ire, ipst)) - break; - /* - * If there was no IP tunnel data-link bound to - * receive this packet, then we fall through to - * allow potential raw sockets bound to either of - * these protocols to pick it up. 
- */ - default: - ip_proto_input(q, first_mp, ipha, ire, ill, 0); - continue; - } - } - - if (ire != NULL) - ire_refrele(ire); - - if (head != NULL) - SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag); - - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_input_end: q %p (%S)", q, "end"); -#undef rptr -} - -/* - * ip_accept_tcp() - This function is called by the squeue when it retrieves - * a chain of packets in the poll mode. The packets have gone through the - * data link processing but not IP processing. For performance and latency - * reasons, the squeue wants to process the chain in line instead of feeding - * it back via ip_input path. - * - * So this is a light weight function which checks to see if the packets - * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring - * but we still do the paranoid check) meant for local machine and we don't - * have labels etc enabled. Packets that meet the criterion are returned to - * the squeue and processed inline while the rest go via ip_input path. - */ -/*ARGSUSED*/ -mblk_t * -ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, - mblk_t *mp_chain, mblk_t **last, uint_t *cnt) -{ - mblk_t *mp; - ipaddr_t dst = NULL; - ipaddr_t prev_dst; - ire_t *ire = NULL; - ipha_t *ipha; - uint_t pkt_len; - ssize_t len; - uint_t opt_len; - queue_t *q = ill->ill_rq; - squeue_t *curr_sqp; - mblk_t *ahead = NULL; /* Accepted head */ - mblk_t *atail = NULL; /* Accepted tail */ - uint_t acnt = 0; /* Accepted count */ - mblk_t *utail = NULL; /* Unaccepted head */ - mblk_t *uhead = NULL; /* Unaccepted tail */ - uint_t ucnt = 0; /* Unaccepted cnt */ - ip_stack_t *ipst = ill->ill_ipst; - ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; - - *cnt = 0; - - ASSERT(ill != NULL); - ASSERT(ip_ring != NULL); - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q); - - /* If ILB is enabled, don't do fast processing. 
*/ - if (ilb_has_rules(ilbs)) { - uhead = mp_chain; - goto all_reject; - } - -#define rptr ((uchar_t *)ipha) - - while (mp_chain != NULL) { - mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - - /* - * We do ire caching from one iteration to - * another. In the event the packet chain contains - * all packets from the same dst, this caching saves - * an ire_cache_lookup for each of the succeeding - * packets in a packet chain. - */ - prev_dst = dst; - - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - - ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha)); - - /* - * If it is a non TCP packet, or doesn't have H/W cksum, - * or doesn't have min len, reject. - */ - if ((ipha->ipha_protocol != IPPROTO_TCP) || (len < - (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - - pkt_len = ntohs(ipha->ipha_length); - if (len != pkt_len) { - if (len > pkt_len) { - mp->b_wptr = rptr + pkt_len; - } else { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - } - - opt_len = ipha->ipha_version_and_hdr_length - - IP_SIMPLE_HDR_VERSION; - dst = ipha->ipha_dst; - - /* IP version bad or there are IP options */ - if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill, - mp, &ipha, &dst, ipst))) - continue; - - if (is_system_labeled() || (ill->ill_dhcpinit != 0) || - (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL)) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - - /* - * Reuse the cached ire only if the ipha_dst of the previous - * packet is the same as the current packet AND it is not - * INADDR_ANY. - */ - if (!(dst == prev_dst && dst != INADDR_ANY) && - (ire != NULL)) { - ire_refrele(ire); - ire = NULL; - } - - if (ire == NULL) - ire = ire_cache_lookup_simple(dst, ipst); - - /* - * Unless forwarding is enabled, dont call - * ip_fast_forward(). 
Incoming packet is for forwarding - */ - if ((ill->ill_flags & ILLF_ROUTER) && - (ire == NULL || (ire->ire_type & IRE_CACHE))) { - - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, - pkt_len); - - if (mp != NULL) - ire = ip_fast_forward(ire, dst, ill, mp); - continue; - } - - /* incoming packet is for local consumption */ - if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) - goto local_accept; - - /* - * Disable ire caching for anything more complex - * than the simple fast path case we checked for above. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - - ire = ire_cache_lookup(dst, ALL_ZONES, msg_getlabel(mp), - ipst); - if (ire == NULL || ire->ire_type == IRE_BROADCAST || - ire->ire_stq != NULL) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - continue; - } - -local_accept: - - if (ire->ire_rfq != q) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - continue; - } - - /* - * The event for packets being received from a 'physical' - * interface is placed after validation of the source and/or - * destination address as being local so that packets can be - * redirected to loopback addresses using ipnat. 
- */ - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); - - if (mp != NULL && - (mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp, - 0, q, ip_ring)) != NULL) { - if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) { - ADD_TO_CHAIN(ahead, atail, acnt, mp); - } else { - SQUEUE_ENTER(curr_sqp, mp, mp, 1, - SQ_FILL, SQTAG_IP_INPUT); - } - } - } - - if (ire != NULL) - ire_refrele(ire); - -all_reject: - if (uhead != NULL) - ip_input(ill, ip_ring, uhead, NULL); - - if (ahead != NULL) { - *last = atail; - *cnt = acnt; - return (ahead); - } - - return (NULL); -#undef rptr -} - static void ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, t_uscalar_t err) @@ -15684,14 +8224,16 @@ ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, * ill_refhold before that, since qwriter_ip does an ill_refrele. */ void -ip_rput_dlpi(queue_t *q, mblk_t *mp) +ip_rput_dlpi(ill_t *ill, mblk_t *mp) { dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; - ill_t *ill = q->q_ptr; + queue_t *q = ill->ill_rq; t_uscalar_t prim = dloa->dl_primitive; t_uscalar_t reqprim = DL_PRIM_INVAL; + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi", + char *, dl_primstr(prim), ill_t *, ill); ip1dbg(("ip_rput_dlpi")); /* @@ -15721,9 +8263,6 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp) case DL_NOTIFY_ACK: reqprim = DL_NOTIFY_REQ; break; - case DL_CONTROL_ACK: - reqprim = DL_CONTROL_REQ; - break; case DL_CAPABILITY_ACK: reqprim = DL_CAPABILITY_REQ; break; @@ -15781,7 +8320,7 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp) /* * Handling of DLPI messages that require exclusive access to the ipsq. 
* - * Need to do ill_pending_mp_release on ioctl completion, which could + * Need to do ipsq_pending_mp_get on ioctl completion, which could * happen here. (along with mi_copy_done) */ /* ARGSUSED */ @@ -15791,7 +8330,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; int err = 0; - ill_t *ill; + ill_t *ill = (ill_t *)q->q_ptr; ipif_t *ipif = NULL; mblk_t *mp1 = NULL; conn_t *connp = NULL; @@ -15800,15 +8339,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; - ip_stack_t *ipst; + + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer", + char *, dl_primstr(dloa->dl_primitive), ill_t *, ill); ip1dbg(("ip_rput_dlpi_writer ..")); - ill = (ill_t *)q->q_ptr; ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); ASSERT(IAM_WRITER_ILL(ill)); - ipst = ill->ill_ipst; - ipif = ipsq->ipsq_xop->ipx_pending_ipif; /* * The current ioctl could have been aborted by the user and a new @@ -15823,6 +8361,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n", dl_primstr(dlea->dl_error_primitive))); + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error", + char *, dl_primstr(dlea->dl_error_primitive), + ill_t *, ill); + switch (dlea->dl_error_primitive) { case DL_DISABMULTI_REQ: ill_dlpi_done(ill, dlea->dl_error_primitive); @@ -15916,7 +8458,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) ill->ill_dlpi_multicast_state = IDS_FAILED; if (ill->ill_dlpi_multicast_state == IDS_FAILED) { - ipif_t *ipif; printf("ip: joining multicasts failed (%d)" " on %s - will use link layer " @@ -15924,32 +8465,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 
dlea->dl_errno, ill->ill_name); /* - * Set up the multicast mapping alone. + * Set up for multi_bcast; We are the * writer, so ok to access ill->ill_ipif * without any lock. */ - ipif = ill->ill_ipif; mutex_enter(&ill->ill_phyint->phyint_lock); ill->ill_phyint->phyint_flags |= PHYI_MULTI_BCAST; mutex_exit(&ill->ill_phyint->phyint_lock); - if (!ill->ill_isv6) { - (void) ipif_arp_setup_multicast(ipif, - NULL); - } else { - (void) ipif_ndp_setup_multicast(ipif, - NULL); - } } freemsg(mp); /* Don't want to pass this up */ return; - case DL_CONTROL_REQ: - ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " - "DL_CONTROL_REQ\n")); - ill_dlpi_done(ill, dlea->dl_error_primitive); - freemsg(mp); - return; case DL_CAPABILITY_REQ: ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " "DL_CAPABILITY REQ\n")); @@ -16003,10 +8530,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mp = NULL; break; - case DL_CONTROL_ACK: - /* We treat all of these as "fire and forget" */ - ill_dlpi_done(ill, DL_CONTROL_REQ); - break; case DL_INFO_ACK: /* Call a routine to handle this one. */ ill_dlpi_done(ill, DL_INFO_REQ); @@ -16019,29 +8542,33 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * sent by ill_dl_phys, in which case just return */ ill_dlpi_done(ill, DL_BIND_REQ); - if (ill->ill_ifname_pending) + if (ill->ill_ifname_pending) { + DTRACE_PROBE2(ip__rput__dlpi__ifname__pending, + ill_t *, ill, mblk_t *, mp); break; - + } if (!ioctl_aborted) mp1 = ipsq_pending_mp_get(ipsq, &connp); - if (mp1 == NULL) + if (mp1 == NULL) { + DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill); break; + } /* * mp1 was added by ill_dl_up(). if that is a result of * a DL_NOTE_REPLUMB notification, connp could be NULL. */ if (connp != NULL) q = CONNP_TO_WQ(connp); - /* * We are exclusive. So nothing can change even after - * we get the pending mp. If need be we can put it back - * and restart, as in calling ipif_arp_up() below. 
+ * we get the pending mp. */ ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); + DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); mutex_enter(&ill->ill_lock); ill->ill_dl_up = 1; + ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); mutex_exit(&ill->ill_lock); @@ -16052,34 +8579,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * ill_dl_up(), which stopped ipif_up()'s processing. */ if (ill->ill_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) { - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, - mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, - Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - ASSERT(err != 0); - mp1 = ipsq_pending_mp_get(ipsq, &connp); - ASSERT(mp1 != NULL); - } else { - /* conn has started closing */ - err = EINTR; - } - } else { /* Non XRESOLV interface */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } + /* + * v6 interfaces. + * Unlike ARP which has to do another bind + * and attach, once we get here we are + * done with NDP + */ + (void) ipif_resolver_up(ipif, Res_act_initial); + if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) + err = ipif_up_done_v6(ipif); } else if (ill->ill_net_type == IRE_IF_RESOLVER) { /* * ARP and other v4 external resolvers. 
@@ -16099,7 +8607,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) freemsg(mp); return; } - ASSERT(err != 0); + ASSERT(arp_no_defense || err != 0); mp1 = ipsq_pending_mp_get(ipsq, &connp); } else { /* The conn has started closing */ @@ -16144,10 +8652,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTIFY_IND: { dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; - ire_t *ire; uint_t orig_mtu; - boolean_t need_ire_walk_v4 = B_FALSE; - boolean_t need_ire_walk_v6 = B_FALSE; switch (notify->dl_notification) { case DL_NOTE_PHYS_ADDR: @@ -16164,95 +8669,52 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) return; case DL_NOTE_FASTPATH_FLUSH: - ill_fastpath_flush(ill); + nce_flush(ill, B_FALSE); break; case DL_NOTE_SDU_SIZE: /* - * Change the MTU size of the interface, of all - * attached ipif's, and of all relevant ire's. The - * new value's a uint32_t at notify->dl_data. - * Mtu change Vs. new ire creation - protocol below. - * - * a Mark the ipif as IPIF_CHANGING. - * b Set the new mtu in the ipif. - * c Change the ire_max_frag on all affected ires - * d Unmark the IPIF_CHANGING + * The dce and fragmentation code can cope with + * this changing while packets are being sent. + * When packets are sent ip_output will discover + * a change. * - * To see how the protocol works, assume an interface - * route is also being added simultaneously by - * ip_rt_add and let 'ipif' be the ipif referenced by - * the ire. If the ire is created before step a, - * it will be cleaned up by step c. If the ire is - * created after step d, it will see the new value of - * ipif_mtu. Any attempt to create the ire between - * steps a to d will fail because of the IPIF_CHANGING - * flag. Note that ire_create() is passed a pointer to - * the ipif_mtu, and not the value. 
During ire_add - * under the bucket lock, the ire_max_frag of the - * new ire being created is set from the ipif/ire from - * which it is being derived. + * Change the MTU size of the interface. */ mutex_enter(&ill->ill_lock); + ill->ill_current_frag = (uint_t)notify->dl_data; + if (ill->ill_current_frag > ill->ill_max_frag) + ill->ill_max_frag = ill->ill_current_frag; - orig_mtu = ill->ill_max_mtu; - ill->ill_max_frag = (uint_t)notify->dl_data; - ill->ill_max_mtu = (uint_t)notify->dl_data; - - /* - * If ill_user_mtu was set (via SIOCSLIFLNKINFO), - * clamp ill_max_mtu at it. - */ - if (ill->ill_user_mtu != 0 && - ill->ill_user_mtu < ill->ill_max_mtu) - ill->ill_max_mtu = ill->ill_user_mtu; + orig_mtu = ill->ill_mtu; + if (!(ill->ill_flags & ILLF_FIXEDMTU)) { + ill->ill_mtu = ill->ill_current_frag; - /* - * If the MTU is unchanged, we're done. - */ - if (orig_mtu == ill->ill_max_mtu) { - mutex_exit(&ill->ill_lock); - break; - } - - if (ill->ill_isv6) { - if (ill->ill_max_mtu < IPV6_MIN_MTU) - ill->ill_max_mtu = IPV6_MIN_MTU; - } else { - if (ill->ill_max_mtu < IP_MIN_MTU) - ill->ill_max_mtu = IP_MIN_MTU; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { /* - * Don't override the mtu if the user - * has explicitly set it. + * If ill_user_mtu was set (via + * SIOCSLIFLNKINFO), clamp ill_mtu at it. 
*/ - if (ipif->ipif_flags & IPIF_FIXEDMTU) - continue; - ipif->ipif_mtu = (uint_t)notify->dl_data; - if (ipif->ipif_isv6) - ire = ipif_to_ire_v6(ipif); - else - ire = ipif_to_ire(ipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - if (ipif->ipif_flags & IPIF_UP) { - if (ill->ill_isv6) - need_ire_walk_v6 = B_TRUE; - else - need_ire_walk_v4 = B_TRUE; + if (ill->ill_user_mtu != 0 && + ill->ill_user_mtu < ill->ill_mtu) + ill->ill_mtu = ill->ill_user_mtu; + + if (ill->ill_isv6) { + if (ill->ill_mtu < IPV6_MIN_MTU) + ill->ill_mtu = IPV6_MIN_MTU; + } else { + if (ill->ill_mtu < IP_MIN_MTU) + ill->ill_mtu = IP_MIN_MTU; } } mutex_exit(&ill->ill_lock); - if (need_ire_walk_v4) - ire_walk_v4(ill_mtu_change, (char *)ill, - ALL_ZONES, ipst); - if (need_ire_walk_v6) - ire_walk_v6(ill_mtu_change, (char *)ill, - ALL_ZONES, ipst); + /* + * Make sure all dce_generation checks find out + * that ill_mtu has changed. + */ + if (orig_mtu != ill->ill_mtu) { + dce_increment_all_generations(ill->ill_isv6, + ill->ill_ipst); + } /* * Refresh IPMP meta-interface MTU if necessary. 
@@ -16303,8 +8765,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTE_PROMISC_ON_PHYS: { phyint_t *phyint = ill->ill_phyint; - IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " - "got a DL_NOTE_PROMISC_ON_PHYS\n")); mutex_enter(&phyint->phyint_lock); phyint->phyint_flags |= PHYI_PROMISC; mutex_exit(&phyint->phyint_lock); @@ -16313,8 +8773,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTE_PROMISC_OFF_PHYS: { phyint_t *phyint = ill->ill_phyint; - IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " - "got a DL_NOTE_PROMISC_OFF_PHYS\n")); mutex_enter(&phyint->phyint_lock); phyint->phyint_flags &= ~PHYI_PROMISC; mutex_exit(&phyint->phyint_lock); @@ -16474,6 +8932,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip2dbg(("DL_OK_ACK %s (0x%x)\n", dl_primstr((int)dloa->dl_correct_primitive), dloa->dl_correct_primitive)); + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok", + char *, dl_primstr(dloa->dl_correct_primitive), + ill_t *, ill); + switch (dloa->dl_correct_primitive) { case DL_ENABMULTI_REQ: case DL_DISABMULTI_REQ: @@ -16502,6 +8964,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ ASSERT(err != EINPROGRESS); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill, + ipif_t *, NULL); + switch (ipsq->ipsq_xop->ipx_current_ioctl) { case 0: ipsq_current_finish(ipsq); @@ -16595,7 +9061,10 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { ill->ill_dlpi_fastpath_state = IDS_FAILED; mutex_exit(&ill->ill_lock); - ill_fastpath_nack(ill); + /* + * don't flush the nce_t entries: we use them + * as an index to the ncec itself. 
+ */ ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n", ill->ill_name)); } else { @@ -16611,235 +9080,24 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } /* - * NOTE : This function does not ire_refrele the ire argument passed in. - * - * IPQoS notes - * IP policy is invoked twice for a forwarded packet, once on the read side - * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are - * enabled. An additional parameter, in_ill, has been added for this purpose. - * Note that in_ill could be NULL when called from ip_rput_forward_multicast - * because ip_mroute drops this information. - * + * Update any source route, record route or timestamp options + * When it fails it has consumed the message and BUMPed the MIB. */ -void -ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) -{ - uint32_t old_pkt_len; - uint32_t pkt_len; - queue_t *q; - uint32_t sum; -#define rptr ((uchar_t *)ipha) - uint32_t max_frag; - uint32_t ill_index; - ill_t *out_ill; - mib2_ipIfStatsEntry_t *mibptr; - ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; - - /* Get the ill_index of the incoming ILL */ - ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; - mibptr = (in_ill != NULL) ? in_ill->ill_ip_mib : &ipst->ips_ip_mib; - - /* Initiate Read side IPPF processing */ - if (IPP_ENABLED(IPP_FWD_IN, ipst)) { - ip_process(IPP_FWD_IN, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ - "during IPPF processing\n")); - return; - } - } - - /* Adjust the checksum to reflect the ttl decrement. */ - sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; - ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); - - if (ipha->ipha_ttl-- <= 1) { - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - /* - * Note: ire_stq this will be NULL for multicast - * datagrams using the long path through arp (the IRE - * is not an IRE_CACHE). 
This should not cause - * problems since we don't generate ICMP errors for - * multicast packets. - */ - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - q = ire->ire_stq; - if (q != NULL) { - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, - GLOBAL_ZONEID, ipst); - } else - freemsg(mp); - return; - } - - /* - * Don't forward if the interface is down - */ - if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { - BUMP_MIB(mibptr, ipIfStatsInDiscards); - ip2dbg(("ip_rput_forward:interface is down\n")); - goto drop_pkt; - } - - /* Get the ill_index of the outgoing ILL */ - out_ill = ire_to_ill(ire); - ill_index = out_ill->ill_phyint->phyint_ifindex; - - DTRACE_PROBE4(ip4__forwarding__start, - ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_forwarding_event, - ipst->ips_ipv4firewall_forwarding, - in_ill, out_ill, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); - - if (mp == NULL) - return; - old_pkt_len = pkt_len = ntohs(ipha->ipha_length); - - if (is_system_labeled()) { - mblk_t *mp1; - - if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - goto drop_pkt; - } - /* Size may have changed */ - mp = mp1; - ipha = (ipha_t *)mp->b_rptr; - pkt_len = ntohs(ipha->ipha_length); - } - - /* Check if there are options to update */ - if (!IS_SIMPLE_IPH(ipha)) { - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - if (ip_rput_forward_options(mp, ipha, ire, ipst)) { - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - return; - } - - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - } - max_frag = ire->ire_max_frag; - if (pkt_len > max_frag) { - /* - * It needs fragging on its way out. We haven't - * verified the header checksum yet. 
Since we - * are going to put a surely good checksum in the - * outgoing header, we have to make sure that it - * was good coming in. - */ - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - /* Initiate Write side IPPF processing */ - if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { - ip_process(IPP_FWD_OUT, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ - " during IPPF processing\n")); - return; - } - } - /* - * Handle labeled packet resizing. - * - * If we have added a label, inform ip_wput_frag() of its - * effect on the MTU for ICMP messages. - */ - if (pkt_len > old_pkt_len) { - uint32_t secopt_size; - - secopt_size = pkt_len - old_pkt_len; - if (secopt_size < max_frag) - max_frag -= secopt_size; - } - - ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, - GLOBAL_ZONEID, ipst, NULL); - ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); - return; - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp == NULL) - return; - - mp->b_prev = (mblk_t *)IPP_FWD_OUT; - ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); - (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL); - /* ip_xmit_v4 always consumes the packet */ - return; - -drop_pkt:; - ip1dbg(("ip_rput_forward: drop pkt\n")); - freemsg(mp); -#undef rptr -} - -void -ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) -{ - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(!ipif->ipif_isv6); - /* - * Find an IRE which matches the destination and the outgoing - * queue in the cache table. All we need is an IRE_CACHE which - * is pointing at ipif->ipif_ill. 
- */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) - dst = ipif->ipif_pp_dst_addr; - - ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, msg_getlabel(mp), - MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); - if (ire == NULL) { - /* - * Mark this packet to make it be delivered to - * ip_rput_forward after the new ire has been - * created. - */ - mp->b_prev = NULL; - mp->b_next = mp; - ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, - NULL, 0, GLOBAL_ZONEID, &zero_info); - } else { - ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); - IRE_REFRELE(ire); - } -} - -/* Update any source route, record route or timestamp options */ -static int -ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) +boolean_t +ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill, + ip_recv_attr_t *ira) { ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; ipaddr_t dst; + ipaddr_t ifaddr; uint32_t ts; - ire_t *dst_ire = NULL; - ire_t *tmp_ire = NULL; timestruc_t now; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - ip2dbg(("ip_rput_forward_options\n")); + ip2dbg(("ip_forward_options\n")); dst = ipha->ipha_dst; for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; @@ -16847,7 +9105,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", + ip2dbg(("ip_forward_options: opt %d, len %d\n", optval, opts.ipoptp_len)); switch (optval) { uint32_t off; @@ -16855,27 +9113,17 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) case IPOPT_LSRR: /* Check if adminstratively disabled */ if (!ipst->ips_ip_forward_src_routed) { - if (ire->ire_stq != NULL) { - /* - * Sent by forwarding path, and router - * is global zone - */ - icmp_unreachable(ire->ire_stq, mp, - ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - 
ip0dbg(("ip_rput_forward_options: " - "unable to send unreach\n")); - freemsg(mp); - } - return (-1); + BUMP_MIB(dst_ill->ill_ip_mib, + ipIfStatsForwProhibits); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", + mp, dst_ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, + ira); + return (B_FALSE); } - - dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* - * Must be partial since ip_rput_options + * Must be partial since ip_input_options * checked for strict. */ break; @@ -16887,31 +9135,33 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* End of source route */ ip1dbg(( - "ip_rput_forward_options: end of SR\n")); - ire_refrele(dst_ire); + "ip_forward_options: end of SR\n")); break; } + /* Pick a reasonable address on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); - ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); + ip1dbg(("ip_forward_options: next hop 0x%x\n", ntohl(dst))); /* * Check if our address is present more than * once as consecutive hops in source route. 
*/ - tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (tmp_ire != NULL) { - ire_refrele(tmp_ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; opt[IPOPT_OFFSET] += IP_ADDR_LEN; goto redo_srr; } ipha->ipha_dst = dst; opt[IPOPT_OFFSET] += IP_ADDR_LEN; - ire_refrele(dst_ire); break; case IPOPT_RR: off = opt[IPOPT_OFFSET]; @@ -16920,11 +9170,18 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_rput_forward_options: end of RR\n")); + "ip_forward_options: end of RR\n")); break; } - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable address on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; break; case IPOPT_TS: @@ -16938,14 +9195,10 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - dst_ire = ire_ctable_lookup(dst, 0, - IRE_LOCAL, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(dst_ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -16955,9 +9208,9 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_rput_forward_options: " - "unknown IT - bug in ip_rput_options?\n"); - return (0); /* Keep "lint" happy */ + cmn_err(CE_PANIC, "ip_forward_options: " + "unknown IT - bug in ip_input_options?\n"); + return (B_TRUE); /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { /* Increase overflow counter */ @@ -16972,8 +9225,15 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) case IPOPT_TS_PRESPEC: case IPOPT_TS_PRESPEC_RFC791: case IPOPT_TS_TSANDADDR: - bcopy(&ire->ire_src_addr, - (char *)opt + off, IP_ADDR_LEN); + /* Pick a reasonable addr on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, + dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr, + NULL, NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; /* FALLTHRU */ case IPOPT_TS_TSONLY: @@ -16989,223 +9249,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) break; } } - return (0); -} - -/* - * This is called after processing at least one of AH/ESP headers. - * - * NOTE: the ill corresponding to ipsec_in_ill_index may not be - * the actual, physical interface on which the packet was received, - * but, when ip_strict_dst_multihoming is set to 1, could be the - * interface which had the ipha_dst configured when the packet went - * through ip_rput. The ill_index corresponding to the recv_ill - * is saved in ipsec_in_rill_index - * - * NOTE2: The "ire" argument is only used in IPv4 cases. This function - * cannot assume "ire" points to valid data for any IPv6 cases. 
- */ -void -ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) -{ - mblk_t *mp; - ipaddr_t dst; - in6_addr_t *v6dstp; - ipha_t *ipha; - ip6_t *ip6h; - ipsec_in_t *ii; - boolean_t ill_need_rele = B_FALSE; - boolean_t rill_need_rele = B_FALSE; - boolean_t ire_need_rele = B_FALSE; - netstack_t *ns; - ip_stack_t *ipst; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_ill_index != 0); - ns = ii->ipsec_in_ns; - ASSERT(ii->ipsec_in_ns != NULL); - ipst = ns->netstack_ip; - - mp = ipsec_mp->b_cont; - ASSERT(mp != NULL); - - if (ill == NULL) { - ASSERT(recv_ill == NULL); - /* - * We need to get the original queue on which ip_rput_local - * or ip_rput_data_v6 was called. - */ - ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, - !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst); - ill_need_rele = B_TRUE; - - if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) { - recv_ill = ill_lookup_on_ifindex( - ii->ipsec_in_rill_index, !ii->ipsec_in_v4, - NULL, NULL, NULL, NULL, ipst); - rill_need_rele = B_TRUE; - } else { - recv_ill = ill; - } - - if ((ill == NULL) || (recv_ill == NULL)) { - ip0dbg(("ip_fanout_proto_again: interface " - "disappeared\n")); - if (ill != NULL) - ill_refrele(ill); - if (recv_ill != NULL) - ill_refrele(recv_ill); - freemsg(ipsec_mp); - return; - } - } - - ASSERT(ill != NULL && recv_ill != NULL); - - if (mp->b_datap->db_type == M_CTL) { - /* - * AH/ESP is returning the ICMP message after - * removing their headers. Fanout again till - * it gets to the right protocol. - */ - if (ii->ipsec_in_v4) { - icmph_t *icmph; - int iph_hdr_length; - int hdr_length; - - ipha = (ipha_t *)mp->b_rptr; - iph_hdr_length = IPH_HDR_LENGTH(ipha); - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - hdr_length = IPH_HDR_LENGTH(ipha); - /* - * icmp_inbound_error_fanout may need to do pullupmsg. - * Reset the type to M_DATA. 
- */ - mp->b_datap->db_type = M_DATA; - icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp, - icmph, ipha, iph_hdr_length, hdr_length, B_TRUE, - B_FALSE, ill, ii->ipsec_in_zoneid); - } else { - icmp6_t *icmp6; - int hdr_length; - - ip6h = (ip6_t *)mp->b_rptr; - /* Don't call hdr_length_v6() unless you have to. */ - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; - - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - /* - * icmp_inbound_error_fanout_v6 may need to do - * pullupmsg. Reset the type to M_DATA. - */ - mp->b_datap->db_type = M_DATA; - icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, - ip6h, icmp6, ill, recv_ill, B_TRUE, - ii->ipsec_in_zoneid); - } - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); - return; - } - - if (ii->ipsec_in_v4) { - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - if (CLASSD(dst)) { - /* - * Multicast has to be delivered to all streams. - */ - dst = INADDR_BROADCAST; - } - - if (ire == NULL) { - ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid, - msg_getlabel(mp), ipst); - if (ire == NULL) { - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); - ip1dbg(("ip_fanout_proto_again: " - "IRE not found")); - freemsg(ipsec_mp); - return; - } - ire_need_rele = B_TRUE; - } - - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill); - if (ire_need_rele) - ire_refrele(ire); - break; - case IPPROTO_TCP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - mp = ip_tcp_input(mp, ipha, ill, B_TRUE, - ire, ipsec_mp, 0, ill->ill_rq, NULL); - IRE_REFRELE(ire); - if (mp != NULL) { - SQUEUE_ENTER(GET_SQUEUE(mp), mp, - mp, 1, SQ_PROCESS, - SQTAG_IP_PROTO_AGAIN); - } - break; - case IPPROTO_SCTP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - ip_sctp_input(mp, ipha, ill, B_TRUE, ire, - ipsec_mp, 0, ill->ill_rq, dst); - break; - case IPPROTO_ENCAP: - case 
IPPROTO_IPV6: - if (ip_iptun_input(ipsec_mp, mp, ipha, ill, ire, - ill->ill_ipst)) { - /* - * If we made it here, we don't need to worry - * about the raw-socket/protocol fanout. - */ - if (ire_need_rele) - ire_refrele(ire); - break; - } - /* else FALLTHRU */ - default: - ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill, 0); - if (ire_need_rele) - ire_refrele(ire); - break; - } - } else { - uint32_t rput_flags = 0; - - ip6h = (ip6_t *)mp->b_rptr; - v6dstp = &ip6h->ip6_dst; - /* - * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast - * address. - * - * Currently, we don't store that state in the IPSEC_IN - * message, and we may need to. - */ - rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? - IP6_IN_LLMCAST : 0); - ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, - NULL, NULL); - } - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); + return (B_TRUE); } /* @@ -17290,609 +9334,25 @@ ill_frag_timer_start(ill_t *ill) } /* - * This routine is needed for loopback when forwarding multicasts. - * - * IPQoS Notes: - * IPPF processing is done in fanout routines. - * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, - * processing for IPsec packets is done when it comes back in clear. - * NOTE : The callers of this function need to do the ire_refrele for the - * ire that is being passed in. 
- */ -void -ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ill_t *recv_ill, uint32_t esp_udp_ports) -{ - boolean_t esp_in_udp_packet = (esp_udp_ports != 0); - ill_t *ill = (ill_t *)q->q_ptr; - uint32_t sum; - uint32_t u1; - uint32_t u2; - int hdr_length; - boolean_t mctl_present; - mblk_t *first_mp = mp; - mblk_t *hada_mp = NULL; - ipha_t *inner_ipha; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, - "ip_rput_locl_start: q %p", q); - - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(ill != NULL); - -#define rptr ((uchar_t *)ipha) -#define iphs ((uint16_t *)ipha) - - /* - * no UDP or TCP packet should come here anymore. - */ - ASSERT(ipha->ipha_protocol != IPPROTO_TCP && - ipha->ipha_protocol != IPPROTO_UDP); - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present && - ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { - ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); - - /* - * It's an IPsec accelerated packet. - * Keep a pointer to the data attributes around until - * we allocate the ipsec_info_t. - */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); - hada_mp = first_mp; - hada_mp->b_cont = NULL; - /* - * Since it is accelerated, it comes directly from - * the ill and the data attributes is followed by - * the packet data. - */ - ASSERT(mp->b_datap->db_type != M_CTL); - first_mp = mp; - mctl_present = B_FALSE; - } - - /* - * IF M_CTL is not present, then ipsec_in_is_secure - * should return B_TRUE. There is a case where loopback - * packets has an M_CTL in the front with all the - * IPsec options set to IPSEC_PREF_NEVER - which means - * ipsec_in_is_secure will return B_FALSE. As loopback - * packets never comes here, it is safe to ASSERT the - * following. 
- */ - ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); - - /* - * Also, we should never have an mctl_present if this is an - * ESP-in-UDP packet. - */ - ASSERT(!mctl_present || !esp_in_udp_packet); - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* - * Don't verify header checksum if we just removed UDP header or - * packet is coming back from AH/ESP. - */ - if (!esp_in_udp_packet && !mctl_present) { - if (u1) { - if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { - if (hada_mp != NULL) - freemsg(hada_mp); - return; - } - } else { - /* Check the IP header checksum. */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - goto drop_pkt; - } - } - } - - /* - * Count for SNMP of inbound packets for ire. As ip_proto_input - * might be called more than once for secure packets, count only - * the first time. - */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* Check for fragmentation offset. */ - u2 = ntohs(ipha->ipha_fragment_offset_and_flags); - u1 = u2 & (IPH_MF | IPH_OFFSET); - if (u1) { - /* - * We re-assemble fragments before we do the AH/ESP - * processing. Thus, M_CTL should not be present - * while we are re-assembling. - */ - ASSERT(!mctl_present); - ASSERT(first_mp == mp); - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) - return; - - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* - * Clear hardware checksumming flag as it is currently only - * used by TCP and UDP. 
- */ - DB_CKSUMFLAGS(mp) = 0; - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - switch (ipha->ipha_protocol) { - case IPPROTO_ICMP: { - ire_t *ire_zone; - ilm_t *ilm; - mblk_t *mp1; - zoneid_t last_zoneid; - ilm_walker_t ilw; - - if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { - ASSERT(ire->ire_type == IRE_BROADCAST); - - /* - * In the multicast case, applications may have joined - * the group from different zones, so we need to deliver - * the packet to each of them. Loop through the - * multicast memberships structures (ilm) on the receive - * ill and send a copy of the packet up each matching - * one. However, we don't do this for multicasts sent on - * the loopback interface (PHYI_LOOPBACK flag set) as - * they must stay in the sender's zone. - * - * ilm_add_v6() ensures that ilms in the same zone are - * contiguous in the ill_ilm list. We use this property - * to avoid sending duplicates needed when two - * applications in the same zone join the same group on - * different logical interfaces: we ignore the ilm if - * its zoneid is the same as the last matching one. - * In addition, the sending of the packet for - * ire_zoneid is delayed until all of the other ilms - * have been exhausted. - */ - last_zoneid = -1; - ilm = ilm_walker_start(&ilw, recv_ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ipha->ipha_dst != ilm->ilm_addr || - ilm->ilm_zoneid == last_zoneid || - ilm->ilm_zoneid == ire->ire_zoneid || - ilm->ilm_zoneid == ALL_ZONES || - !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) - continue; - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, - 0, sum, mctl_present, B_TRUE, - recv_ill, ilm->ilm_zoneid); - last_zoneid = ilm->ilm_zoneid; - } - ilm_walker_finish(&ilw); - } else if (ire->ire_type == IRE_BROADCAST) { - /* - * In the broadcast case, there may be many zones - * which need a copy of the packet delivered to them. 
- * There is one IRE_BROADCAST per broadcast address - * and per zone; we walk those using a helper function. - * In addition, the sending of the packet for ire is - * delayed until all of the other ires have been - * processed. - */ - IRB_REFHOLD(ire->ire_bucket); - ire_zone = NULL; - while ((ire_zone = ire_get_next_bcast_ire(ire_zone, - ire)) != NULL) { - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - - UPDATE_IB_PKT_COUNT(ire_zone); - ire_zone->ire_last_used_time = lbolt; - icmp_inbound(q, mp1, B_TRUE, ill, - 0, sum, mctl_present, B_TRUE, - recv_ill, ire_zone->ire_zoneid); - } - IRB_REFRELE(ire->ire_bucket); - } - icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), - ill, 0, sum, mctl_present, B_TRUE, recv_ill, - ire->ire_zoneid); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "icmp"); - return; - } - case IPPROTO_IGMP: - /* - * If we are not willing to accept IGMP packets in clear, - * then check with global policy. - */ - if (ipst->ips_igmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { - freemsg(first_mp); - ip1dbg(("ip_proto_input: zone all cannot accept raw")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - if ((mp = igmp_input(q, mp, ill)) == NULL) { - /* Bad packet - discarded by igmp_input */ - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "igmp"); - if (mctl_present) - freeb(first_mp); - return; - } - /* - * igmp_input() may have returned the pulled up message. - * So first_mp and ipha need to be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 
- connf_head != NULL) { - /* No user-level listener for IGMP packets */ - goto drop_pkt; - } - /* deliver to local raw users */ - break; - case IPPROTO_PIM: - /* - * If we are not willing to accept PIM packets in clear, - * then check with global policy. - */ - if (ipst->ips_pim_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { - freemsg(first_mp); - ip1dbg(("ip_proto_input: zone all cannot accept PIM")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - if (pim_input(q, mp, ill) != 0) { - /* Bad packet - discarded by pim_input */ - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "pim"); - if (mctl_present) - freeb(first_mp); - return; - } - - /* - * pim_input() may have pulled up the message so ipha needs to - * be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. - connf_head != NULL) { - /* No user-level listener for PIM packets */ - goto drop_pkt; - } - /* deliver to local raw users */ - break; - case IPPROTO_ENCAP: - /* - * Handle self-encapsulated packets (IP-in-IP where - * the inner addresses == the outer addresses). - */ - hdr_length = IPH_HDR_LENGTH(ipha); - if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - sizeof (ipha_t) - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - ipha = (ipha_t *)mp->b_rptr; - } - inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); - /* - * Check the sanity of the inner IP header. 
- */ - if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if (inner_ipha->ipha_src == ipha->ipha_src && - inner_ipha->ipha_dst == ipha->ipha_dst) { - ipsec_in_t *ii; - - /* - * Self-encapsulated tunnel packet. Remove - * the outer IP header and fanout again. - * We also need to make sure that the inner - * header is pulled up until options. - */ - mp->b_rptr = (uchar_t *)inner_ipha; - ipha = inner_ipha; - hdr_length = IPH_HDR_LENGTH(ipha); - if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + - + hdr_length - mp->b_rptr)) { - freemsg(first_mp); - return; - } - ipha = (ipha_t *)mp->b_rptr; - } - if (hdr_length > sizeof (ipha_t)) { - /* We got options on the inner packet. */ - ipaddr_t dst = ipha->ipha_dst; - - if (ip_rput_options(q, mp, ipha, &dst, ipst) == - -1) { - /* Bad options! */ - return; - } - if (dst != ipha->ipha_dst) { - /* - * Someone put a source-route in - * the inside header of a self- - * encapsulated packet. Drop it - * with extreme prejudice and let - * the sender know. - */ - icmp_unreachable(q, first_mp, - ICMP_SOURCE_ROUTE_FAILED, - recv_ill->ill_zoneid, ipst); - return; - } - } - if (!mctl_present) { - ASSERT(first_mp == mp); - /* - * This means that somebody is sending - * Self-encapsualted packets without AH/ESP. - * If AH/ESP was present, we would have already - * allocated the first_mp. - * - * Send this packet to find a tunnel endpoint. - * if I can't find one, an ICMP - * PROTOCOL_UNREACHABLE will get sent. - */ - goto fanout; - } - /* - * We generally store the ill_index if we need to - * do IPsec processing as we lose the ill queue when - * we come back. 
But in this case, we never should - * have to store the ill_index here as it should have - * been stored previously when we processed the - * AH/ESP header in this routine or for non-ipsec - * cases, we still have the queue. But for some bad - * packets from the wire, we can get to IPsec after - * this and we better store the index for that case. - */ - ill = (ill_t *)q->q_ptr; - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - if (ii->ipsec_in_decaps) { - /* - * This packet is self-encapsulated multiple - * times. We don't want to recurse infinitely. - * To keep it simple, drop the packet. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - ii->ipsec_in_decaps = B_TRUE; - ip_fanout_proto_again(first_mp, recv_ill, recv_ill, - ire); - return; - } - break; - case IPPROTO_AH: - case IPPROTO_ESP: { - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - /* - * Fast path for AH/ESP. If this is the first time - * we are sending a datagram to AH/ESP, allocate - * a IPSEC_IN message and prepend it. Otherwise, - * just fanout. - */ - - int ipsec_rc; - ipsec_in_t *ii; - netstack_t *ns = ipst->ips_netstack; - - IP_STAT(ipst, ipsec_proto_ahesp); - if (!mctl_present) { - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ns); - if (first_mp == NULL) { - ip1dbg(("ip_proto_input: IPSEC_IN " - "allocation failure.\n")); - freemsg(hada_mp); /* okay ifnull */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - /* - * Store the ill_index so that when we come back - * from IPsec we ride on the same queue. - */ - ill = (ill_t *)q->q_ptr; - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_local: caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - hada_mp = NULL; - } - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - } - - ii->ipsec_in_esp_udp_ports = esp_udp_ports; - - if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, - ire->ire_zoneid, ipst); - return; - } - - ns = ipst->ips_netstack; - /* select inbound SA and have IPsec process the pkt */ - if (ipha->ipha_protocol == IPPROTO_ESP) { - esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); - boolean_t esp_in_udp_sa; - if (esph == NULL) - return; - ASSERT(ii->ipsec_in_esp_sa != NULL); - ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); - esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & - IPSA_F_NATT) != 0); - /* - * The following is a fancy, but quick, way of saying: - * ESP-in-UDP SA and Raw ESP packet --> drop - * OR - * ESP SA and ESP-in-UDP packet --> drop - */ - if (esp_in_udp_sa != esp_in_udp_packet) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip_drop_packet(first_mp, B_TRUE, ill, NULL, - DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), - &ns->netstack_ipsec->ipsec_dropper); - return; - } - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( - first_mp, esph); - } else { - ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); - if (ah == NULL) - return; - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( - first_mp, ah); - } - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; - } - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, recv_ill, ire); - return; - } - default: - break; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("ip_proto_input: zone %d cannot 
accept raw IP", - ire->ire_zoneid)); - goto drop_pkt; - } - /* - * Handle protocols with which IP is less intimate. There - * can be more than one stream bound to a particular - * protocol. When this is the case, each one gets a copy - * of any incoming packets. - */ -fanout: - ip_fanout_proto(q, first_mp, ill, ipha, - IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, - B_TRUE, recv_ill, ire->ire_zoneid); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); - return; - -drop_pkt: - freemsg(first_mp); - if (hada_mp != NULL) - freeb(hada_mp); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "droppkt"); -#undef rptr -#undef iphs - -} - -/* * Update any source route, record route or timestamp options. * Check that we are at end of strict source route. - * The options have already been checked for sanity in ip_rput_options(). + * The options have already been checked for sanity in ip_input_options(). */ -static boolean_t -ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ip_stack_t *ipst) +boolean_t +ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; ipaddr_t dst; + ipaddr_t ifaddr; uint32_t ts; - ire_t *dst_ire; timestruc_t now; - zoneid_t zoneid; - ill_t *ill; - - ASSERT(ire->ire_ipversion == IPV4_VERSION); + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - ip2dbg(("ip_rput_local_options\n")); + ip2dbg(("ip_input_local_options\n")); for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; @@ -17900,7 +9360,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_local_options: opt %d, len %d\n", + ip2dbg(("ip_input_local_options: opt %d, len %d\n", optval, optlen)); switch (optval) { uint32_t off; @@ -17911,7 +9371,7 @@ 
ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, if (optlen < IP_ADDR_LEN || off > optlen - IP_ADDR_LEN) { /* End of source route */ - ip1dbg(("ip_rput_local_options: end of SR\n")); + ip1dbg(("ip_input_local_options: end of SR\n")); break; } /* @@ -17920,7 +9380,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * it is a packet with a loose source route which * reaches us before consuming the whole source route */ - ip1dbg(("ip_rput_local_options: not end of SR\n")); + ip1dbg(("ip_input_local_options: not end of SR\n")); if (optval == IPOPT_SSRR) { goto bad_src_route; } @@ -17941,11 +9401,17 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_rput_local_options: end of RR\n")); + "ip_input_local_options: end of RR\n")); break; } - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable address on the outbound if */ + if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; break; case IPOPT_TS: @@ -17959,14 +9425,10 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(dst_ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -17976,8 +9438,8 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_rput_local_options: " - "unknown IT - bug in ip_rput_options?\n"); + cmn_err(CE_PANIC, "ip_input_local_options: " + "unknown IT - bug in ip_input_options?\n"); return (B_TRUE); /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { @@ -17993,8 +9455,14 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, case IPOPT_TS_PRESPEC: case IPOPT_TS_PRESPEC_RFC791: case IPOPT_TS_TSANDADDR: - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable addr on the outbound if */ + if (ip_select_source_v4(ill, INADDR_ANY, + ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst, + &ifaddr, NULL, NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; /* FALLTHRU */ case IPOPT_TS_TSONLY: @@ -18013,51 +9481,41 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, return (B_TRUE); bad_src_route: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); return (B_FALSE); } /* - * Process IP options in an inbound packet. If an option affects the - * effective destination address, return the next hop address via dstp. - * Returns -1 if something fails in which case an ICMP error has been sent + * Process IP options in an inbound packet. Always returns the nexthop. + * Normally this is the passed in nexthop, but if there is an option + * that effects the nexthop (such as a source route) that will be returned. 
+ * Sets *errorp if there is an error, in which case an ICMP error has been sent * and mp freed. */ -static int -ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, - ip_stack_t *ipst) +ipaddr_t +ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp, + ip_recv_attr_t *ira, int *errorp) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; - ipaddr_t dst; intptr_t code = 0; - ire_t *ire = NULL; - zoneid_t zoneid; - ill_t *ill; + ire_t *ire; - ip2dbg(("ip_rput_options\n")); - dst = ipha->ipha_dst; + ip2dbg(("ip_input_options\n")); + *errorp = 0; for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_options: opt %d, len %d\n", + ip2dbg(("ip_input_options: opt %d, len %d\n", optval, optlen)); /* * Note: we need to verify the checksum before we @@ -18068,27 +9526,24 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, uint32_t off; case IPOPT_SSRR: case IPOPT_LSRR: - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { if (optval == IPOPT_SSRR) { - ip1dbg(("ip_rput_options: not next" + ip1dbg(("ip_input_options: not next" " strict source route 0x%x\n", ntohl(dst))); code = (char *)&ipha->ipha_dst - (char *)ipha; goto param_prob; /* RouterReq's */ } - ip2dbg(("ip_rput_options: " + ip2dbg(("ip_input_options: " "not next source route 0x%x\n", ntohl(dst))); break; } - ire_refrele(ire); if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -18099,11 +9554,11 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, if (optlen < IP_ADDR_LEN || off > optlen - IP_ADDR_LEN) { /* End of source route */ - 
ip1dbg(("ip_rput_options: end of SR\n")); + ip1dbg(("ip_input_options: end of SR\n")); break; } bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - ip1dbg(("ip_rput_options: next hop 0x%x\n", + ip1dbg(("ip_input_options: next hop 0x%x\n", ntohl(dst))); /* @@ -18112,17 +9567,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, * XXX verify per-interface ip_forwarding * for source route? */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; goto redo_srr; } if (dst == htonl(INADDR_LOOPBACK)) { - ip1dbg(("ip_rput_options: loopback addr in " + ip1dbg(("ip_input_options: loopback addr in " "source route!\n")); goto bad_src_route; } @@ -18131,12 +9582,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, * reachable. */ if (optval == IPOPT_SSRR) { - ire = ire_ftable_lookup(dst, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); + ire = ire_ftable_lookup_v4(dst, 0, 0, + IRE_IF_ALL, NULL, ALL_ZONES, + ira->ira_tsl, + MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst, + NULL); if (ire == NULL) { - ip1dbg(("ip_rput_options: SSRR not " + ip1dbg(("ip_input_options: SSRR not " "directly reachable: 0x%x\n", ntohl(dst))); goto bad_src_route; @@ -18151,7 +9603,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, case IPOPT_RR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -18169,7 +9621,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, } if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 
goto param_prob; @@ -18201,45 +9653,27 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, } if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { - *dstp = dst; - return (0); + return (dst); } - ip1dbg(("ip_rput_options: error processing IP options.")); + ip1dbg(("ip_input_options: error processing IP options.")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; param_prob: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - /* Don't know whether this is for non-global or global/forwarding */ - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst); - return (-1); + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill); + icmp_param_problem(mp, (uint8_t)code, ira); + *errorp = -1; + return (dst); bad_src_route: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); - return (-1); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + *errorp = -1; + return (dst); } /* @@ -18248,7 +9682,7 @@ bad_src_route: * - icmp fixed part (mib2_icmp_t) * - ipAddrEntryTable (ip 20) all IPv4 ipifs * - ipRouteEntryTable (ip 21) all IPv4 IREs - * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] + * - ipNetToMediaEntryTable (ip 22) all IPv4 Neighbor Cache entries * - ipRouteAttributeTable (ip 102) labeled routes * - ip multicast membership (ip_member_t) * - ip multicast source filtering (ip_grpsrc_t) @@ -18262,13 +9696,11 @@ bad_src_route: * One per ill plus one generic * - ipv6RouteEntry all IPv6 IREs * - 
ipv6RouteAttributeTable (ip6 102) labeled routes - * - ipv6NetToMediaEntry all Neighbor Cache entries + * - ipv6NetToMediaEntry all IPv6 Neighbor Cache entries * - ipv6AddrEntry all IPv6 ipifs * - ipv6 multicast membership (ipv6_member_t) * - ipv6 multicast source filtering (ipv6_grpsrc_t) * - * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. - * * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is * already filled in by the caller. * Return value of 0 indicates that no messages were sent and caller @@ -18387,6 +9819,9 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { return (1); } + if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) { + return (1); + } freemsg(mpctl); return (1); } @@ -18426,6 +9861,7 @@ ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, SET_MIB(old_ip_mib.ipRouteAttributeSize, sizeof (mib2_ipAttributeEntry_t)); SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); + SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t)); /* * Grab the statistics from the new IP MIB @@ -18681,9 +10117,14 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) if (ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + /* Sum of count from dead IRE_LO* and our current */ mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; - mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; - mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; + if (ipif->ipif_ire_local != NULL) { + mae.ipAdEntInfo.ae_ibcnt += + ipif->ipif_ire_local->ire_ib_pkt_count; + } + mae.ipAdEntInfo.ae_obcnt = 0; + mae.ipAdEntInfo.ae_focnt = 0; ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, OCTET_LENGTH); @@ -18694,7 +10135,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; mae.ipAdEntInfo.ae_subnet_len = 
ip_mask_to_plen(ipif->ipif_net_mask); - mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; + mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr; for (bitval = 1; bitval && !(bitval & ipif->ipif_brd_addr); @@ -18702,7 +10143,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) noop; mae.ipAdEntBcastAddr = bitval; mae.ipAdEntReasmMaxSize = IP_MAXPACKET; - mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; + mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu; mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; mae.ipAdEntInfo.ae_broadcast_addr = ipif->ipif_brd_addr; @@ -18710,7 +10151,8 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ipif->ipif_pp_dst_addr; mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | ill->ill_flags | ill->ill_phyint->phyint_flags; - mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; + mae.ipAdEntRetransmitTime = + ill->ill_reachable_retrans_time; if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { @@ -18762,9 +10204,14 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) if (ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + /* Sum of count from dead IRE_LO* and our current */ mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; - mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; - mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; + if (ipif->ipif_ire_local != NULL) { + mae6.ipv6AddrInfo.ae_ibcnt += + ipif->ipif_ire_local->ire_ib_pkt_count; + } + mae6.ipv6AddrInfo.ae_obcnt = 0; + mae6.ipv6AddrInfo.ae_focnt = 0; ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, OCTET_LENGTH); @@ -18776,7 +10223,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; mae6.ipv6AddrInfo.ae_subnet_len = mae6.ipv6AddrPfxLength; - mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; + mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr; /* Type: stateless(1), 
stateful(2), unknown(3) */ if (ipif->ipif_flags & IPIF_ADDRCONF) @@ -18799,7 +10246,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae6.ipv6AddrStatus = 2; else mae6.ipv6AddrStatus = 1; - mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; + mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu; mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; mae6.ipv6AddrInfo.ae_pp_dst_addr = ipif->ipif_v6pp_dst_addr; @@ -18842,7 +10289,6 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18859,36 +10305,49 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; - ilm = ilm_walker_start(&ilw, ill); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; /* not this zone */ - ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, - OCTET_LENGTH); + /* Is there an ipif for ilm_ifaddr? 
*/ + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(ipif) && + ipif->ipif_lcl_addr == ilm->ilm_ifaddr && + ilm->ilm_ifaddr != INADDR_ANY) + break; + } + if (ipif != NULL) { + ipif_get_name(ipif, + ipm.ipGroupMemberIfIndex.o_bytes, + OCTET_LENGTH); + } else { + ill_get_name(ill, + ipm.ipGroupMemberIfIndex.o_bytes, + OCTET_LENGTH); + } ipm.ipGroupMemberIfIndex.o_length = mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif != NULL); - ASSERT(ilm->ilm_ill == NULL); - if (ilm->ilm_ipif != ipif) - continue; - ipm.ipGroupMemberAddress = ilm->ilm_addr; - ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; - ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; - if (!snmp_append_data2(mpctl->b_cont, &mp_tail, - (char *)&ipm, (int)sizeof (ipm))) { - ip1dbg(("ip_snmp_get_mib2_ip_group: " - "failed to allocate %u bytes\n", - (uint_t)sizeof (ipm))); - } + + ipm.ipGroupMemberAddress = ilm->ilm_addr; + ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; + ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; + if (!snmp_append_data2(mpctl->b_cont, &mp_tail, + (char *)&ipm, (int)sizeof (ipm))) { + ip1dbg(("ip_snmp_get_mib2_ip_group: " + "failed to allocate %u bytes\n", + (uint_t)sizeof (ipm))); } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -18910,7 +10369,6 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18926,15 +10384,19 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure 
the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; - - ilm = ilm_walker_start(&ilw, ill); + rw_exit(&ipst->ips_ill_g_lock); + /* + * Normally we don't have any members on under IPMP interfaces. + * We report them as a debugging aid. + */ + rw_enter(&ill->ill_mcast_lock, RW_READER); ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif == NULL); - ASSERT(ilm->ilm_ill != NULL); - if (ilm->ilm_zoneid != zoneid) + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) continue; /* not this zone */ ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; @@ -18947,7 +10409,9 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) (uint_t)sizeof (ipm6))); } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); @@ -18973,7 +10437,6 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18990,43 +10453,56 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. 
*/ + if (!ill_check_and_refhold(ill)) continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + sl = ilm->ilm_filter; + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; + if (SLIST_IS_EMPTY(sl)) + continue; - ilm = ilm_walker_start(&ilw, ill); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_zoneid != zoneid) - continue; /* not this zone */ - ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, - OCTET_LENGTH); + /* Is there an ipif for ilm_ifaddr? */ + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(ipif) && + ipif->ipif_lcl_addr == ilm->ilm_ifaddr && + ilm->ilm_ifaddr != INADDR_ANY) + break; + } + if (ipif != NULL) { + ipif_get_name(ipif, + ips.ipGroupSourceIfIndex.o_bytes, + OCTET_LENGTH); + } else { + ill_get_name(ill, + ips.ipGroupSourceIfIndex.o_bytes, + OCTET_LENGTH); + } ips.ipGroupSourceIfIndex.o_length = mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif != NULL); - ASSERT(ilm->ilm_ill == NULL); - sl = ilm->ilm_filter; - if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) + + ips.ipGroupSourceGroup = ilm->ilm_addr; + for (i = 0; i < sl->sl_numsrc; i++) { + if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i])) continue; - ips.ipGroupSourceGroup = ilm->ilm_addr; - for (i = 0; i < sl->sl_numsrc; i++) { - if (!IN6_IS_ADDR_V4MAPPED( - &sl->sl_addr[i])) - continue; - IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], - ips.ipGroupSourceAddress); - if (snmp_append_data2(mpctl->b_cont, - &mp_tail, (char *)&ips, - (int)sizeof (ips)) == 0) { - ip1dbg(("ip_snmp_get_mib2_" - "ip_group_src: failed to " - "allocate %u bytes\n", - (uint_t)sizeof (ips))); - } + IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], + ips.ipGroupSourceAddress); + if (snmp_append_data2(mpctl->b_cont, &mp_tail, + (char *)&ips, (int)sizeof (ips)) == 0) { + 
ip1dbg(("ip_snmp_get_mib2_ip_group_src:" + " failed to allocate %u bytes\n", + (uint_t)sizeof (ips))); } } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19050,7 +10526,6 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; - ilm_walker_t ilw; /* * make a copy of the original message @@ -19066,16 +10541,22 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; - - ilm = ilm_walker_start(&ilw, ill); + rw_exit(&ipst->ips_ill_g_lock); + /* + * Normally we don't have any members on under IPMP interfaces. + * We report them as a debugging aid. 
+ */ + rw_enter(&ill->ill_mcast_lock, RW_READER); ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif == NULL); - ASSERT(ilm->ilm_ill != NULL); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { sl = ilm->ilm_filter; - if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; + if (SLIST_IS_EMPTY(sl)) continue; ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; for (i = 0; i < sl->sl_numsrc; i++) { @@ -19089,7 +10570,9 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); @@ -19189,13 +10672,13 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; /* - * If the level has been set the special EXPER_IP_AND_TESTHIDDEN - * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * If the level has been set the special EXPER_IP_AND_ALL_IRES value, + * then also include ire_testhidden IREs and IRE_IF_CLONE. This is * intended a temporary solution until a proper MIB API is provided * that provides complete filtering/caller-opt-in. 
*/ - if (level == EXPER_IP_AND_TESTHIDDEN) - ird.ird_flags |= IRD_REPORT_TESTHIDDEN; + if (level == EXPER_IP_AND_ALL_IRES) + ird.ird_flags |= IRD_REPORT_ALL; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); @@ -19210,6 +10693,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, qreply(q, mpctl); /* ipNetToMediaEntryTable in mp3ctl */ + ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst); + optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_IP; optp->name = MIB2_IP_MEDIA; @@ -19272,13 +10757,13 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; /* - * If the level has been set the special EXPER_IP_AND_TESTHIDDEN - * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * If the level has been set the special EXPER_IP_AND_ALL_IRES value, + * then also include ire_testhidden IREs and IRE_IF_CLONE. This is * intended a temporary solution until a proper MIB API is provided * that provides complete filtering/caller-opt-in. 
*/ - if (level == EXPER_IP_AND_TESTHIDDEN) - ird.ird_flags |= IRD_REPORT_TESTHIDDEN; + if (level == EXPER_IP_AND_ALL_IRES) + ird.ird_flags |= IRD_REPORT_ALL; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); @@ -19292,7 +10777,7 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, qreply(q, mpctl); /* ipv6NetToMediaEntryTable in mp3ctl */ - ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); + ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_IP6; @@ -19487,21 +10972,20 @@ static void ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) { ill_t *ill; - ipif_t *ipif; mib2_ipRouteEntry_t *re; - mib2_ipAttributeEntry_t *iae, *iaeptr; - ipaddr_t gw_addr; + mib2_ipAttributeEntry_t iaes; tsol_ire_gw_secattr_t *attrp; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - uint_t sacnt = 0; - int i; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV4_VERSION); - if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && - ire->ire_marks & IRE_MARK_TESTHIDDEN) { - return; + if (!(ird->ird_flags & IRD_REPORT_ALL)) { + if (ire->ire_testhidden) + return; + if (ire->ire_type & IRE_IF_CLONE) + return; } if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) @@ -19513,52 +10997,17 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); } - 
ASSERT(sacnt == 0 || gc != NULL); - - if (sacnt != 0 && - (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { - kmem_free(re, sizeof (*re)); - rw_exit(&gcgrp->gcgrp_rwlock); - return; - } - /* * Return all IRE types for route table... let caller pick and choose */ re->ipRouteDest = ire->ire_addr; - ipif = ire->ire_ipif; + ill = ire->ire_ill; re->ipRouteIfIndex.o_length = 0; - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - re->ipRouteIfIndex.o_length = - ill->ill_name_length == 0 ? 0 : - MIN(OCTET_LENGTH, ill->ill_name_length - 1); - bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes, - re->ipRouteIfIndex.o_length); - } else if (ipif != NULL) { - ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); + if (ill != NULL) { + ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); re->ipRouteIfIndex.o_length = mi_strlen(re->ipRouteIfIndex.o_bytes); } @@ -19567,30 +11016,45 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) re->ipRouteMetric3 = -1; re->ipRouteMetric4 = -1; - gw_addr = ire->ire_gateway_addr; - - if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) - re->ipRouteNextHop = ire->ire_src_addr; - else - re->ipRouteNextHop = gw_addr; + re->ipRouteNextHop = ire->ire_gateway_addr; /* indirect(4), direct(3), or invalid(2) */ if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) re->ipRouteType = 2; + else if (ire->ire_type & IRE_ONLINK) + re->ipRouteType = 3; else - re->ipRouteType = (gw_addr != 0) ? 
4 : 3; + re->ipRouteType = 4; + re->ipRouteProto = -1; re->ipRouteAge = gethrestime_sec() - ire->ire_create_time; re->ipRouteMask = ire->ire_mask; re->ipRouteMetric5 = -1; - re->ipRouteInfo.re_max_frag = ire->ire_max_frag; - re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag; - re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; + re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu; + if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0) + re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu; + + re->ipRouteInfo.re_frag_flag = 0; + re->ipRouteInfo.re_rtt = 0; + re->ipRouteInfo.re_src_addr = 0; re->ipRouteInfo.re_ref = ire->ire_refcnt; - re->ipRouteInfo.re_src_addr = ire->ire_src_addr; re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count; re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count; re->ipRouteInfo.re_flags = ire->ire_flags; + /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *child; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + child = ire->ire_dep_children; + while (child != NULL) { + re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count; + re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count; + child = child->ire_dep_sib_next; + } + rw_exit(&ipst->ips_ire_dep_lock); + } + if (ire->ire_flags & RTF_DYNAMIC) { re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT; } else { @@ -19603,25 +11067,22 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) (uint_t)sizeof (*re))); } - for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { - iaeptr->iae_routeidx = ird->ird_idx; - iaeptr->iae_doi = gc->gc_db->gcdb_doi; - iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; - } + if (gc != NULL) { + iaes.iae_routeidx = ird->ird_idx; + iaes.iae_doi = gc->gc_db->gcdb_doi; + iaes.iae_slrange = gc->gc_db->gcdb_slrange; - if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, - (char *)iae, sacnt * sizeof (*iae))) { - ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", - 
(unsigned)(sacnt * sizeof (*iae)))); + if (!snmp_append_data2(ird->ird_attrs.lp_head, + &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) { + ip1dbg(("ip_snmp_get2_v4: failed to allocate %u " + "bytes\n", (uint_t)sizeof (iaes))); + } } /* bump route index for next pass */ ird->ird_idx++; kmem_free(re, sizeof (*re)); - if (sacnt != 0) - kmem_free(iae, sacnt * sizeof (*iae)); - if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); } @@ -19633,21 +11094,20 @@ static void ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) { ill_t *ill; - ipif_t *ipif; mib2_ipv6RouteEntry_t *re; - mib2_ipAttributeEntry_t *iae, *iaeptr; - in6_addr_t gw_addr_v6; + mib2_ipAttributeEntry_t iaes; tsol_ire_gw_secattr_t *attrp; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - uint_t sacnt = 0; - int i; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); - if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && - ire->ire_marks & IRE_MARK_TESTHIDDEN) { - return; + if (!(ird->ird_flags & IRD_REPORT_ALL)) { + if (ire->ire_testhidden) + return; + if (ire->ire_type & IRE_IF_CLONE) + return; } if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) @@ -19659,37 +11119,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); - } - ASSERT(sacnt == 0 || gc != NULL); - - if (sacnt != 0 && - (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { - kmem_free(re, sizeof (*re)); 
- rw_exit(&gcgrp->gcgrp_rwlock); - return; } - /* * Return all IRE types for route table... let caller pick and choose */ @@ -19697,16 +11129,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ re->ipv6RouteIfIndex.o_length = 0; - ipif = ire->ire_ipif; - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - re->ipv6RouteIfIndex.o_length = - ill->ill_name_length == 0 ? 0 : - MIN(OCTET_LENGTH, ill->ill_name_length - 1); - bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, - re->ipv6RouteIfIndex.o_length); - } else if (ipif != NULL) { - ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); + ill = ire->ire_ill; + if (ill != NULL) { + ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); re->ipv6RouteIfIndex.o_length = mi_strlen(re->ipv6RouteIfIndex.o_bytes); } @@ -19714,18 +11139,13 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) ASSERT(!(ire->ire_type & IRE_BROADCAST)); mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; + re->ipv6RouteNextHop = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) - re->ipv6RouteNextHop = ire->ire_src_addr_v6; - else - re->ipv6RouteNextHop = gw_addr_v6; - /* remote(4), local(3), or discard(2) */ if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) re->ipv6RouteType = 2; - else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) + else if (ire->ire_type & IRE_ONLINK) re->ipv6RouteType = 3; else re->ipv6RouteType = 4; @@ -19736,15 +11156,31 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) re->ipv6RouteNextHopRDI = 0; re->ipv6RouteWeight = 0; re->ipv6RouteMetric = 0; - re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; - re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; - re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; - re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; + 
re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu; + if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0) + re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu; + + re->ipv6RouteInfo.re_frag_flag = 0; + re->ipv6RouteInfo.re_rtt = 0; + re->ipv6RouteInfo.re_src_addr = ipv6_all_zeros; re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; re->ipv6RouteInfo.re_ref = ire->ire_refcnt; re->ipv6RouteInfo.re_flags = ire->ire_flags; + /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *child; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + child = ire->ire_dep_children; + while (child != NULL) { + re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count; + re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count; + child = child->ire_dep_sib_next; + } + rw_exit(&ipst->ips_ire_dep_lock); + } if (ire->ire_flags & RTF_DYNAMIC) { re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; } else { @@ -19757,79 +11193,67 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) (uint_t)sizeof (*re))); } - for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { - iaeptr->iae_routeidx = ird->ird_idx; - iaeptr->iae_doi = gc->gc_db->gcdb_doi; - iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; - } + if (gc != NULL) { + iaes.iae_routeidx = ird->ird_idx; + iaes.iae_doi = gc->gc_db->gcdb_doi; + iaes.iae_slrange = gc->gc_db->gcdb_slrange; - if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, - (char *)iae, sacnt * sizeof (*iae))) { - ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", - (unsigned)(sacnt * sizeof (*iae)))); + if (!snmp_append_data2(ird->ird_attrs.lp_head, + &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) { + ip1dbg(("ip_snmp_get2_v6: failed to allocate %u " + "bytes\n", (uint_t)sizeof (iaes))); + } } /* bump route index for next pass */ ird->ird_idx++; kmem_free(re, sizeof (*re)); - if (sacnt != 0) - 
kmem_free(iae, sacnt * sizeof (*iae)); - if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); } /* - * ndp_walk routine to create ipv6NetToMediaEntryTable + * ncec_walk routine to create ipv6NetToMediaEntryTable */ static int -ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) +ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird) { ill_t *ill; mib2_ipv6NetToMediaEntry_t ntme; - dl_unitdata_req_t *dl; - ill = nce->nce_ill; - if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ + ill = ncec->ncec_ill; + /* skip arpce entries, and loopback ncec entries */ + if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK) return (0); - /* * Neighbor cache entry attached to IRE with on-link * destination. + * We report all IPMP groups on ncec_ill which is normally the upper. */ ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; - ntme.ipv6NetToMediaNetAddress = nce->nce_addr; - if ((ill->ill_flags & ILLF_XRESOLV) && - (nce->nce_res_mp != NULL)) { - dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); - ntme.ipv6NetToMediaPhysAddress.o_length = - dl->dl_dest_addr_length; - } else { - ntme.ipv6NetToMediaPhysAddress.o_length = - ill->ill_phys_addr_length; - } - if (nce->nce_res_mp != NULL) { - bcopy((char *)nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), - ntme.ipv6NetToMediaPhysAddress.o_bytes, + ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr; + ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length; + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes, ntme.ipv6NetToMediaPhysAddress.o_length); - } else { - bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, - ill->ill_phys_addr_length); } /* * Note: Returns ND_* states. 
Should be: * reachable(1), stale(2), delay(3), probe(4), * invalid(5), unknown(6) */ - ntme.ipv6NetToMediaState = nce->nce_state; + ntme.ipv6NetToMediaState = ncec->ncec_state; ntme.ipv6NetToMediaLastUpdated = 0; /* other(1), dynamic(2), static(3), local(4) */ - if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { + if (NCE_MYADDR(ncec)) { ntme.ipv6NetToMediaType = 4; - } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { + } else if (ncec->ncec_flags & NCE_F_PUBLISH) { + ntme.ipv6NetToMediaType = 1; /* proxy */ + } else if (ncec->ncec_flags & NCE_F_STATIC) { + ntme.ipv6NetToMediaType = 3; + } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) { ntme.ipv6NetToMediaType = 1; } else { ntme.ipv6NetToMediaType = 2; @@ -19843,6 +11267,93 @@ ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) return (0); } +int +nce2ace(ncec_t *ncec) +{ + int flags = 0; + + if (NCE_ISREACHABLE(ncec)) + flags |= ACE_F_RESOLVED; + if (ncec->ncec_flags & NCE_F_AUTHORITY) + flags |= ACE_F_AUTHORITY; + if (ncec->ncec_flags & NCE_F_PUBLISH) + flags |= ACE_F_PUBLISH; + if ((ncec->ncec_flags & NCE_F_NONUD) != 0) + flags |= ACE_F_PERMANENT; + if (NCE_MYADDR(ncec)) + flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY); + if (ncec->ncec_flags & NCE_F_UNVERIFIED) + flags |= ACE_F_UNVERIFIED; + if (ncec->ncec_flags & NCE_F_AUTHORITY) + flags |= ACE_F_AUTHORITY; + if (ncec->ncec_flags & NCE_F_DELAYED) + flags |= ACE_F_DELAYED; + return (flags); +} + +/* + * ncec_walk routine to create ipNetToMediaEntryTable + */ +static int +ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird) +{ + ill_t *ill; + mib2_ipNetToMediaEntry_t ntme; + const char *name = "unknown"; + ipaddr_t ncec_addr; + + ill = ncec->ncec_ill; + if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) || + ill->ill_net_type == IRE_LOOPBACK) + return (0); + + /* We report all IPMP groups on ncec_ill which is normally the upper. 
*/ + name = ill->ill_name; + /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */ + if (NCE_MYADDR(ncec)) { + ntme.ipNetToMediaType = 4; + } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) { + ntme.ipNetToMediaType = 1; + } else { + ntme.ipNetToMediaType = 3; + } + ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name)); + bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes, + ntme.ipNetToMediaIfIndex.o_length); + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr)); + + ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t); + ncec_addr = INADDR_BROADCAST; + bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes, + sizeof (ncec_addr)); + /* + * map all the flags to the ACE counterpart. + */ + ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec); + + ntme.ipNetToMediaPhysAddress.o_length = + MIN(OCTET_LENGTH, ill->ill_phys_addr_length); + + if (!NCE_ISREACHABLE(ncec)) + ntme.ipNetToMediaPhysAddress.o_length = 0; + else { + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, + ntme.ipNetToMediaPhysAddress.o_bytes, + ntme.ipNetToMediaPhysAddress.o_length); + } + } + + if (!snmp_append_data2(ird->ird_netmedia.lp_head, + &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { + ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n", + (uint_t)sizeof (ntme))); + } + return (0); +} + /* * return (0) if invalid set request, 1 otherwise, including non-tcp requests */ @@ -19999,7 +11510,7 @@ ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2) * This routine assumes that the options are well formed i.e. that they * have already been checked. 
*/ -static boolean_t +boolean_t ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) { ipoptp_t opts; @@ -20007,7 +11518,6 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) uint8_t optval; uint8_t optlen; ipaddr_t dst; - ire_t *ire; if (IS_SIMPLE_IPH(ipha)) { ip2dbg(("not source routed\n")); @@ -20030,15 +11540,12 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) * If dst is one of our addresses and there are some * entries left in the source route return (true). */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { ip2dbg(("ip_source_routed: not next" " source route 0x%x\n", ntohl(dst))); return (B_FALSE); } - ire_refrele(ire); off = opt[IPOPT_OFFSET]; off--; if (optlen < IP_ADDR_LEN || @@ -20055,267 +11562,18 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) } /* - * Check if the packet contains any source route. - */ -static boolean_t -ip_source_route_included(ipha_t *ipha) -{ - ipoptp_t opts; - uint8_t optval; - - if (IS_SIMPLE_IPH(ipha)) - return (B_FALSE); - for (optval = ipoptp_first(&opts, ipha); - optval != IPOPT_EOL; - optval = ipoptp_next(&opts)) { - switch (optval) { - case IPOPT_SSRR: - case IPOPT_LSRR: - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * Called when the IRE expiration timer fires. - */ -void -ip_trash_timer_expire(void *args) -{ - int flush_flag = 0; - ire_expire_arg_t iea; - ip_stack_t *ipst = (ip_stack_t *)args; - - iea.iea_ipst = ipst; /* No netstack_hold */ - - /* - * ip_ire_expire_id is protected by ip_trash_timer_lock. 
- * This lock makes sure that a new invocation of this function - * that occurs due to an almost immediate timer firing will not - * progress beyond this point until the current invocation is done - */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - ipst->ips_ip_ire_expire_id = 0; - mutex_exit(&ipst->ips_ip_trash_timer_lock); - - /* Periodic timer */ - if (ipst->ips_ip_ire_arp_time_elapsed >= - ipst->ips_ip_ire_arp_interval) { - /* - * Remove all IRE_CACHE entries since they might - * contain arp information. - */ - flush_flag |= FLUSH_ARP_TIME; - ipst->ips_ip_ire_arp_time_elapsed = 0; - IP_STAT(ipst, ip_ire_arp_timer_expired); - } - if (ipst->ips_ip_ire_rd_time_elapsed >= - ipst->ips_ip_ire_redir_interval) { - /* Remove all redirects */ - flush_flag |= FLUSH_REDIRECT_TIME; - ipst->ips_ip_ire_rd_time_elapsed = 0; - IP_STAT(ipst, ip_ire_redirect_timer_expired); - } - if (ipst->ips_ip_ire_pmtu_time_elapsed >= - ipst->ips_ip_ire_pathmtu_interval) { - /* Increase path mtu */ - flush_flag |= FLUSH_MTU_TIME; - ipst->ips_ip_ire_pmtu_time_elapsed = 0; - IP_STAT(ipst, ip_ire_pmtu_timer_expired); - } - - /* - * Optimize for the case when there are no redirects in the - * ftable, that is, no need to walk the ftable in that case. - */ - if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { - iea.iea_flush_flag = flush_flag; - ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, - (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, - ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, - NULL, ALL_ZONES, ipst); - } - if ((flush_flag & FLUSH_REDIRECT_TIME) && - ipst->ips_ip_redirect_cnt > 0) { - iea.iea_flush_flag = flush_flag; - ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, - ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, - 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); - } - if (flush_flag & FLUSH_MTU_TIME) { - /* - * Walk all IPv6 IRE's and update them - * Note that ARP and redirect timers are not - * needed since NUD handles stale entries. 
- */ - flush_flag = FLUSH_MTU_TIME; - iea.iea_flush_flag = flush_flag; - ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, - ALL_ZONES, ipst); - } - - ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; - ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; - ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; - - /* - * Hold the lock to serialize timeout calls and prevent - * stale values in ip_ire_expire_id. Otherwise it is possible - * for the timer to fire and a new invocation of this function - * to start before the return value of timeout has been stored - * in ip_ire_expire_id by the current invocation. - */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, - (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); - mutex_exit(&ipst->ips_ip_trash_timer_lock); -} - -/* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. - */ -/* ARGSUSED */ -void -ip_trash_ire_reclaim(void *args) -{ - netstack_handle_t nh; - netstack_t *ns; - - netstack_next_init(&nh); - while ((ns = netstack_next(&nh)) != NULL) { - ip_trash_ire_reclaim_stack(ns->netstack_ip); - netstack_rele(ns); - } - netstack_next_fini(&nh); -} - -static void -ip_trash_ire_reclaim_stack(ip_stack_t *ipst) -{ - ire_cache_count_t icc; - ire_cache_reclaim_t icr; - ncc_cache_count_t ncc; - nce_cache_reclaim_t ncr; - uint_t delete_cnt; - /* - * Memory reclaim call back. - * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. - * Then, with a target of freeing 1/Nth of IRE_CACHE - * entries, determine what fraction to free for - * each category of IRE_CACHE entries giving absolute priority - * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu - * entry will be freed unless all offlink entries are freed). 
- */ - icc.icc_total = 0; - icc.icc_unused = 0; - icc.icc_offlink = 0; - icc.icc_pmtu = 0; - icc.icc_onlink = 0; - ire_walk(ire_cache_count, (char *)&icc, ipst); - - /* - * Free NCEs for IPv6 like the onlink ires. - */ - ncc.ncc_total = 0; - ncc.ncc_host = 0; - ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); - - ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + - icc.icc_pmtu + icc.icc_onlink); - delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; - IP_STAT(ipst, ip_trash_ire_reclaim_calls); - if (delete_cnt == 0) - return; - IP_STAT(ipst, ip_trash_ire_reclaim_success); - /* Always delete all unused offlink entries */ - icr.icr_ipst = ipst; - icr.icr_unused = 1; - if (delete_cnt <= icc.icc_unused) { - /* - * Only need to free unused entries. In other words, - * there are enough unused entries to free to meet our - * target number of freed ire cache entries. - */ - icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { - /* - * Only need to free unused entries, plus a fraction of offlink - * entries. It follows from the first if statement that - * icc_offlink is non-zero, and that delete_cnt != icc_unused. - */ - delete_cnt -= icc.icc_unused; - /* Round up # deleted by truncating fraction */ - icr.icr_offlink = icc.icc_offlink / delete_cnt; - icr.icr_pmtu = icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else if (delete_cnt <= - icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { - /* - * Free all unused and offlink entries, plus a fraction of - * pmtu entries. It follows from the previous if statement - * that icc_pmtu is non-zero, and that - * delete_cnt != icc_unused + icc_offlink. 
- */ - icr.icr_offlink = 1; - delete_cnt -= icc.icc_unused + icc.icc_offlink; - /* Round up # deleted by truncating fraction */ - icr.icr_pmtu = icc.icc_pmtu / delete_cnt; - icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else { - /* - * Free all unused, offlink, and pmtu entries, plus a fraction - * of onlink entries. If we're here, then we know that - * icc_onlink is non-zero, and that - * delete_cnt != icc_unused + icc_offlink + icc_pmtu. - */ - icr.icr_offlink = icr.icr_pmtu = 1; - delete_cnt -= icc.icc_unused + icc.icc_offlink + - icc.icc_pmtu; - /* Round up # deleted by truncating fraction */ - icr.icr_onlink = icc.icc_onlink / delete_cnt; - /* Using the same delete fraction as for onlink IREs */ - ncr.ncr_host = ncc.ncc_host / delete_cnt; - } -#ifdef DEBUG - ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " - "fractions %d/%d/%d/%d\n", - icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, - icc.icc_unused, icc.icc_offlink, - icc.icc_pmtu, icc.icc_onlink, - icr.icr_unused, icr.icr_offlink, - icr.icr_pmtu, icr.icr_onlink)); -#endif - ire_walk(ire_cache_reclaim, (char *)&icr, ipst); - if (ncr.ncr_host != 0) - ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, - (uchar_t *)&ncr, ipst); -#ifdef DEBUG - icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; - icc.icc_pmtu = 0; icc.icc_onlink = 0; - ire_walk(ire_cache_count, (char *)&icc, ipst); - ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", - icc.icc_total, icc.icc_unused, icc.icc_offlink, - icc.icc_pmtu, icc.icc_onlink)); -#endif -} - -/* - * ip_unbind is called when a copy of an unbind request is received from the - * upper level protocol. We remove this conn from any fanout hash list it is - * on, and zero out the bind information. No reply is expected up above. + * ip_unbind is called by the transports to remove a conn from + * the fanout table. 
*/ void ip_unbind(conn_t *connp) { + ASSERT(!MUTEX_HELD(&connp->conn_lock)); if (is_system_labeled() && connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, + connp->conn_mlp_type, connp->conn_proto, ntohs(connp->conn_lport), B_FALSE); connp->conn_anon_port = 0; } @@ -20325,1489 +11583,6 @@ ip_unbind(conn_t *connp) } /* - * Write side put procedure. Outbound data, IOCTLs, responses from - * resolvers, etc, come down through here. - * - * arg2 is always a queue_t *. - * When that queue is an ill_t (i.e. q_next != NULL), then arg must be - * the zoneid. - * When that queue is not an ill_t, then arg must be a conn_t pointer. - */ -void -ip_output(void *arg, mblk_t *mp, void *arg2, int caller) -{ - ip_output_options(arg, mp, arg2, caller, &zero_info); -} - -void -ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, - ip_opt_info_t *infop) -{ - conn_t *connp = NULL; - queue_t *q = (queue_t *)arg2; - ipha_t *ipha; -#define rptr ((uchar_t *)ipha) - ire_t *ire = NULL; - ire_t *sctp_ire = NULL; - uint32_t v_hlen_tos_len; - ipaddr_t dst; - mblk_t *first_mp = NULL; - boolean_t mctl_present; - ipsec_out_t *io; - int match_flags; - ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */ - ipif_t *dst_ipif; - boolean_t multirt_need_resolve = B_FALSE; - mblk_t *copy_mp = NULL; - int err = 0; - zoneid_t zoneid; - boolean_t need_decref = B_FALSE; - boolean_t ignore_dontroute = B_FALSE; - boolean_t ignore_nexthop = B_FALSE; - boolean_t ip_nexthop = B_FALSE; - ipaddr_t nexthop_addr; - ip_stack_t *ipst; - -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#endif - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, - "ip_wput_start: q %p", q); - - /* - * ip_wput fast path - */ - - /* is packet from ARP ? 
*/ - if (q->q_next != NULL) { - zoneid = (zoneid_t)(uintptr_t)arg; - goto qnext; - } - - connp = (conn_t *)arg; - ASSERT(connp != NULL); - zoneid = connp->conn_zoneid; - ipst = connp->conn_netstack->netstack_ip; - ASSERT(ipst != NULL); - - /* is queue flow controlled? */ - if ((q->q_first != NULL || connp->conn_draining) && - (caller == IP_WPUT)) { - ASSERT(!need_decref); - ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp)); - (void) putq(q, mp); - return; - } - - /* Multidata transmit? */ - if (DB_TYPE(mp) == M_MULTIDATA) { - /* - * We should never get here, since all Multidata messages - * originating from tcp should have been directed over to - * tcp_multisend() in the first place. - */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } else if (DB_TYPE(mp) != M_DATA) - goto notdata; - - if (mp->b_flag & MSGHASREF) { - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - mp->b_flag &= ~MSGHASREF; - SCTP_EXTRACT_IPINFO(mp, sctp_ire); - need_decref = B_TRUE; - } - ipha = (ipha_t *)mp->b_rptr; - - /* is IP header non-aligned or mblk smaller than basic IP header */ -#ifndef SAFETY_BEFORE_SPEED - if (!OK_32PTR(rptr) || - (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) - goto hdrtoosmall; -#endif - - ASSERT(OK_32PTR(ipha)); - - /* - * This function assumes that mp points to an IPv4 packet. If it's the - * wrong version, we'll catch it again in ip_output_v6. - * - * Note that this is *only* locally-generated output here, and never - * forwarded data, and that we need to deal only with transports that - * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to - * label.) 
- */ - if (is_system_labeled() && - (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && - !connp->conn_ulp_labeled) { - cred_t *credp; - pid_t pid; - - credp = BEST_CRED(mp, connp, &pid); - err = tsol_check_label(credp, &mp, - connp->conn_mac_mode, ipst, pid); - ipha = (ipha_t *)mp->b_rptr; - if (err != 0) { - first_mp = mp; - if (err == EINVAL) - goto icmp_parameter_problem; - ip2dbg(("ip_wput: label check failed (%d)\n", err)); - goto discard_pkt; - } - } - - ASSERT(infop != NULL); - - if (infop->ip_opt_flags & IP_VERIFY_SRC) { - /* - * IP_PKTINFO ancillary option is present. - * IPCL_ZONEID is used to honor IP_ALLZONES option which - * allows using address of any zone as the source address. - */ - ire = ire_ctable_lookup(ipha->ipha_src, 0, - (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); - if (ire == NULL) - goto drop_pkt; - ire_refrele(ire); - ire = NULL; - } - - /* - * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. - */ - if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { - xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - - if (xmit_ill == NULL || IS_VNI(xmit_ill)) - goto drop_pkt; - /* - * check that there is an ipif belonging - * to our zone. IPCL_ZONEID is not used because - * IP_ALLZONES option is valid only when the ill is - * accessible from all zones i.e has a valid ipif in - * all zones. - */ - if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { - goto drop_pkt; - } - } - - /* - * If there is a policy, try to attach an ipsec_out in - * the front. At the end, first_mp either points to a - * M_DATA message or IPSEC_OUT message linked to a - * M_DATA message. We have to do it now as we might - * lose the "conn" if we go through ip_newroute. 
- */ - if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { - if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, - ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - - /* is wrong version or IP options present */ - if (V_HLEN != IP_SIMPLE_HDR_VERSION) - goto version_hdrlen_check; - dst = ipha->ipha_dst; - - /* If IP_BOUND_IF has been set, use that ill. */ - if (connp->conn_outgoing_ill != NULL) { - xmit_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) - goto drop_pkt; - - goto send_from_ill; - } - - /* is packet multicast? */ - if (CLASSD(dst)) - goto multicast; - - /* - * If xmit_ill is set above due to index passed in ip_pkt_info. It - * takes precedence over conn_dontroute and conn_nexthop_set - */ - if (xmit_ill != NULL) - goto send_from_ill; - - if (connp->conn_dontroute || connp->conn_nexthop_set) { - /* - * If the destination is a broadcast, local, or loopback - * address, SO_DONTROUTE and IP_NEXTHOP go through the - * standard path. - */ - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if ((ire == NULL) || (ire->ire_type & - (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { - if (ire != NULL) { - ire_refrele(ire); - /* No more access to ire */ - ire = NULL; - } - /* - * bypass routing checks and go directly to interface. 
- */ - if (connp->conn_dontroute) - goto dontroute; - - ASSERT(connp->conn_nexthop_set); - ip_nexthop = B_TRUE; - nexthop_addr = connp->conn_nexthop_v4; - goto send_from_ill; - } - - /* Must be a broadcast, a loopback or a local ire */ - ire_refrele(ire); - /* No more access to ire */ - ire = NULL; - } - - /* - * We cache IRE_CACHEs to avoid lookups. We don't do - * this for the tcp global queue and listen end point - * as it does not really have a real destination to - * talk to. This is also true for SCTP. - */ - if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && - !connp->conn_fully_bound) { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if (ire == NULL) - goto noirefound; - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = - ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[TCP]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve() deemed it necessary. - */ - if (copy_mp != NULL) - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - - /* - * Access to conn_ire_cache. (protected by conn_lock) - * - * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab - * the ire bucket lock here to check for CONDEMNED as it is okay to - * send a packet or two with the IRE_CACHE that is going away. - * Access to the ire requires an ire refhold on the ire prior to - * its use since an interface unplumb thread may delete the cached - * ire and release the refhold at any time. - * - * Caching an ire in the conn_ire_cache - * - * o Caching an ire pointer in the conn requires a strict check for - * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant - * ires before cleaning up the conns. So the caching of an ire pointer - * in the conn is done after making sure under the bucket lock that the - * ire has not yet been marked CONDEMNED. Otherwise we will end up - * caching an ire after the unplumb thread has cleaned up the conn. - * If the conn does not send a packet subsequently the unplumb thread - * will be hanging waiting for the ire count to drop to zero. - * - * o We also need to atomically test for a null conn_ire_cache and - * set the conn_ire_cache under the the protection of the conn_lock - * to avoid races among concurrent threads trying to simultaneously - * cache an ire in the conn_ire_cache. 
- */ - mutex_enter(&connp->conn_lock); - ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; - - if (ire != NULL && ire->ire_addr == dst && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - - } else { - boolean_t cached = B_FALSE; - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL && sctp_ire == NULL) - IRE_REFRELE_NOTR(ire); - - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if (ire == NULL) - goto noirefound; - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - if (connp->conn_ulp == IPPROTO_TCP) - TCP_CHECK_IREINFO(connp->conn_tcp, ire); - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it was - * not cached, we should drop the extra reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); - } - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - - /* - * At this point, we check to see if there are any pending - * unresolved routes. 
ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. - */ - multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[not TCP]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - - /* - * Try to resolve another multiroute if - * ire_multirt_resolvable() deemed it necessary - */ - if (copy_mp != NULL) - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - if (need_decref) - CONN_DEC_REF(connp); - return; - -qnext: - /* - * Upper Level Protocols pass down complete IP datagrams - * as M_DATA messages. Everything else is a sideshow. - * - * 1) We could be re-entering ip_wput because of ip_neworute - * in which case we could have a IPSEC_OUT message. We - * need to pass through ip_wput like other datagrams and - * hence cannot branch to ip_wput_nondata. - * - * 2) ARP, AH, ESP, and other clients who are on the module - * instance of IP stream, give us something to deal with. - * We will handle AH and ESP here and rest in ip_wput_nondata. - * - * 3) ICMP replies also could come here. - */ - ipst = ILLQ_TO_IPST(q); - - if (DB_TYPE(mp) != M_DATA) { -notdata: - if (DB_TYPE(mp) == M_CTL) { - /* - * M_CTL messages are used by ARP, AH and ESP to - * communicate with IP. We deal with IPSEC_IN and - * IPSEC_OUT here. ip_wput_nondata handles other - * cases. 
- */ - ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; - if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { - first_mp = mp->b_cont; - first_mp->b_flag &= ~MSGHASREF; - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); - CONN_DEC_REF(connp); - connp = NULL; - } - if (ii->ipsec_info_type == IPSEC_IN) { - /* - * Either this message goes back to - * IPsec for further processing or to - * ULP after policy checks. - */ - ip_fanout_proto_again(mp, NULL, NULL, NULL); - return; - } else if (ii->ipsec_info_type == IPSEC_OUT) { - io = (ipsec_out_t *)ii; - if (io->ipsec_out_proc_begin) { - /* - * IPsec processing has already started. - * Complete it. - * IPQoS notes: We don't care what is - * in ipsec_out_ill_index since this - * won't be processed for IPQoS policies - * in ipsec_out_process. - */ - ipsec_out_process(q, mp, NULL, - io->ipsec_out_ill_index); - return; - } else { - connp = (q->q_next != NULL) ? - NULL : Q_TO_CONN(q); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - } else if (ii->ipsec_info_type == IPSEC_CTL) { - /* - * It's an IPsec control message requesting - * an SADB update to be sent to the IPsec - * hardware acceleration capable ills. - */ - ipsec_ctl_t *ipsec_ctl = - (ipsec_ctl_t *)mp->b_rptr; - ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; - uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; - mblk_t *cmp = mp->b_cont; - - ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); - ASSERT(cmp != NULL); - - freeb(mp); - ill_ipsec_capab_send_all(satype, cmp, sa, - ipst->ips_netstack); - return; - } else { - /* - * This must be ARP or special TSOL signaling. - */ - ip_wput_nondata(NULL, q, mp, NULL); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "nondata"); - return; - } - } else { - /* - * This must be non-(ARP/AH/ESP) messages. 
- */ - ASSERT(!need_decref); - ip_wput_nondata(NULL, q, mp, NULL); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "nondata"); - return; - } - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - - ASSERT(first_mp != NULL); - - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - if (io->ipsec_out_ip_nexthop) { - /* - * We may have lost the conn context if we are - * coming here from ip_newroute(). Copy the - * nexthop information. - */ - ip_nexthop = B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; - - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - goto send_from_ill; - } - } - - ASSERT(xmit_ill == NULL); - - /* We have a complete IP datagram heading outbound. */ - ipha = (ipha_t *)mp->b_rptr; - -#ifndef SPEED_BEFORE_SAFETY - /* - * Make sure we have a full-word aligned message and that at least - * a simple IP header is accessible in the first message. If not, - * try a pullup. For labeled systems we need to always take this - * path as M_CTLs are "notdata" but have trailing data to process. - */ - if (!OK_32PTR(rptr) || - (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { -hdrtoosmall: - if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "pullupfailed"); - if (first_mp == NULL) - first_mp = mp; - goto discard_pkt; - } - - /* This function assumes that mp points to an IPv4 packet. 
*/ - if (is_system_labeled() && - (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && - (connp == NULL || !connp->conn_ulp_labeled)) { - cred_t *credp; - pid_t pid; - - if (connp != NULL) { - credp = BEST_CRED(mp, connp, &pid); - err = tsol_check_label(credp, &mp, - connp->conn_mac_mode, ipst, pid); - } else if ((credp = msg_getcred(mp, &pid)) != NULL) { - err = tsol_check_label(credp, &mp, - CONN_MAC_DEFAULT, ipst, pid); - } - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (err != 0) { - if (err == EINVAL) - goto icmp_parameter_problem; - ip2dbg(("ip_wput: label check failed (%d)\n", - err)); - goto discard_pkt; - } - } - - ipha = (ipha_t *)mp->b_rptr; - if (first_mp == NULL) { - ASSERT(xmit_ill == NULL); - /* - * If we got here because of "goto hdrtoosmall" - * We need to attach a IPSEC_OUT. - */ - if (connp->conn_out_enforce_policy) { - if (((mp = ipsec_attach_ipsec_out(&mp, connp, - NULL, ipha->ipha_protocol, - ipst->ips_netstack)) == NULL)) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } else { - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - } - } -#endif - - /* Most of the code below is written for speed, not readability */ - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - - /* - * If ip_newroute() fails, we're going to need a full - * header for the icmp wraparound. - */ - if (V_HLEN != IP_SIMPLE_HDR_VERSION) { - uint_t v_hlen; -version_hdrlen_check: - ASSERT(first_mp != NULL); - v_hlen = V_HLEN; - /* - * siphon off IPv6 packets coming down from transport - * layer modules here. - * Note: high-order bit carries NUD reachability confirmation - */ - if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { - /* - * FIXME: assume that callers of ip_output* call - * the right version? 
- */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); - ASSERT(xmit_ill == NULL); - if (need_decref) - mp->b_flag |= MSGHASREF; - (void) ip_output_v6(arg, first_mp, arg2, caller); - return; - } - - if ((v_hlen >> 4) != IP_VERSION) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badvers"); - goto discard_pkt; - } - /* - * Is the header length at least 20 bytes? - * - * Are there enough bytes accessible in the header? If - * not, try a pullup. - */ - v_hlen &= 0xF; - v_hlen <<= 2; - if (v_hlen < IP_SIMPLE_HDR_LENGTH) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badlen"); - goto discard_pkt; - } - if (v_hlen > (mp->b_wptr - rptr)) { - if (!pullupmsg(mp, v_hlen)) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badpullup2"); - goto discard_pkt; - } - ipha = (ipha_t *)mp->b_rptr; - } - /* - * Move first entry from any source route into ipha_dst and - * verify the options - */ - if (ip_wput_options(q, first_mp, ipha, mctl_present, - zoneid, ipst)) { - ASSERT(xmit_ill == NULL); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badopts"); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - } - dst = ipha->ipha_dst; - - /* - * Try to get an IRE_CACHE for the destination address. If we can't, - * we have to run the packet through ip_newroute which will take - * the appropriate action to arrange for an IRE_CACHE, such as querying - * a resolver, or assigning a default gateway, etc. - */ - if (CLASSD(dst)) { - ipif_t *ipif; - uint32_t setsrc = 0; - -multicast: - ASSERT(first_mp != NULL); - ip2dbg(("ip_wput: CLASSD\n")); - if (connp == NULL) { - /* - * Use the first good ipif on the ill. - * XXX Should this ever happen? (Appears - * to show up with just ppp and no ethernet due - * to in.rdisc.) - * However, ire_send should be able to - * call ip_wput_ire directly. 
- * - * XXX Also, this can happen for ICMP and other packets - * with multicast source addresses. Perhaps we should - * fix things so that we drop the packet in question, - * but for now, just run with it. - */ - ill_t *ill = (ill_t *)q->q_ptr; - - ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); - if (ipif == NULL) { - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", - ntohl(dst), ill->ill_name)); - } else { - /* - * The order of precedence is IP_BOUND_IF, IP_PKTINFO - * and IP_MULTICAST_IF. The block comment above this - * function explains the locking mechanism used here. - */ - if (xmit_ill == NULL) { - xmit_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_wput: No ill for " - "IP_BOUND_IF\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - - if (xmit_ill == NULL) { - ipif = conn_get_held_ipif(connp, - &connp->conn_multicast_ipif, &err); - if (err == IPIF_LOOKUP_FAILED) { - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - if (xmit_ill != NULL) { - ipif = ipif_get_next_ipif(NULL, xmit_ill); - if (ipif == NULL) { - ip1dbg(("ip_wput: No ipif for " - "xmit_ill\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } else if (ipif == NULL || ipif->ipif_isv6) { - /* - * We must do this ipif determination here - * else we could pass through ip_newroute - * and come back here without the conn context. - * - * Note: we do late binding i.e. we bind to - * the interface when the first packet is sent. - * For performance reasons we do not rebind on - * each packet but keep the binding until the - * next IP_MULTICAST_IF option. - * - * conn_multicast_{ipif,ill} are shared between - * IPv4 and IPv6 and AF_INET6 sockets can - * send both IPv4 and IPv6 packets. 
Hence - * we have to check that "isv6" matches above. - */ - if (ipif != NULL) - ipif_refrele(ipif); - ipif = ipif_lookup_group(dst, zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - err = conn_set_held_ipif(connp, - &connp->conn_multicast_ipif, ipif); - if (err == IPIF_LOOKUP_FAILED) { - ipif_refrele(ipif); - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - } - ASSERT(!ipif->ipif_isv6); - /* - * As we may lose the conn by the time we reach ip_wput_ire, - * we copy conn_multicast_loop and conn_dontroute on to an - * ipsec_out. In case if this datagram goes out secure, - * we need the ill_index also. Copy that also into the - * ipsec_out. - */ - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - if ((first_mp = allocb(sizeof (ipsec_info_t), - BPRI_HI)) == NULL) { - ipif_refrele(ipif); - first_mp = mp; - goto discard_pkt; - } - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - /* ipsec_out_secure is B_FALSE now */ - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ns = ipst->ips_netstack; - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - io->ipsec_out_ill_index = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - - if (connp != NULL) { - io->ipsec_out_multicast_loop = - connp->conn_multicast_loop; - io->ipsec_out_dontroute = connp->conn_dontroute; - io->ipsec_out_zoneid = connp->conn_zoneid; - } - /* - * If the application uses IP_MULTICAST_IF with - * different logical 
addresses of the same ILL, we - * need to make sure that the soruce address of - * the packet matches the logical IP address used - * in the option. We do it by initializing ipha_src - * here. This should keep IPsec also happy as - * when we return from IPsec processing, we don't - * have to worry about getting the right address on - * the packet. Thus it is sufficient to look for - * IRE_CACHE using MATCH_IRE_ILL rathen than - * MATCH_IRE_IPIF. - * - * NOTE : We need to do it for non-secure case also as - * this might go out secure if there is a global policy - * match in ip_wput_ire. - * - * As we do not have the ire yet, it is possible that - * we set the source address here and then later discover - * that the ire implies the source address to be assigned - * through the RTF_SETSRC flag. - * In that case, the setsrc variable will remind us - * that overwritting the source address by the one - * of the RTF_SETSRC-flagged ire is allowed. - */ - if (ipha->ipha_src == INADDR_ANY && - (connp == NULL || !connp->conn_unspec_src)) { - ipha->ipha_src = ipif->ipif_src_addr; - setsrc = RTF_SETSRC; - } - /* - * Find an IRE which matches the destination and the outgoing - * queue (i.e. the outgoing interface.) - * For loopback use a unicast IP address for - * the ire lookup. - */ - if (IS_LOOPBACK(ipif->ipif_ill)) - dst = ipif->ipif_lcl_addr; - - /* - * If xmit_ill is set, we branch out to ip_newroute_ipif. - * We don't need to lookup ire in ctable as the packet - * needs to be sent to the destination through the specified - * ill irrespective of ires in the cache table. - */ - ire = NULL; - if (xmit_ill == NULL) { - ire = ire_ctable_lookup(dst, 0, 0, ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - } - - if (ire == NULL) { - /* - * Multicast loopback and multicast forwarding is - * done in ip_wput_ire. - * - * Mark this packet to make it be delivered to - * ip_wput_ire after the new ire has been - * created. 
- * - * The call to ip_newroute_ipif takes into account - * the setsrc reminder. In any case, we take care - * of the RTF_MULTIRT flag. - */ - mp->b_prev = mp->b_next = NULL; - if (xmit_ill == NULL || - xmit_ill->ill_ipif_up_count > 0) { - ip_newroute_ipif(q, first_mp, ipif, dst, connp, - setsrc | RTF_MULTIRT, zoneid, infop); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "noire"); - } else { - freemsg(first_mp); - } - ipif_refrele(ipif); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - - ipif_refrele(ipif); - ipif = NULL; - ASSERT(xmit_ill == NULL); - - /* - * Honor the RTF_SETSRC flag for multicast packets, - * if allowed by the setsrc reminder. - */ - if ((ire->ire_flags & RTF_SETSRC) && setsrc) { - ipha->ipha_src = ire->ire_src_addr; - } - - /* - * Unconditionally force the TTL to 1 for - * multirouted multicast packets: - * multirouted multicast should not cross - * multicast routers. - */ - if (ire->ire_flags & RTF_MULTIRT) { - if (ipha->ipha_ttl > 1) { - ip2dbg(("ip_wput: forcing multicast " - "multirt TTL to 1 (was %d), dst 0x%08x\n", - ipha->ipha_ttl, ntohl(ire->ire_addr))); - ipha->ipha_ttl = 1; - } - } - } else { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if ((ire != NULL) && (ire->ire_type & - (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { - ignore_dontroute = B_TRUE; - ignore_nexthop = B_TRUE; - } - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - /* - * Guard against coming in from arp in which case conn is NULL. - * Also guard against non M_DATA with dontroute set but - * destined to local, loopback or broadcast addresses. - */ - if (connp != NULL && connp->conn_dontroute && - !ignore_dontroute) { -dontroute: - /* - * Set TTL to 1 if SO_DONTROUTE is set to prevent - * routing protocols from seeing false direct - * connectivity. 
- */ - ipha->ipha_ttl = 1; - /* If suitable ipif not found, drop packet */ - dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); - if (dst_ipif == NULL) { -noroute: - ip1dbg(("ip_wput: no route for dst using" - " SO_DONTROUTE\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - mp->b_prev = mp->b_next = NULL; - if (first_mp == NULL) - first_mp = mp; - goto drop_pkt; - } else { - /* - * If suitable ipif has been found, set - * xmit_ill to the corresponding - * ipif_ill because we'll be using the - * send_from_ill logic below. - */ - ASSERT(xmit_ill == NULL); - xmit_ill = dst_ipif->ipif_ill; - mutex_enter(&xmit_ill->ill_lock); - if (!ILL_CAN_LOOKUP(xmit_ill)) { - mutex_exit(&xmit_ill->ill_lock); - xmit_ill = NULL; - ipif_refrele(dst_ipif); - goto noroute; - } - ill_refhold_locked(xmit_ill); - mutex_exit(&xmit_ill->ill_lock); - ipif_refrele(dst_ipif); - } - } - -send_from_ill: - if (xmit_ill != NULL) { - ipif_t *ipif; - - /* - * Mark this packet as originated locally - */ - mp->b_prev = mp->b_next = NULL; - - /* - * Could be SO_DONTROUTE case also. - * Verify that at least one ipif is up on the ill. - */ - if (xmit_ill->ill_ipif_up_count == 0) { - ip1dbg(("ip_output: xmit_ill %s is down\n", - xmit_ill->ill_name)); - goto drop_pkt; - } - - ipif = ipif_get_next_ipif(NULL, xmit_ill); - if (ipif == NULL) { - ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", - xmit_ill->ill_name)); - goto drop_pkt; - } - - match_flags = 0; - if (IS_UNDER_IPMP(xmit_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - /* - * Look for a ire that is part of the group, - * if found use it else call ip_newroute_ipif. - * IPCL_ZONEID is not used for matching because - * IP_ALLZONES option is valid only when the - * ill is accessible from all zones i.e has a - * valid ipif in all zones. 
- */ - match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, - msg_getlabel(mp), match_flags, ipst); - /* - * If an ire exists use it or else create - * an ire but don't add it to the cache. - * Adding an ire may cause issues with - * asymmetric routing. - * In case of multiroute always act as if - * ire does not exist. - */ - if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { - if (ire != NULL) - ire_refrele(ire); - ip_newroute_ipif(q, first_mp, ipif, - dst, connp, 0, zoneid, infop); - ipif_refrele(ipif); - ip1dbg(("ip_output: xmit_ill via %s\n", - xmit_ill->ill_name)); - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ipif_refrele(ipif); - } else if (ip_nexthop || (connp != NULL && - (connp->conn_nexthop_set)) && !ignore_nexthop) { - if (!ip_nexthop) { - ip_nexthop = B_TRUE; - nexthop_addr = connp->conn_nexthop_v4; - } - match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | - MATCH_IRE_GW; - ire = ire_ctable_lookup(dst, nexthop_addr, 0, - NULL, zoneid, msg_getlabel(mp), match_flags, ipst); - } else { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), - ipst); - } - if (!ire) { - if (ip_nexthop && !ignore_nexthop) { - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == - M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - first_mp = allocb( - sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) { - first_mp = mp; - goto discard_pkt; - } - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += - sizeof (ipsec_info_t); - /* ipsec_out_secure is B_FALSE now */ - bzero(first_mp->b_rptr, - sizeof (ipsec_info_t)); - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = - sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = - B_TRUE; - io->ipsec_out_ns = ipst->ips_netstack; - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - io->ipsec_out_ip_nexthop = 
ip_nexthop; - io->ipsec_out_nexthop_addr = nexthop_addr; - } -noirefound: - /* - * Mark this packet as having originated on - * this machine. This will be noted in - * ire_add_then_send, which needs to know - * whether to run it back through ip_wput or - * ip_rput following successful resolution. - */ - mp->b_prev = NULL; - mp->b_next = NULL; - ip_newroute(q, first_mp, dst, connp, zoneid, ipst); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "newroute"); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - } - - /* We now know where we are going with it. */ - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute. - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - /* - * At this point, we check to see if there are any pending - * unresolved routes. ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[noirefound]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - /* - * Try to resolve another multiroute if - * ire_multirt_resolvable() deemed it necessary. - * At this point, we need to distinguish - * multicasts from other packets. For multicasts, - * we call ip_newroute_ipif() and request that both - * multirouting and setsrc flags are checked. - */ - if (copy_mp != NULL) { - if (CLASSD(dst)) { - ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); - if (ipif) { - ASSERT(infop->ip_opt_ill_index == 0); - ip_newroute_ipif(q, copy_mp, ipif, dst, connp, - RTF_SETSRC | RTF_MULTIRT, zoneid, infop); - ipif_refrele(ipif); - } else { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - } else { - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - } - } - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - -icmp_parameter_problem: - /* could not have originated externally */ - ASSERT(mp->b_prev == NULL); - if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - /* it's the IP header length that's in trouble */ - icmp_param_problem(q, first_mp, 0, zoneid, ipst); - first_mp = NULL; - } - -discard_pkt: - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); -drop_pkt: - ip1dbg(("ip_wput: dropped packet\n")); - if (ire != NULL) - ire_refrele(ire); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "droppkt"); -} - -/* - * If this is a conn_t queue, then we pass in the 
conn. This includes the - * zoneid. - * Otherwise, this is a message coming back from ARP or for an ill_t queue, - * in which case we use the global zoneid since those are all part of - * the global zone. - */ -void -ip_wput(queue_t *q, mblk_t *mp) -{ - if (CONN_Q(q)) - ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); - else - ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); -} - -/* - * - * The following rules must be observed when accessing any ipif or ill - * that has been cached in the conn. Typically conn_outgoing_ill, - * conn_multicast_ipif and conn_multicast_ill. - * - * Access: The ipif or ill pointed to from the conn can be accessed under - * the protection of the conn_lock or after it has been refheld under the - * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or - * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. - * The reason for this is that a concurrent unplumb could actually be - * cleaning up these cached pointers by walking the conns and might have - * finished cleaning up the conn in question. The macros check that an - * unplumb has not yet started on the ipif or ill. - * - * Caching: An ipif or ill pointer may be cached in the conn only after - * making sure that an unplumb has not started. So the caching is done - * while holding both the conn_lock and the ill_lock and after using the - * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED - * flag before starting the cleanup of conns. - * - * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock - * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock - * or a reference to the ipif or a reference to an ire that references the - * ipif. An ipif only changes its ill when migrating from an underlying ill - * to an IPMP ill in ipif_up(). 
- */ -ipif_t * -conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) -{ - ipif_t *ipif; - ill_t *ill; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - *err = 0; - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - mutex_enter(&connp->conn_lock); - ipif = *ipifp; - if (ipif != NULL) { - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ipif); - } else { - *err = IPIF_LOOKUP_FAILED; - } - mutex_exit(&ill->ill_lock); - } - mutex_exit(&connp->conn_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} - -ill_t * -conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) -{ - ill_t *ill; - - *err = 0; - mutex_enter(&connp->conn_lock); - ill = *illp; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (ill); - } else { - *err = ILL_LOOKUP_FAILED; - } - mutex_exit(&ill->ill_lock); - } - mutex_exit(&connp->conn_lock); - return (NULL); -} - -static int -conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) -{ - ill_t *ill; - - ill = ipif->ipif_ill; - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { - *ipifp = ipif; - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (0); - } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (IPIF_LOOKUP_FAILED); -} - -/* - * This is called if the outbound datagram needs fragmentation. - * - * NOTE : This function does not ire_refrele the ire argument passed in. 
- */ -static void -ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, - ip_stack_t *ipst, conn_t *connp) -{ - ipha_t *ipha; - mblk_t *mp; - uint32_t v_hlen_tos_len; - uint32_t max_frag; - uint32_t frag_flag; - boolean_t dont_use; - - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - } else { - mp = ipsec_mp; - } - - ipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#endif - -#ifndef SPEED_BEFORE_SAFETY - /* - * Check that ipha_length is consistent with - * the mblk length - */ - if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { - ip0dbg(("Packet length mismatch: %d, %ld\n", - LENGTH, msgdsize(mp))); - freemsg(ipsec_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_fragmentit: mp %p (%S)", mp, - "packet length mismatch"); - return; - } -#endif - /* - * Don't use frag_flag if pre-built packet or source - * routed or if multicast (since multicast packets do not solicit - * ICMP "packet too big" messages). Get the values of - * max_frag and frag_flag atomically by acquiring the - * ire_lock. - */ - mutex_enter(&ire->ire_lock); - max_frag = ire->ire_max_frag; - frag_flag = ire->ire_frag_flag; - mutex_exit(&ire->ire_lock); - - dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || - (V_HLEN != IP_SIMPLE_HDR_VERSION && - ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); - - ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, - (dont_use ? 0 : frag_flag), zoneid, ipst, connp); -} - -/* * Used for deciding the MSS size for the upper layer. Thus * we need to check the outbound policy values in the conn. 
*/ @@ -21820,10 +11595,10 @@ conn_ipsec_length(conn_t *connp) if (ipl == NULL) return (0); - if (ipl->ipl_out_policy == NULL) + if (connp->conn_ixa->ixa_ipsec_policy == NULL) return (0); - return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); + return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd); } /* @@ -21831,20 +11606,17 @@ conn_ipsec_length(conn_t *connp) * we don't want to call into IPsec to get the exact size. */ int -ipsec_out_extra_length(mblk_t *ipsec_mp) +ipsec_out_extra_length(ip_xmit_attr_t *ixa) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; ipsec_action_t *a; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - if (!io->ipsec_out_secure) + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) return (0); - a = io->ipsec_out_act; - + a = ixa->ixa_ipsec_action; if (a == NULL) { - ASSERT(io->ipsec_out_policy != NULL); - a = io->ipsec_out_policy->ipsp_act; + ASSERT(ixa->ixa_ipsec_policy != NULL); + a = ixa->ixa_ipsec_policy->ipsp_act; } ASSERT(a != NULL); @@ -21852,22 +11624,6 @@ ipsec_out_extra_length(mblk_t *ipsec_mp) } /* - * Returns an estimate of the IPsec headers size. This is used if - * we don't want to call into IPsec to get the exact size. - */ -int -ipsec_in_extra_length(mblk_t *ipsec_mp) -{ - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ipsec_action_t *a; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - a = ii->ipsec_in_action; - return (a == NULL ? 0 : a->ipa_ovhd); -} - -/* * If there are any source route options, return the true final * destination. Otherwise, return the destination. 
*/ @@ -21914,2257 +11670,70 @@ ip_get_dst(ipha_t *ipha) return (dst); } -mblk_t * -ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, - conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) -{ - ipsec_out_t *io; - mblk_t *first_mp; - boolean_t policy_present; - ip_stack_t *ipst; - ipsec_stack_t *ipss; - - ASSERT(ire != NULL); - ipst = ire->ire_ipst; - ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)first_mp->b_rptr; - /* - * ip_wput[_v6] attaches an IPSEC_OUT in two cases. - * - * 1) There is per-socket policy (including cached global - * policy) or a policy on the IP-in-IP tunnel. - * 2) There is no per-socket policy, but it is - * a multicast packet that needs to go out - * on a specific interface. This is the case - * where (ip_wput and ip_wput_multicast) attaches - * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. - * - * In case (2) we check with global policy to - * see if there is a match and set the ill_index - * appropriately so that we can lookup the ire - * properly in ip_wput_ipsec_out. - */ - - /* - * ipsec_out_use_global_policy is set to B_FALSE - * in ipsec_in_to_out(). Refer to that function for - * details. - */ - if ((io->ipsec_out_latch == NULL) && - (io->ipsec_out_use_global_policy)) { - return (ip_wput_attach_policy(first_mp, ipha, ip6h, - ire, connp, unspec_src, zoneid)); - } - if (!io->ipsec_out_secure) { - /* - * If this is not a secure packet, drop - * the IPSEC_OUT mp and treat it as a clear - * packet. This happens when we are sending - * a ICMP reply back to a clear packet. See - * ipsec_in_to_out() for details. - */ - mp = first_mp->b_cont; - freeb(first_mp); - } - return (mp); - } - /* - * See whether we need to attach a global policy here. We - * don't depend on the conn (as it could be null) for deciding - * what policy this datagram should go through because it - * should have happened in ip_wput if there was some - * policy. 
This normally happens for connections which are not - * fully bound preventing us from caching policies in - * ip_bind. Packets coming from the TCP listener/global queue - * - which are non-hard_bound - could also be affected by - * applying policy here. - * - * If this packet is coming from tcp global queue or listener, - * we will be applying policy here. This may not be *right* - * if these packets are coming from the detached connection as - * it could have gone in clear before. This happens only if a - * TCP connection started when there is no policy and somebody - * added policy before it became detached. Thus packets of the - * detached connection could go out secure and the other end - * would drop it because it will be expecting in clear. The - * converse is not true i.e if somebody starts a TCP - * connection and deletes the policy, all the packets will - * still go out with the policy that existed before deleting - * because ip_unbind sends up policy information which is used - * by TCP on subsequent ip_wputs. The right solution is to fix - * TCP to attach a dummy IPSEC_OUT and set - * ipsec_out_use_global_policy to B_FALSE. As this might - * affect performance for normal cases, we are not doing it. - * Thus, set policy before starting any TCP connections. - * - * NOTE - We might apply policy even for a hard bound connection - * - for which we cached policy in ip_bind - if somebody added - * global policy after we inherited the policy in ip_bind. - * This means that the packets that were going out in clear - * previously would start going secure and hence get dropped - * on the other side. To fix this, TCP attaches a dummy - * ipsec_out and make sure that we don't apply global policy. 
- */ - if (ipha != NULL) - policy_present = ipss->ipsec_outbound_v4_policy_present; - else - policy_present = ipss->ipsec_outbound_v6_policy_present; - if (!policy_present) - return (mp); - - return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, - zoneid)); -} - -/* - * This function does the ire_refrele of the ire passed in as the - * argument. As this function looks up more ires i.e broadcast ires, - * it needs to REFRELE them. Currently, for simplicity we don't - * differentiate the one passed in and looked up here. We always - * REFRELE. - * IPQoS Notes: - * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for - * IPsec packets are done in ipsec_out_process. - */ -void -ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, - zoneid_t zoneid) -{ - ipha_t *ipha; -#define rptr ((uchar_t *)ipha) - queue_t *stq; -#define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) - uint32_t v_hlen_tos_len; - uint32_t ttl_protocol; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - ipaddr_t orig_src; - ire_t *ire1; - mblk_t *next_mp; - uint_t hlen; - uint16_t *up; - uint32_t max_frag = ire->ire_max_frag; - ill_t *ill = ire_to_ill(ire); - int clusterwide; - uint16_t ip_hdr_included; /* IP header included by ULP? */ - int ipsec_len; - mblk_t *first_mp; - ipsec_out_t *io; - boolean_t conn_dontroute; /* conn value for multicast */ - boolean_t conn_multicast_loop; /* conn value for multicast */ - boolean_t multicast_forward; /* Should we forward ? 
*/ - boolean_t unspec_src; - ill_t *conn_outgoing_ill = NULL; - ill_t *ire_ill; - ill_t *ire1_ill; - ill_t *out_ill; - uint32_t ill_index = 0; - boolean_t multirt_send = B_FALSE; - int err; - ipxmit_state_t pktxmit_state; - ip_stack_t *ipst = ire->ire_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, - "ip_wput_ire_start: q %p", q); - - multicast_forward = B_FALSE; - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Multirouting case. The bucket where ire is stored - * probably holds other RTF_MULTIRT flagged ire - * to the destination. In this call to ip_wput_ire, - * we attempt to send the packet through all - * those ires. Thus, we first ensure that ire is the - * first RTF_MULTIRT ire in the bucket, - * before walking the ire list. - */ - ire_t *first_ire; - irb_t *irb = ire->ire_bucket; - ASSERT(irb != NULL); - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - IRE_REFHOLD(first_ire); - ire_refrele(ire); - ire = first_ire; - ill = ire_to_ill(ire); - } - IRB_REFRELE(irb); - } - - /* - * conn_outgoing_ill variable is used only in the broadcast loop. 
- * for performance we don't grab the mutexs in the fastpath - */ - if (ire->ire_type == IRE_BROADCAST && connp != NULL && - connp->conn_outgoing_ill != NULL) { - conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ire_refrele(ire); - freemsg(mp); - return; - } - } - - if (mp->b_datap->db_type != M_CTL) { - ipha = (ipha_t *)mp->b_rptr; - } else { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - ipha = (ipha_t *)mp->b_cont->b_rptr; - dst = ipha->ipha_dst; - /* - * For the multicast case, ipsec_out carries conn_dontroute and - * conn_multicast_loop as conn may not be available here. We - * need this for multicast loopback and forwarding which is done - * later in the code. - */ - if (CLASSD(dst)) { - conn_dontroute = io->ipsec_out_dontroute; - conn_multicast_loop = io->ipsec_out_multicast_loop; - /* - * If conn_dontroute is not set or conn_multicast_loop - * is set, we need to do forwarding/loopback. For - * datagrams from ip_wput_multicast, conn_dontroute is - * set to B_TRUE and conn_multicast_loop is set to - * B_FALSE so that we neither do forwarding nor - * loopback. - */ - if (!conn_dontroute || conn_multicast_loop) - multicast_forward = B_TRUE; - } - } - - if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) { - /* - * When a zone sends a packet to another zone, we try to deliver - * the packet under the same conditions as if the destination - * was a real node on the network. To do so, we look for a - * matching route in the forwarding table. - * RTF_REJECT and RTF_BLACKHOLE are handled just like - * ip_newroute() does. - * Note that IRE_LOCAL are special, since they are used - * when the zoneid doesn't match in some cases. 
This means that - * we need to handle ipha_src differently since ire_src_addr - * belongs to the receiving zone instead of the sending zone. - * When ip_restrict_interzone_loopback is set, then - * ire_cache_lookup() ensures that IRE_LOCAL are only used - * for loopback between zones when the logical "Ethernet" would - * have looped them back. - */ - ire_t *src_ire; - - src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, - NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); - if (src_ire != NULL && - !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_lan(ire, src_ire))) { - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ire->ire_src_addr; - ire_refrele(src_ire); - } else { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - if (src_ire != NULL) { - if (src_ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(src_ire); - freemsg(mp); - return; - } - ire_refrele(src_ire); - } - if (ip_hdr_complete(ipha, zoneid, ipst)) { - /* Failed */ - freemsg(mp); - return; - } - icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, - ipst); - return; - } - } - - if (mp->b_datap->db_type == M_CTL || - ipss->ipsec_outbound_v4_policy_present) { - mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, - unspec_src, zoneid); - if (mp == NULL) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* - * Trusted Extensions supports all-zones interfaces, so - * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to - * the global zone. 
- */ - if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - zoneid = io->ipsec_out_zoneid; - } - } - - first_mp = mp; - ipsec_len = 0; - - if (first_mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - mp = first_mp->b_cont; - ipsec_len = ipsec_out_extra_length(first_mp); - ASSERT(ipsec_len >= 0); - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - /* We already picked up the zoneid from the M_CTL above */ - ASSERT(zoneid == io->ipsec_out_zoneid); - - /* - * Drop M_CTL here if IPsec processing is not needed. - * (Non-IPsec use of M_CTL extracted any information it - * needed above). - */ - if (ipsec_len == 0) { - freeb(first_mp); - first_mp = mp; - } - } - - /* - * Fast path for ip_wput_ire - */ - - ipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - dst = ipha->ipha_dst; - - /* - * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED - * if the socket is a SOCK_RAW type. The transport checksum should - * be provided in the pre-built packet, so we don't need to compute it. - * Also, other application set flags, like DF, should not be altered. - * Other transport MUST pass down zero. 
- */ - ip_hdr_included = ipha->ipha_ident; - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - - if (CLASSD(dst)) { - ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", - ntohl(dst), - ip_nv_lookup(ire_nv_tbl, ire->ire_type), - ntohl(ire->ire_addr))); - } - -/* Macros to extract header fields from data already in registers */ -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#define PROTO (ttl_protocol & 0xFF) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#define PROTO (ttl_protocol >> 8) -#endif - - orig_src = src = ipha->ipha_src; - /* (The loop back to "another" is explained down below.) */ -another:; - /* - * Assign an ident value for this packet. We assign idents on - * a per destination basis out of the IRE. There could be - * other threads targeting the same destination, so we have to - * arrange for a atomic increment. Note that we use a 32-bit - * atomic add because it has better performance than its - * 16-bit sibling. - * - * If running in cluster mode and if the source address - * belongs to a replicated service then vector through - * cl_inet_ipident vector to allocate ip identifier - * NOTE: This is a contract private interface with the - * clustering group. 
- */ - clusterwide = 0; - if (cl_inet_ipident) { - ASSERT(cl_inet_isclusterwide); - netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; - - if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, - AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { - ipha->ipha_ident = (*cl_inet_ipident)(stack_id, - IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, - (uint8_t *)(uintptr_t)dst, NULL); - clusterwide = 1; - } - } - if (!clusterwide) { - ipha->ipha_ident = - (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); - } - -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - /* - * Set source address unless sent on an ill or conn_unspec_src is set. - * This is needed to obey conn_unspec_src when packets go through - * ip_newroute + arp. - * Assumes ip_newroute{,_multi} sets the source address as well. - */ - if (src == INADDR_ANY && !unspec_src) { - /* - * Assign the appropriate source address from the IRE if none - * was specified. - */ - ASSERT(ire->ire_ipversion == IPV4_VERSION); - - src = ire->ire_src_addr; - if (connp == NULL) { - ip1dbg(("ip_wput_ire: no connp and no src " - "address for dst 0x%x, using src 0x%x\n", - ntohl(dst), - ntohl(src))); - } - ipha->ipha_src = src; - } - stq = ire->ire_stq; - - /* - * We only allow ire chains for broadcasts since there will - * be multiple IRE_CACHE entries for the same multicast - * address (one per ipif). - */ - next_mp = NULL; - - /* broadcast packet */ - if (ire->ire_type == IRE_BROADCAST) - goto broadcast; - - /* loopback ? 
*/ - if (stq == NULL) - goto nullstq; - - /* The ill_index for outbound ILL */ - ill_index = Q_TO_INDEX(stq); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - ttl_protocol = ((uint16_t *)ipha)[4]; - - /* pseudo checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { - queue_t *dev_q = stq->q_next; - - /* - * For DIRECT_CAPABLE, we do flow control at - * the time of sending the packet. See - * ILL_SEND_TX(). - */ - if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && - (DEV_Q_FLOW_BLOCKED(dev_q))) - goto blocked; - - if ((PROTO == IPPROTO_UDP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - hlen = (V_HLEN & 0xF) << 2; - up = IPH_UDPH_CHECKSUMP(ipha, hlen); - if (*up != 0) { - IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, - hlen, LENGTH, max_frag, ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, - ip_udp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } - } - } else if (ip_hdr_included != IP_HDR_INCLUDED) { - hlen = (V_HLEN & 0xF) << 2; - if (PROTO == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, hlen); - /* - * The packet header is processed once and for all, even - * in the multirouting case. We disable hardware - * checksum if the packet is multirouted, as it will be - * replicated via several interfaces, and not all of - * them may have this capability. - */ - IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, - LENGTH, max_frag, ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } else { - sctp_hdr_t *sctph; - - ASSERT(PROTO == IPPROTO_SCTP); - ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); - sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); - /* - * Zero out the checksum field to ensure proper - * checksum calculation. 
- */ - sctph->sh_chksum = 0; -#ifdef DEBUG - if (!skip_sctp_cksum) -#endif - sctph->sh_chksum = sctp_cksum(mp, hlen); - } - } - - /* - * If this is a multicast packet and originated from ip_wput - * we need to do loopback and forwarding checks. If it comes - * from ip_wput_multicast, we SHOULD not do this. - */ - if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; - - /* checksum */ - cksum += ttl_protocol; - - /* fragment the packet */ - if (max_frag < (uint_t)(LENGTH + ipsec_len)) - goto fragmentit; - /* - * Don't use frag_flag if packet is pre-built or source - * routed or if multicast (since multicast packets do - * not solicit ICMP "packet too big" messages). - */ - if ((ip_hdr_included != IP_HDR_INCLUDED) && - (V_HLEN == IP_SIMPLE_HDR_VERSION || - !ip_source_route_included(ipha)) && - !CLASSD(ipha->ipha_dst)) - ipha->ipha_fragment_offset_and_flags |= - htons(ire->ire_frag_flag); - - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - /* calculate IP header checksum */ - cksum += ipha->ipha_ident; - cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); - cksum += ipha->ipha_fragment_offset_and_flags; - - /* IP options present */ - hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - if (hlen) - goto checksumoptions; - - /* calculate hdr checksum */ - cksum = ((cksum & 0xFFFF) + (cksum >> 16)); - cksum = ~(cksum + (cksum >> 16)); - ipha->ipha_hdr_checksum = (uint16_t)cksum; - } - if (ipsec_len != 0) { - /* - * We will do the rest of the processing after - * we come back from IPsec in ip_wput_ipsec_out(). - */ - ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); - - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_ill_index = - ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; - ipsec_out_process(q, first_mp, ire, 0); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - /* - * In most cases, the emission loop below is entered only - * once. 
Only in the case where the ire holds the - * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT - * flagged ires in the bucket, and send the packet - * through all crossed RTF_MULTIRT routes. - */ - if (ire->ire_flags & RTF_MULTIRT) { - multirt_send = B_TRUE; - } - do { - if (multirt_send) { - irb_t *irb; - /* - * We are in a multiple send case, need to get - * the next ire and make a duplicate of the packet. - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. - */ - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - - /* Got one */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, - mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - - if (mp == NULL) - goto release_ire_and_ill; - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - /* - * On the outbound path the destination zone will be - * unknown as we're sending this packet out on the - * wire. 
- */ - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst, - ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, - ire->ire_ipif->ipif_ill, ipst); - } - mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); - DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); - - pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp); - - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { - ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" - "- packet dropped\n")); -release_ire_and_ill: - ire_refrele(ire); - if (next_mp != NULL) { - freemsg(next_mp); - ire_refrele(ire1); - } - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - if (CLASSD(dst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, - LENGTH); - } - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last copy out"); - IRE_REFRELE(ire); - - if (multirt_send) { - ASSERT(ire1); - /* - * Proceed with the next RTF_MULTIRT ire, - * Also set up the send-to queue accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - ipha = (ipha_t *)mp->b_rptr; - ill_index = Q_TO_INDEX(stq); - ill = (ill_t *)stq->q_ptr; - } - } while (multirt_send); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - - /* - * ire->ire_type == IRE_BROADCAST (minimize diffs) - */ -broadcast: - { - /* - * To avoid broadcast storms, we usually set the TTL to 1 for - * broadcasts. However, if SO_DONTROUTE isn't set, this value - * can be overridden stack-wide through the ip_broadcast_ttl - * ndd tunable, or on a per-connection basis through the - * IP_BROADCAST_TTL socket option. - * - * In the event that we are replying to incoming ICMP packets, - * connp could be NULL. 
- */ - ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; - if (connp != NULL) { - if (connp->conn_dontroute) - ipha->ipha_ttl = 1; - else if (connp->conn_broadcast_ttl != 0) - ipha->ipha_ttl = connp->conn_broadcast_ttl; - } - - /* - * Note that we are not doing a IRB_REFHOLD here. - * Actually we don't care if the list changes i.e - * if somebody deletes an IRE from the list while - * we drop the lock, the next time we come around - * ire_next will be NULL and hence we won't send - * out multiple copies which is fine. - */ - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - ire1 = ire->ire_next; - if (conn_outgoing_ill != NULL) { - while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { - ASSERT(ire1 == ire->ire_next); - if (ire1 != NULL && ire1->ire_addr == dst) { - ire_refrele(ire); - ire = ire1; - IRE_REFHOLD(ire); - ire1 = ire->ire_next; - continue; - } - rw_exit(&ire->ire_bucket->irb_lock); - /* Did not find a matching ill */ - ip1dbg(("ip_wput_ire: broadcast with no " - "matching IP_BOUND_IF ill %s dst %x\n", - conn_outgoing_ill->ill_name, dst)); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - ill_refrele(conn_outgoing_ill); - return; - } - } else if (ire1 != NULL && ire1->ire_addr == dst) { - /* - * If the next IRE has the same address and is not one - * of the two copies that we need to send, try to see - * whether this copy should be sent at all. This - * assumes that we insert loopbacks first and then - * non-loopbacks. This is acheived by inserting the - * loopback always before non-loopback. - * This is used to send a single copy of a broadcast - * packet out all physical interfaces that have an - * matching IRE_BROADCAST while also looping - * back one copy (to ip_wput_local) for each - * matching physical interface. However, we avoid - * sending packets out different logical that match by - * having ipif_up/ipif_down supress duplicate - * IRE_BROADCASTS. 
- * - * This feature is currently used to get broadcasts - * sent to multiple interfaces, when the broadcast - * address being used applies to multiple interfaces. - * For example, a whole net broadcast will be - * replicated on every connected subnet of - * the target net. - * - * Each zone has its own set of IRE_BROADCASTs, so that - * we're able to distribute inbound packets to multiple - * zones who share a broadcast address. We avoid looping - * back outbound packets in different zones but on the - * same ill, as the application would see duplicates. - * - * This logic assumes that ire_add_v4() groups the - * IRE_BROADCAST entries so that those with the same - * ire_addr are kept together. - */ - ire_ill = ire->ire_ipif->ipif_ill; - if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { - while (ire1 != NULL && ire1->ire_addr == dst) { - ire1_ill = ire1->ire_ipif->ipif_ill; - if (ire1_ill != ire_ill) - break; - ire1 = ire1->ire_next; - } - } - } - ASSERT(multirt_send == B_FALSE); - if (ire1 != NULL && ire1->ire_addr == dst) { - if ((ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_flags & RTF_MULTIRT)) { - /* - * We are in the multirouting case. - * The message must be sent at least - * on both ires. These ires have been - * inserted AFTER the standard ones - * in ip_rt_add(). There are thus no - * other ire entries for the destination - * address in the rest of the bucket - * that do not have the RTF_MULTIRT - * flag. We don't process a copy - * of the message here. This will be - * done in the final sending loop. - */ - multirt_send = B_TRUE; - } else { - next_mp = ip_copymsg(first_mp); - if (next_mp != NULL) - IRE_REFHOLD(ire1); - } - } - rw_exit(&ire->ire_bucket->irb_lock); - } - - if (stq) { - /* - * A non-NULL send-to queue means this packet is going - * out of this machine. 
- */ - out_ill = (ill_t *)stq->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); - ttl_protocol = ((uint16_t *)ipha)[4]; - /* - * We accumulate the pseudo header checksum in cksum. - * This is pretty hairy code, so watch close. One - * thing to keep in mind is that UDP and TCP have - * stored their respective datagram lengths in their - * checksum fields. This lines things up real nice. - */ - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - /* - * We assume the udp checksum field contains the - * length, so to compute the pseudo header checksum, - * all we need is the protocol number and src/dst. - */ - /* Provide the checksums for UDP and TCP. */ - if ((PROTO == IPPROTO_TCP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - /* hlen gets the number of uchar_ts in the IP header */ - hlen = (V_HLEN & 0xF) << 2; - up = IPH_TCPH_CHECKSUMP(ipha, hlen); - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, - LENGTH - hlen); - *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); - } else if (PROTO == IPPROTO_SCTP && - (ip_hdr_included != IP_HDR_INCLUDED)) { - sctp_hdr_t *sctph; - - hlen = (V_HLEN & 0xF) << 2; - ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); - sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); - sctph->sh_chksum = 0; -#ifdef DEBUG - if (!skip_sctp_cksum) -#endif - sctph->sh_chksum = sctp_cksum(mp, hlen); - } else { - queue_t *dev_q = stq->q_next; - - if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && - (DEV_Q_FLOW_BLOCKED(dev_q))) { -blocked: - ipha->ipha_ident = ip_hdr_included; - /* - * If we don't have a conn to apply - * backpressure, free the message. - * In the ire_send path, we don't know - * the position to requeue the packet. Rather - * than reorder packets, we just drop this - * packet. 
- */ - if (ipst->ips_ip_output_queue && - connp != NULL && - caller != IRE_SEND) { - if (caller == IP_WSRV) { - idl_tx_list_t *idl_txl; - - idl_txl = - &ipst->ips_idl_tx_list[0]; - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, - first_mp); - conn_drain_insert(connp, - idl_txl); - /* - * This is the service thread, - * and the queue is already - * noenabled. The check for - * canput and the putbq is not - * atomic. So we need to check - * again. - */ - if (canput(stq->q_next)) - connp->conn_did_putbq - = 0; - IP_STAT(ipst, ip_conn_flputbq); - } else { - /* - * We are not the service proc. - * ip_wsrv will be scheduled or - * is already running. - */ - - (void) putq(connp->conn_wq, - first_mp); - } - } else { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(first_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "discard"); - } - ire_refrele(ire); - if (next_mp) { - ire_refrele(ire1); - freemsg(next_mp); - } - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - if ((PROTO == IPPROTO_UDP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - /* - * hlen gets the number of uchar_ts in the - * IP header - */ - hlen = (V_HLEN & 0xF) << 2; - up = IPH_UDPH_CHECKSUMP(ipha, hlen); - max_frag = ire->ire_max_frag; - if (*up != 0) { - IP_CKSUM_XMIT(out_ill, ire, mp, ipha, - up, PROTO, hlen, LENGTH, max_frag, - ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, - ip_udp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } - } - } - /* - * Need to do this even when fragmenting. The local - * loopback can be done without computing checksums - * but forwarding out other interface must be done - * after the IP checksum (and ULP checksums) have been - * computed. - * - * NOTE : multicast_forward is set only if this packet - * originated from ip_wput. 
For packets originating from - * ip_wput_multicast, it is not set. - */ - if (CLASSD(ipha->ipha_dst) && multicast_forward) { -multi_loopback: - ip2dbg(("ip_wput: multicast, loop %d\n", - conn_multicast_loop)); - - /* Forget header checksum offload */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - - /* - * Local loopback of multicasts? Check the - * ill. - * - * Note that the loopback function will not come - * in through ip_rput - it will only do the - * client fanout thus we need to do an mforward - * as well. The is different from the BSD - * logic. - */ - if (ill != NULL) { - if (ilm_lookup_ill(ill, ipha->ipha_dst, - ALL_ZONES) != NULL) { - /* - * Pass along the virtual output q. - * ip_wput_local() will distribute the - * packet to all the matching zones, - * except the sending zone when - * IP_MULTICAST_LOOP is false. - */ - ip_multicast_loopback(q, ill, first_mp, - conn_multicast_loop ? 0 : - IP_FF_NO_MCAST_LOOP, zoneid); - } - } - if (ipha->ipha_ttl == 0) { - /* - * 0 => only to this host i.e. we are - * done. We are also done if this was the - * loopback interface since it is sufficient - * to loopback one copy of a multicast packet. - */ - freemsg(first_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "loopback"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* - * ILLF_MULTICAST is checked in ip_newroute - * i.e. we don't need to check it here since - * all IRE_CACHEs come from ip_newroute. - * For multicast traffic, SO_DONTROUTE is interpreted - * to mean only send the packet out the interface - * (optionally specified with IP_MULTICAST_IF) - * and do not forward it out additional interfaces. - * RSVP and the rsvp daemon is an example of a - * protocol and user level process that - * handles it's own routing. Hence, it uses the - * SO_DONTROUTE option to accomplish this. 
- */ - - if (ipst->ips_ip_g_mrouter && !conn_dontroute && - ill != NULL) { - /* Unconditionally redo the checksum */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - - /* - * If this needs to go out secure, we need - * to wait till we finish the IPsec - * processing. - */ - if (ipsec_len == 0 && - ip_mforward(ill, ipha, mp)) { - freemsg(first_mp); - ip1dbg(("ip_wput: mforward failed\n")); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "mforward failed"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - } - } - max_frag = ire->ire_max_frag; - cksum += ttl_protocol; - if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { - /* No fragmentation required for this one. */ - /* - * Don't use frag_flag if packet is pre-built or source - * routed or if multicast (since multicast packets do - * not solicit ICMP "packet too big" messages). - */ - if ((ip_hdr_included != IP_HDR_INCLUDED) && - (V_HLEN == IP_SIMPLE_HDR_VERSION || - !ip_source_route_included(ipha)) && - !CLASSD(ipha->ipha_dst)) - ipha->ipha_fragment_offset_and_flags |= - htons(ire->ire_frag_flag); - - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - /* Complete the IP header checksum. */ - cksum += ipha->ipha_ident; - cksum += (v_hlen_tos_len >> 16)+ - (v_hlen_tos_len & 0xFFFF); - cksum += ipha->ipha_fragment_offset_and_flags; - hlen = (V_HLEN & 0xF) - - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - if (hlen) { -checksumoptions: - /* - * Account for the IP Options in the IP - * header checksum. 
- */ - up = (uint16_t *)(rptr+ - IP_SIMPLE_HDR_LENGTH); - do { - cksum += up[0]; - cksum += up[1]; - up += 2; - } while (--hlen); - } - cksum = ((cksum & 0xFFFF) + (cksum >> 16)); - cksum = ~(cksum + (cksum >> 16)); - ipha->ipha_hdr_checksum = (uint16_t)cksum; - } - if (ipsec_len != 0) { - ipsec_out_process(q, first_mp, ire, ill_index); - if (!next_mp) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - goto next; - } - - /* - * multirt_send has already been handled - * for broadcast, but not yet for multicast - * or IP options. - */ - if (next_mp == NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - multirt_send = B_TRUE; - } - } - - /* - * In most cases, the emission loop below is - * entered only once. Only in the case where - * the ire holds the RTF_MULTIRT flag, do we loop - * to process all RTF_MULTIRT ires in the bucket, - * and send the packet through all crossed - * RTF_MULTIRT routes. - */ - do { - if (multirt_send) { - irb_t *irb; - - irb = ire->ire_bucket; - ASSERT(irb != NULL); - /* - * We are in a multiple send case, - * need to get the next IRE and make - * a duplicate of the packet. - */ - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & - RTF_MULTIRT)) - continue; - - if (ire1->ire_addr != - ire->ire_addr) - continue; - - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - /* Got one */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) - == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* - * Last multiroute ire; don't loop - * anymore. The emission is over - * and next_mp is NULL. 
- */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - out_ill = ire_to_ill(ire); - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, - ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, - mblk_t *, mp); - if (mp == NULL) - goto release_ire_and_ill_2; - - ASSERT(ipsec_len == 0); - mp->b_prev = - SET_BPREV_FLAG(IPP_LOCAL_OUT); - DTRACE_PROBE2(ip__xmit__2, - mblk_t *, mp, ire_t *, ire); - pktxmit_state = ip_xmit_v4(mp, ire, - NULL, B_TRUE, connp); - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { -release_ire_and_ill_2: - if (next_mp) { - freemsg(next_mp); - ire_refrele(ire1); - } - ire_refrele(ire); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "discard MDATA"); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - if (CLASSD(dst)) { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutMcastPkts); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutMcastOctets, - LENGTH); - } else if (ire->ire_type == IRE_BROADCAST) { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutBcastPkts); - } - - if (multirt_send) { - /* - * We are in a multiple send case, - * need to re-enter the sending loop - * using the next ire. - */ - ire_refrele(ire); - ire = ire1; - stq = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - ipha = (ipha_t *)mp->b_rptr; - ill_index = Q_TO_INDEX(stq); - } - } while (multirt_send); - - if (!next_mp) { - /* - * Last copy going out (the ultra-common - * case). Note that we intentionally replicate - * the putnext rather than calling it before - * the next_mp check in hopes of a little - * tail-call action out of the compiler. 
- */ - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last copy out(1)"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* More copies going out below. */ - } else { - int offset; -fragmentit: - offset = ntohs(ipha->ipha_fragment_offset_and_flags); - /* - * If this would generate a icmp_frag_needed message, - * we need to handle it before we do the IPsec - * processing. Otherwise, we need to strip the IPsec - * headers before we send up the message to the ULPs - * which becomes messy and difficult. - */ - if (ipsec_len != 0) { - if ((max_frag < (unsigned int)(LENGTH + - ipsec_len)) && (offset & IPH_DF)) { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragFails); - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragReqds); - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - icmp_frag_needed(ire->ire_stq, first_mp, - max_frag, zoneid, ipst); - if (!next_mp) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - } else { - /* - * This won't cause a icmp_frag_needed - * message. to be generated. Send it on - * the wire. Note that this could still - * cause fragmentation and all we - * do is the generation of the message - * to the ULP if needed before IPsec. - */ - if (!next_mp) { - ipsec_out_process(q, first_mp, - ire, ill_index); - TRACE_2(TR_FAC_IP, - TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p " - "(%S)", q, - "last ipsec_out_process"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - ipsec_out_process(q, first_mp, - ire, ill_index); - } - } else { - /* - * Initiate IPPF processing. 
For - * fragmentable packets we finish - * all QOS packet processing before - * calling: - * ip_wput_ire_fragmentit->ip_wput_frag - */ - - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, - ill_index); - if (mp == NULL) { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - if (next_mp != NULL) { - freemsg(next_mp); - ire_refrele(ire1); - } - ire_refrele(ire); - TRACE_2(TR_FAC_IP, - TR_IP_WPUT_IRE_END, - "ip_wput_ire: q %p (%S)", - q, "discard MDATA"); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - } - if (!next_mp) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last fragmentation"); - ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst, connp); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst, connp); - } - } - } else { -nullstq: - /* A NULL stq means the destination address is local. */ - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ASSERT(ire->ire_ipif != NULL); - if (!next_mp) { - /* - * Is there an "in" and "out" for traffic local - * to a host (loopback)? The code in Solaris doesn't - * explicitly draw a line in its code for in vs out, - * so we've had to draw a line in the sand: ip_wput_ire - * is considered to be the "output" side and - * ip_wput_local to be the "input" side. - */ - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will - * fire the send probe, but not the receive probe. 
- */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out_end, - mblk_t *, first_mp); - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "local address"); - - if (first_mp != NULL) - ip_wput_local(q, out_ill, ipha, - first_mp, ire, 0, ire->ire_zoneid); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); - - if (first_mp != NULL) - ip_wput_local(q, out_ill, ipha, - first_mp, ire, 0, ire->ire_zoneid); - } -next: - /* - * More copies going out to additional interfaces. - * ire1 has already been held. We don't need the - * "ire" anymore. 
- */ - ire_refrele(ire); - ire = ire1; - ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); - mp = next_mp; - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ill = ire_to_ill(ire); - first_mp = mp; - if (ipsec_len != 0) { - ASSERT(first_mp->b_datap->db_type == M_CTL); - mp = mp->b_cont; - } - dst = ire->ire_addr; - ipha = (ipha_t *)mp->b_rptr; - /* - * Restore src so that we will pick up ire->ire_src_addr if src was 0. - * Restore ipha_ident "no checksum" flag. - */ - src = orig_src; - ipha->ipha_ident = ip_hdr_included; - goto another; - -#undef rptr -#undef Q_TO_INDEX -} - -/* - * Routine to allocate a message that is used to notify the ULP about MDT. - * The caller may provide a pointer to the link-layer MDT capabilities, - * or NULL if MDT is to be disabled on the stream. - */ -mblk_t * -ip_mdinfo_alloc(ill_mdt_capab_t *isrc) -{ - mblk_t *mp; - ip_mdt_info_t *mdti; - ill_mdt_capab_t *idst; - - if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { - DB_TYPE(mp) = M_CTL; - mp->b_wptr = mp->b_rptr + sizeof (*mdti); - mdti = (ip_mdt_info_t *)mp->b_rptr; - mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; - idst = &(mdti->mdt_capab); - - /* - * If the caller provides us with the capability, copy - * it over into our notification message; otherwise - * we zero out the capability portion. - */ - if (isrc != NULL) - bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); - else - bzero((caddr_t)idst, sizeof (*idst)); - } - return (mp); -} - -/* - * Routine which determines whether MDT can be enabled on the destination - * IRE and IPC combination, and if so, allocates and returns the MDT - * notification mblk that may be used by ULP. We also check if we need to - * turn MDT back to 'on' when certain restrictions prohibiting us to allow - * MDT usage in the past have been lifted. This gets called during IP - * and ULP binding. 
- */ -mblk_t * -ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, - ill_mdt_capab_t *mdt_cap) -{ - mblk_t *mp; - boolean_t rc = B_FALSE; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(dst_ire != NULL); - ASSERT(connp != NULL); - ASSERT(mdt_cap != NULL); - - /* - * Currently, we only support simple TCP/{IPv4,IPv6} with - * Multidata, which is handled in tcp_multisend(). This - * is the reason why we do all these checks here, to ensure - * that we don't enable Multidata for the cases which we - * can't handle at the moment. - */ - do { - /* Only do TCP at the moment */ - if (connp->conn_ulp != IPPROTO_TCP) - break; - - /* - * IPsec outbound policy present? Note that we get here - * after calling ipsec_conn_cache_policy() where the global - * policy checking is performed. conn_latch will be - * non-NULL as long as there's a policy defined, - * i.e. conn_out_enforce_policy may be NULL in such case - * when the connection is non-secure, and hence we check - * further if the latch refers to an outbound policy. - */ - if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) - break; - - /* CGTP (multiroute) is enabled? */ - if (dst_ire->ire_flags & RTF_MULTIRT) - break; - - /* Outbound IPQoS enabled? */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - /* - * In this case, we disable MDT for this and all - * future connections going over the interface. - */ - mdt_cap->ill_mdt_on = 0; - break; - } - - /* socket option(s) present? */ - if (!CONN_IS_LSO_MD_FASTPATH(connp)) - break; - - rc = B_TRUE; - /* CONSTCOND */ - } while (0); - - /* Remember the result */ - connp->conn_mdt_ok = rc; - - if (!rc) - return (NULL); - else if (!mdt_cap->ill_mdt_on) { - /* - * If MDT has been previously turned off in the past, and we - * currently can do MDT (due to IPQoS policy removal, etc.) - * then enable it for this interface. 
- */ - mdt_cap->ill_mdt_on = 1; - ip1dbg(("ip_mdinfo_return: reenabling MDT for " - "interface %s\n", ill_name)); - } - - /* Allocate the MDT info mblk */ - if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { - ip0dbg(("ip_mdinfo_return: can't enable Multidata for " - "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); - return (NULL); - } - return (mp); -} - -/* - * Routine to allocate a message that is used to notify the ULP about LSO. - * The caller may provide a pointer to the link-layer LSO capabilities, - * or NULL if LSO is to be disabled on the stream. - */ -mblk_t * -ip_lsoinfo_alloc(ill_lso_capab_t *isrc) -{ - mblk_t *mp; - ip_lso_info_t *lsoi; - ill_lso_capab_t *idst; - - if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { - DB_TYPE(mp) = M_CTL; - mp->b_wptr = mp->b_rptr + sizeof (*lsoi); - lsoi = (ip_lso_info_t *)mp->b_rptr; - lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; - idst = &(lsoi->lso_capab); - - /* - * If the caller provides us with the capability, copy - * it over into our notification message; otherwise - * we zero out the capability portion. - */ - if (isrc != NULL) - bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); - else - bzero((caddr_t)idst, sizeof (*idst)); - } - return (mp); -} - -/* - * Routine which determines whether LSO can be enabled on the destination - * IRE and IPC combination, and if so, allocates and returns the LSO - * notification mblk that may be used by ULP. We also check if we need to - * turn LSO back to 'on' when certain restrictions prohibiting us to allow - * LSO usage in the past have been lifted. This gets called during IP - * and ULP binding. 
- */ -mblk_t * -ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, - ill_lso_capab_t *lso_cap) -{ - mblk_t *mp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(dst_ire != NULL); - ASSERT(connp != NULL); - ASSERT(lso_cap != NULL); - - connp->conn_lso_ok = B_TRUE; - - if ((connp->conn_ulp != IPPROTO_TCP) || - CONN_IPSEC_OUT_ENCAPSULATED(connp) || - (dst_ire->ire_flags & RTF_MULTIRT) || - !CONN_IS_LSO_MD_FASTPATH(connp) || - (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { - connp->conn_lso_ok = B_FALSE; - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - /* - * Disable LSO for this and all future connections going - * over the interface. - */ - lso_cap->ill_lso_on = 0; - } - } - - if (!connp->conn_lso_ok) - return (NULL); - else if (!lso_cap->ill_lso_on) { - /* - * If LSO has been previously turned off in the past, and we - * currently can do LSO (due to IPQoS policy removal, etc.) - * then enable it for this interface. - */ - lso_cap->ill_lso_on = 1; - ip1dbg(("ip_mdinfo_return: reenabling LSO for interface %s\n", - ill_name)); - } - - /* Allocate the LSO info mblk */ - if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) - ip0dbg(("ip_lsoinfo_return: can't enable LSO for " - "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); - - return (mp); -} - -/* - * Create destination address attribute, and fill it with the physical - * destination address and SAP taken from the template DL_UNITDATA_REQ - * message block. 
- */ -boolean_t -ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) -{ - dl_unitdata_req_t *dlurp; - pattr_t *pa; - pattrinfo_t pa_info; - pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; - uint_t das_len, das_off; - - ASSERT(dlmp != NULL); - - dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; - das_len = dlurp->dl_dest_addr_length; - das_off = dlurp->dl_dest_addr_offset; - - pa_info.type = PATTR_DSTADDRSAP; - pa_info.len = sizeof (**das) + das_len - 1; - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - ASSERT(*das != NULL); - (*das)->addr_is_group = 0; - (*das)->addr_len = (uint8_t)das_len; - bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); - } - - return (pa != NULL); -} - -/* - * Create hardware checksum attribute and fill it with the values passed. - */ -boolean_t -ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, - uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) -{ - pattr_t *pa; - pattrinfo_t pa_info; - - ASSERT(mmd != NULL); - - pa_info.type = PATTR_HCKSUM; - pa_info.len = sizeof (pattr_hcksum_t); - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; - - hck->hcksum_start_offset = start_offset; - hck->hcksum_stuff_offset = stuff_offset; - hck->hcksum_end_offset = end_offset; - hck->hcksum_flags = flags; - } - return (pa != NULL); -} - -/* - * Create zerocopy attribute and fill it with the specified flags - */ -boolean_t -ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) -{ - pattr_t *pa; - pattrinfo_t pa_info; - - ASSERT(mmd != NULL); - pa_info.type = PATTR_ZCOPY; - pa_info.len = sizeof (pattr_zcopy_t); - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; - - 
zcopy->zcopy_flags = flags; - } - return (pa != NULL); -} - -/* - * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message - * block chain. We could rewrite to handle arbitrary message block chains but - * that would make the code complicated and slow. Right now there three - * restrictions: - * - * 1. The first message block must contain the complete IP header and - * at least 1 byte of payload data. - * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed - * so that we can use a single Multidata message. - * 3. No frag must be distributed over two or more message blocks so - * that we don't need more than two packet descriptors per frag. - * - * The above restrictions allow us to support userland applications (which - * will send down a single message block) and NFS over UDP (which will - * send down a chain of at most three message blocks). - * - * We also don't use MDT for payloads with less than or equal to - * ip_wput_frag_mdt_min bytes because it would cause too much overhead. - */ -boolean_t -ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) -{ - int blocks; - ssize_t total, missing, size; - - ASSERT(mp != NULL); - ASSERT(hdr_len > 0); - - size = MBLKL(mp) - hdr_len; - if (size <= 0) - return (B_FALSE); - - /* The first mblk contains the header and some payload. */ - blocks = 1; - total = size; - size %= len; - missing = (size == 0) ? 0 : (len - size); - mp = mp->b_cont; - - while (mp != NULL) { - /* - * Give up if we encounter a zero length message block. - * In practice, this should rarely happen and therefore - * not worth the trouble of freeing and re-linking the - * mblk from the chain to handle such case. - */ - if ((size = MBLKL(mp)) == 0) - return (B_FALSE); - - /* Too many payload buffers for a single Multidata message? */ - if (++blocks > MULTIDATA_MAX_PBUFS) - return (B_FALSE); - - total += size; - /* Is a frag distributed over two or more message blocks? 
*/ - if (missing > size) - return (B_FALSE); - size -= missing; - - size %= len; - missing = (size == 0) ? 0 : (len - size); - - mp = mp->b_cont; - } - - return (total > ip_wput_frag_mdt_min); -} - -/* - * Outbound IPv4 fragmentation routine using MDT. - */ -static void -ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, - uint32_t frag_flag, int offset) -{ - ipha_t *ipha_orig; - int i1, ip_data_end; - uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; - mblk_t *hdr_mp, *md_mp = NULL; - unsigned char *hdr_ptr, *pld_ptr; - multidata_t *mmd; - ip_pdescinfo_t pdi; - ill_t *ill; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(MBLKL(mp) > sizeof (ipha_t)); - - ill = ire_to_ill(ire); - ASSERT(ill != NULL); - - ipha_orig = (ipha_t *)mp->b_rptr; - mp->b_rptr += sizeof (ipha_t); - - /* Calculate how many packets we will send out */ - i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); - pkts = (i1 + len - 1) / len; - ASSERT(pkts > 1); - - /* Allocate a message block which will hold all the IP Headers. */ - wroff = ipst->ips_ip_wroff_extra; - hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; - - i1 = pkts * hdr_chunk_len; - /* - * Create the header buffer, Multidata and destination address - * and SAP attribute that should be associated with it. - */ - if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || - ((hdr_mp->b_wptr += i1), - (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || - !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { - freemsg(mp); - if (md_mp == NULL) { - freemsg(hdr_mp); - } else { -free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded); - freemsg(md_mp); - } - IP_STAT(ipst, ip_frag_mdt_allocfail); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; - } - IP_STAT(ipst, ip_frag_mdt_allocd); - - /* - * Add a payload buffer to the Multidata; this operation must not - * fail, or otherwise our logic in this routine is broken. 
There - * is no memory allocation done by the routine, so any returned - * failure simply tells us that we've done something wrong. - * - * A failure tells us that either we're adding the same payload - * buffer more than once, or we're trying to add more buffers than - * allowed. None of the above cases should happen, and we panic - * because either there's horrible heap corruption, and/or - * programming mistake. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - - hdr_ptr = hdr_mp->b_rptr; - pld_ptr = mp->b_rptr; - - /* Establish the ending byte offset, based on the starting offset. */ - offset <<= 3; - ip_data_end = offset + ntohs(ipha_orig->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; - - while (pld_ptr < mp->b_wptr) { - ipha_t *ipha; - uint16_t offset_and_flags; - uint16_t ip_len; - int error; - - ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); - ipha = (ipha_t *)(hdr_ptr + wroff); - ASSERT(OK_32PTR(ipha)); - *ipha = *ipha_orig; - - if (ip_data_end - offset > len) { - offset_and_flags = IPH_MF; - } else { - /* - * Last frag. Set len to the length of this last piece. - */ - len = ip_data_end - offset; - /* A frag of a frag might have IPH_MF non-zero */ - offset_and_flags = - ntohs(ipha->ipha_fragment_offset_and_flags) & - IPH_MF; - } - offset_and_flags |= (uint16_t)(offset >> 3); - offset_and_flags |= (uint16_t)frag_flag; - /* Store the offset and flags in the IP header. */ - ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); - - /* Store the length in the IP header. */ - ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); - ipha->ipha_length = htons(ip_len); - - /* - * Set the IP header checksum. Note that mp is just - * the header, so this is easy to pass to ip_csum. 
- */ - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - - DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, - NULL, int, 0); - - /* - * Record offset and size of header and data of the next packet - * in the multidata message. - */ - PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); - PDESC_PLD_INIT(&pdi); - i1 = MIN(mp->b_wptr - pld_ptr, len); - ASSERT(i1 > 0); - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); - if (i1 == len) { - pld_ptr += len; - } else { - i1 = len - i1; - mp = mp->b_cont; - ASSERT(mp != NULL); - ASSERT(MBLKL(mp) >= i1); - /* - * Attach the next payload message block to the - * multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); - pld_ptr = mp->b_rptr + i1; - } - - if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates that we - * have passed in invalid pdesc info or parameters - * to mmd_addpdesc, which must not happen. - * - * EINVAL is a result of failure on boundary checks - * against the pdesc info contents. It should not - * happen, and we panic because either there's - * horrible heap corruption, and/or programming - * mistake. - */ - if (error != ENOMEM) { - cmn_err(CE_PANIC, "ip_wput_frag_mdt: " - "pdesc logic error detected for " - "mmd %p pinfo %p (%d)\n", - (void *)mmd, (void *)&pdi, error); - /* NOTREACHED */ - } - IP_STAT(ipst, ip_frag_mdt_addpdescfail); - /* Free unattached payload message blocks as well */ - md_mp->b_cont = mp->b_cont; - goto free_mmd; - } - - /* Advance fragment offset. */ - offset += len; - - /* Advance to location for next header in the buffer. */ - hdr_ptr += hdr_chunk_len; - - /* Did we reach the next payload message block? 
*/ - if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { - mp = mp->b_cont; - /* - * Attach the next message block with payload - * data to the multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - pld_ptr = mp->b_rptr; - } - } - - ASSERT(hdr_mp->b_wptr == hdr_ptr); - ASSERT(mp->b_wptr == pld_ptr); - - /* Update IP statistics */ - IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts); - - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); - - len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH; - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len); - - if (pkt_type == OB_PKT) { - ire->ire_ob_pkt_count += pkts; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); - } else { - /* The type is IB_PKT in the forwarding path. */ - ire->ire_ib_pkt_count += pkts; - ASSERT(!IRE_IS_LOCAL(ire)); - if (ire->ire_type & IRE_BROADCAST) { - atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); - } else { - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams, pkts); - atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); - } - } - ire->ire_last_used_time = lbolt; - /* Send it down */ - putnext(ire->ire_stq, md_mp); - return; - -pbuf_panic: - cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " - "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, - pbuf_idx); - /* NOTREACHED */ -} - /* * Outbound IP fragmentation routine. - * - * NOTE : This routine does not ire_refrele the ire that is passed in - * as the argument. + * Assumes the caller has checked whether or not fragmentation should + * be allowed. Here we copy the DF bit from the header to all the generated + * fragments. 
*/ -static void -ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, - uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp) +int +ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, + zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie) { int i1; - mblk_t *ll_hdr_mp; - int ll_hdr_len; int hdr_len; mblk_t *hdr_mp; ipha_t *ipha; int ip_data_end; int len; - mblk_t *mp = mp_orig, *mp1; + mblk_t *mp = mp_orig; int offset; - queue_t *q; - uint32_t v_hlen_tos_len; - mblk_t *first_mp; - boolean_t mctl_present; - ill_t *ill; - ill_t *out_ill; - mblk_t *xmit_mp; + ill_t *ill = nce->nce_ill; + ip_stack_t *ipst = ill->ill_ipst; mblk_t *carve_mp; - ire_t *ire1 = NULL; - ire_t *save_ire = NULL; - mblk_t *next_mp = NULL; - boolean_t last_frag = B_FALSE; - boolean_t multirt_send = B_FALSE; - ire_t *first_ire = NULL; - irb_t *irb = NULL; - mib2_ipIfStatsEntry_t *mibptr = NULL; - - ill = ire_to_ill(ire); - mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; + uint32_t frag_flag; + uint_t priority = mp->b_band; + int error = 0; - BUMP_MIB(mibptr, ipIfStatsOutFragReqds); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); - if (max_frag == 0) { - ip1dbg(("ip_wput_frag: ire frag size is 0" - " - dropping packet\n")); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (pkt_len != msgdsize(mp)) { + ip0dbg(("Packet length mismatch: %d, %ld\n", + pkt_len, msgdsize(mp))); freemsg(mp); - return; + return (EINVAL); } - /* - * IPsec does not allow hw accelerated packets to be fragmented - * This check is made in ip_wput_ipsec_out prior to coming here - * via ip_wput_ire_fragmentit. - * - * If at this point we have an ire whose ARP request has not - * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger - * sending of ARP query and change ire's state to ND_INCOMPLETE. 
- * This packet and all fragmentable packets for this ire will - * continue to get dropped while ire_nce->nce_state remains in - * ND_INCOMPLETE. Post-ARP resolution, after ire's nce_state changes to - * ND_REACHABLE, all subsquent large packets for this ire will - * get fragemented and sent out by this function. - */ - if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { - /* If nce_state is ND_INITIAL, trigger ARP query */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - ip1dbg(("ip_wput_frag: mac address for ire is unresolved" - " - dropping packet\n")); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (max_frag == 0) { + ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: zero max_frag", mp, ill); freemsg(mp); - return; - } - - TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, - "ip_wput_frag_start:"); - - if (mp->b_datap->db_type == M_CTL) { - first_mp = mp; - mp_orig = mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - first_mp = mp; - mctl_present = B_FALSE; + return (EINVAL); } ASSERT(MBLKL(mp) >= sizeof (ipha_t)); ipha = (ipha_t *)mp->b_rptr; + ASSERT(ntohs(ipha->ipha_length) == pkt_len); + frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF; /* - * If the Don't Fragment flag is on, generate an ICMP destination - * unreachable, fragmentation needed. - */ - offset = ntohs(ipha->ipha_fragment_offset_and_flags); - if (offset & IPH_DF) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - if (is_system_labeled()) { - max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag, - ire->ire_max_frag - max_frag, AF_INET); - } - /* - * Need to compute hdr checksum if called from ip_wput_ire. - * Note that ip_rput_forward verifies the checksum before - * calling this routine so in that case this is a noop. 
- */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid, - ipst); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "don't fragment"); - return; - } - /* - * Labeled systems adjust max_frag if they add a label - * to send the correct path mtu. We need the real mtu since we - * are fragmenting the packet after label adjustment. - */ - if (is_system_labeled()) - max_frag = ire->ire_max_frag; - if (mctl_present) - freeb(first_mp); - /* * Establish the starting offset. May not be zero if we are fragging * a fragment that is being forwarded. */ - offset = offset & IPH_OFFSET; + offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET; /* TODO why is this test needed? */ - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - if (((max_frag - LENGTH) & ~7) < 8) { + if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) { /* TODO: notify ulp somehow */ - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: bad starting offset", mp, ill); freemsg(mp); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "len < 8"); - return; + return (EINVAL); } - hdr_len = (V_HLEN & 0xF) << 2; - + hdr_len = IPH_HDR_LENGTH(ipha); ipha->ipha_hdr_checksum = 0; /* @@ -24173,40 +11742,14 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ len = (max_frag - hdr_len) & ~7; - /* Check if we can use MDT to send out the frags. 
*/ - ASSERT(!IRE_IS_LOCAL(ire)); - if (hdr_len == IP_SIMPLE_HDR_LENGTH && - ipst->ips_ip_multidata_outbound && - !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && - ill != NULL && ILL_MDT_CAPABLE(ill) && - IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { - ASSERT(ill->ill_mdt_capab != NULL); - if (!ill->ill_mdt_capab->ill_mdt_on) { - /* - * If MDT has been previously turned off in the past, - * and we currently can do MDT (due to IPQoS policy - * removal, etc.) then enable it for this interface. - */ - ill->ill_mdt_capab->ill_mdt_on = 1; - ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", - ill->ill_name)); - } - ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, - offset); - return; - } - /* Get a copy of the header for the trailing frags */ - hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst, + hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst, mp); - if (!hdr_mp) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (hdr_mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: no hdr_mp", mp, ill); freemsg(mp); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "couldn't copy hdr"); - return; + return (ENOBUFS); } /* Store the starting offset, with the MoreFrags flag. */ @@ -24233,279 +11776,28 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, * original IP header. */ if (!(mp = ip_carve_mp(&mp_orig, i1))) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: could not carve mp", mp_orig, ill); freeb(hdr_mp); freemsg(mp_orig); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "couldn't carve first"); - return; + return (ENOBUFS); } - /* - * Multirouting case. Each fragment is replicated - * via all non-condemned RTF_MULTIRT routes - * currently resolved. 
- * We ensure that first_ire is the first RTF_MULTIRT - * ire in the bucket. - */ - if (ire->ire_flags & RTF_MULTIRT) { - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - multirt_send = B_TRUE; - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if (first_ire != NULL) { - if (first_ire != ire) { - IRE_REFHOLD(first_ire); - /* - * Do not release the ire passed in - * as the argument. - */ - ire = first_ire; - } else { - first_ire = NULL; - } - } - IRB_REFRELE(irb); - - /* - * Save the first ire; we will need to restore it - * for the trailing frags. - * We REFHOLD save_ire, as each iterated ire will be - * REFRELEd. - */ - save_ire = ire; - IRE_REFHOLD(save_ire); - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); - /* - * First fragment emission loop. - * In most cases, the emission loop below is entered only - * once. Only in the case where the ire holds the RTF_MULTIRT - * flag, do we loop to process all RTF_MULTIRT ires in the - * bucket, and send the fragment through all crossed - * RTF_MULTIRT routes. - */ - do { - if (ire->ire_flags & RTF_MULTIRT) { - /* - * We are in a multiple send case, need to get - * the next ire and make a copy of the packet. - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. - * - * We have to take into account the MTU of - * each walked ire. max_frag is set by the - * the caller and generally refers to - * the primary ire entry. Here we ensure that - * no route with a lower MTU will be used, as - * fragments are carved once for all ires, - * then replicated. 
- */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < max_frag) { - ip_multirt_bad_mtu(ire1, max_frag); - continue; - } - - /* Got one. */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - ll_hdr_len = 0; - LOCK_IRE_FP_MP(ire); - ll_hdr_mp = ire->ire_nce->nce_fp_mp; - if (ll_hdr_mp != NULL) { - ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); - ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; - } else { - ll_hdr_mp = ire->ire_nce->nce_res_mp; - } - - /* If there is a transmit header, get a copy for this frag. */ - /* - * TODO: should check db_ref before calling ip_carve_mp since - * it might give us a dup. - */ - if (!ll_hdr_mp) { - /* No xmit header. */ - xmit_mp = mp; - - /* We have a link-layer header that can fit in our mblk. 
*/ - } else if (mp->b_datap->db_ref == 1 && - ll_hdr_len != 0 && - ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { - /* M_DATA fastpath */ - mp->b_rptr -= ll_hdr_len; - bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); - xmit_mp = mp; - - /* Corner case if copyb has failed */ - } else if (!(xmit_mp = copyb(ll_hdr_mp))) { - UNLOCK_IRE_FP_MP(ire); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - freeb(hdr_mp); - freemsg(mp); - freemsg(mp_orig); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "discard"); - - if (multirt_send) { - ASSERT(ire1); - ASSERT(next_mp); - - freemsg(next_mp); - ire_refrele(ire1); - } - if (save_ire != NULL) - IRE_REFRELE(save_ire); - - if (first_ire != NULL) - ire_refrele(first_ire); - return; - - /* - * Case of res_mp OR the fastpath mp can't fit - * in the mblk - */ - } else { - xmit_mp->b_cont = mp; - - /* - * Get priority marking, if any. - * We propagate the CoS marking from the - * original packet that went to QoS processing - * in ip_wput_ire to the newly carved mp. 
- */ - if (DB_TYPE(xmit_mp) == M_DATA) - xmit_mp->b_band = mp->b_band; - } - UNLOCK_IRE_FP_MP(ire); - - q = ire->ire_stq; - out_ill = (ill_t *)q->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, xmit_mp); - - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp); - - if (xmit_mp != NULL) { - DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); - - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp); - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutOctets, i1); - - if (pkt_type != OB_PKT) { - /* - * Update the packet count and MIB stats - * of trailing RTF_MULTIRT ires. - */ - UPDATE_OB_PKT_COUNT(ire); - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragReqds); - } - } - - if (multirt_send) { - /* - * We are in a multiple send case; look for - * the next ire and re-enter the loop. 
- */ - ASSERT(ire1); - ASSERT(next_mp); - /* REFRELE the current ire before looping */ - ire_refrele(ire); - ire = ire1; - ire1 = NULL; - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - - ASSERT(ire1 == NULL); - - /* Restore the original ire; we need it for the trailing frags */ - if (save_ire != NULL) { - /* REFRELE the last iterated ire */ - ire_refrele(ire); - /* save_ire has been REFHOLDed */ - ire = save_ire; - save_ire = NULL; - q = ire->ire_stq; + error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid, + ixa_cookie); + if (error != 0 && error != EWOULDBLOCK) { + /* No point in sending the other fragments */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: postfragfn failed", mp_orig, ill); + freeb(hdr_mp); + freemsg(mp_orig); + return (error); } - if (pkt_type == OB_PKT) { - UPDATE_OB_PKT_COUNT(ire); - } else { - out_ill = (ill_t *)q->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - UPDATE_IB_PKT_COUNT(ire); - } + /* No need to redo state machine in loop */ + ixaflags &= ~IXAF_REACH_CONF; /* Advance the offset to the second frag starting point. */ offset += len; @@ -24547,7 +11839,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, break; } /* Get priority marking, if any. */ - mp->b_band = carve_mp->b_band; + mp->b_band = priority; mp->b_cont = carve_mp; } ipha = (ipha_t *)mp->b_rptr; @@ -24581,7 +11873,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, } else { mp = hdr_mp; /* Get priority marking, if any. */ - mp->b_band = carve_mp->b_band; + mp->b_band = priority; mp->b_cont = carve_mp; } ipha = (ipha_t *)mp->b_rptr; @@ -24605,254 +11897,40 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - /* Attach a transmit header, if any, and ship it. 
*/ - if (pkt_type == OB_PKT) { - UPDATE_OB_PKT_COUNT(ire); - } else { - out_ill = (ill_t *)q->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams); - UPDATE_IB_PKT_COUNT(ire); - } - - if (ire->ire_flags & RTF_MULTIRT) { - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - multirt_send = B_TRUE; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); - /* - * Save the original ire; we will need to restore it - * for the tailing frags. - */ - save_ire = ire; - IRE_REFHOLD(save_ire); + error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone, + nolzid, ixa_cookie); + /* All done if we just consumed the hdr_mp. */ + if (mp == hdr_mp) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); + return (error); } - /* - * Emission loop for this fragment, similar - * to what is done for the first fragment. - */ - do { - if (multirt_send) { - /* - * We are in a multiple send case, need to get - * the next ire and make a copy of the packet. - */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & RTF_MULTIRT)) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < max_frag) { - ip_multirt_bad_mtu(ire1, - max_frag); - continue; - } - - /* Got one. */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. 
*/ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - /* Update transmit header */ - ll_hdr_len = 0; - LOCK_IRE_FP_MP(ire); - ll_hdr_mp = ire->ire_nce->nce_fp_mp; - if (ll_hdr_mp != NULL) { - ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); - ll_hdr_len = MBLKL(ll_hdr_mp); - } else { - ll_hdr_mp = ire->ire_nce->nce_res_mp; - } - - if (!ll_hdr_mp) { - xmit_mp = mp; - - /* - * We have link-layer header that can fit in - * our mblk. - */ - } else if (mp->b_datap->db_ref == 1 && - ll_hdr_len != 0 && - ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { - /* M_DATA fastpath */ - mp->b_rptr -= ll_hdr_len; - bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, - ll_hdr_len); - xmit_mp = mp; - - /* - * Case of res_mp OR the fastpath mp can't fit - * in the mblk - */ - } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { - xmit_mp->b_cont = mp; - /* Get priority marking, if any. */ - if (DB_TYPE(xmit_mp) == M_DATA) - xmit_mp->b_band = mp->b_band; - - /* Corner case if copyb failed */ - } else { - /* - * Exit both the replication and - * fragmentation loops. 
- */ - UNLOCK_IRE_FP_MP(ire); - goto drop_pkt; - } - UNLOCK_IRE_FP_MP(ire); - - mp1 = mp; - out_ill = (ill_t *)q->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, xmit_mp); - - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__out__end, - mblk_t *, xmit_mp); - - if (mp != mp1 && hdr_mp == mp1) - hdr_mp = mp; - if (mp != mp1 && mp_orig == mp1) - mp_orig = mp; - - if (xmit_mp != NULL) { - DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, - NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, out_ill, ipha_t *, - ipha, ip6_t *, NULL, int, 0); - - ILL_SEND_TX(out_ill, ire, connp, - xmit_mp, 0, connp); - - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutOctets, ip_len); - - if (pkt_type != OB_PKT) { - /* - * Update the packet count of trailing - * RTF_MULTIRT ires. - */ - UPDATE_OB_PKT_COUNT(ire); - } - } - - /* All done if we just consumed the hdr_mp. */ - if (mp == hdr_mp) { - last_frag = B_TRUE; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragOKs); - } - - if (multirt_send) { - /* - * We are in a multiple send case; look for - * the next ire and re-enter the loop. 
- */ - ASSERT(ire1); - ASSERT(next_mp); - /* REFRELE the current ire before looping */ - ire_refrele(ire); - ire = ire1; - ire1 = NULL; - q = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - /* - * Restore the original ire; we need it for the - * trailing frags - */ - if (save_ire != NULL) { - ASSERT(ire1 == NULL); - /* REFRELE the last iterated ire */ - ire_refrele(ire); - /* save_ire has been REFHOLDed */ - ire = save_ire; - q = ire->ire_stq; - save_ire = NULL; + if (error != 0 && error != EWOULDBLOCK) { + DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill, + mblk_t *, hdr_mp); + /* No point in sending the other fragments */ + break; } - if (last_frag) { - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "consumed hdr_mp"); - - if (first_ire != NULL) - ire_refrele(first_ire); - return; - } /* Otherwise, advance and loop. */ offset += len; } - -drop_pkt: /* Clean up following allocation failure. */ - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - freemsg(mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: loop ended", NULL, ill); if (mp != hdr_mp) freeb(hdr_mp); if (mp != mp_orig) freemsg(mp_orig); - - if (save_ire != NULL) - IRE_REFRELE(save_ire); - if (first_ire != NULL) - ire_refrele(first_ire); - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "end--alloc failure"); + return (error); } /* * Copy the header plus those options which have the copy bit set - * src is the template to make sure we preserve the cred for TX purposes. */ static mblk_t * -ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, +ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, mblk_t *src) { mblk_t *mp; @@ -24908,310 +11986,13 @@ ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, } /* - * Delivery to local recipients including fanout to multiple recipients. - * Does not do checksumming of UDP/TCP. 
- * Note: q should be the read side queue for either the ill or conn. - * Note: rq should be the read side q for the lower (ill) stream. - * We don't send packets to IPPF processing, thus the last argument - * to all the fanout calls are B_FALSE. - */ -void -ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, - int fanout_flags, zoneid_t zoneid) -{ - uint32_t protocol; - mblk_t *first_mp; - boolean_t mctl_present; - int ire_type; -#define rptr ((uchar_t *)ipha) - ip_stack_t *ipst = ill->ill_ipst; - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, - "ip_wput_local_start: q %p", q); - - if (ire != NULL) { - ire_type = ire->ire_type; - } else { - /* - * Only ip_multicast_loopback() calls us with a NULL ire. If the - * packet is not multicast, we can't tell the ire type. - */ - ASSERT(CLASSD(ipha->ipha_dst)); - ire_type = IRE_BROADCAST; - } - - first_mp = mp; - if (first_mp->b_datap->db_type == M_CTL) { - ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * This ipsec_out_t was allocated in ip_wput - * for multicast packets to store the ill_index. - * As this is being delivered locally, we don't - * need this anymore. - */ - mp = first_mp->b_cont; - freeb(first_mp); - first_mp = mp; - mctl_present = B_FALSE; - } else { - /* - * Convert IPSEC_OUT to IPSEC_IN, preserving all - * security properties for the looped-back packet. 
- */ - mctl_present = B_TRUE; - mp = first_mp->b_cont; - ASSERT(mp != NULL); - ipsec_out_to_in(first_mp); - } - } else { - mctl_present = B_FALSE; - } - - DTRACE_PROBE4(ip4__loopback__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_in_event, - ipst->ips_ipv4firewall_loopback_in, - ill, NULL, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES; - zoneid_t stackzoneid = netstackid_to_zoneid( - ipst->ips_netstack->netstack_stackid); - - dzone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid; - /* - * 127.0.0.1 is special, as we cannot lookup its zoneid by - * address. Restrict the lookup below to the destination zone. - */ - if (ipha->ipha_src == ntohl(INADDR_LOOPBACK)) - lookup_zoneid = zoneid; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst, - lookup_zoneid); - ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); - } - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, - int, 1); - - ipst->ips_loopback_packets++; - - ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", - ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); - if (!IS_SIMPLE_IPH(ipha)) { - ip_wput_local_options(ipha, ipst); - } - - protocol = ipha->ipha_protocol; - switch (protocol) { - case IPPROTO_ICMP: { - ire_t *ire_zone; - ilm_t *ilm; - mblk_t *mp1; - zoneid_t last_zoneid; - ilm_walker_t ilw; - - if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { - ASSERT(ire_type == IRE_BROADCAST); - /* - * In the multicast case, applications may have joined - * the group from different zones, so we need to deliver - * the packet to each of them. 
Loop through the - * multicast memberships structures (ilm) on the receive - * ill and send a copy of the packet up each matching - * one. However, we don't do this for multicasts sent on - * the loopback interface (PHYI_LOOPBACK flag set) as - * they must stay in the sender's zone. - * - * ilm_add_v6() ensures that ilms in the same zone are - * contiguous in the ill_ilm list. We use this property - * to avoid sending duplicates needed when two - * applications in the same zone join the same group on - * different logical interfaces: we ignore the ilm if - * it's zoneid is the same as the last matching one. - * In addition, the sending of the packet for - * ire_zoneid is delayed until all of the other ilms - * have been exhausted. - */ - last_zoneid = -1; - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ipha->ipha_dst != ilm->ilm_addr || - ilm->ilm_zoneid == last_zoneid || - ilm->ilm_zoneid == zoneid || - !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) - continue; - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, - 0, 0, mctl_present, B_FALSE, ill, - ilm->ilm_zoneid); - last_zoneid = ilm->ilm_zoneid; - } - ilm_walker_finish(&ilw); - /* - * Loopback case: the sending endpoint has - * IP_MULTICAST_LOOP disabled, therefore we don't - * dispatch the multicast packet to the sending zone. - */ - if (fanout_flags & IP_FF_NO_MCAST_LOOP) { - freemsg(first_mp); - return; - } - } else if (ire_type == IRE_BROADCAST) { - /* - * In the broadcast case, there may be many zones - * which need a copy of the packet delivered to them. - * There is one IRE_BROADCAST per broadcast address - * and per zone; we walk those using a helper function. - * In addition, the sending of the packet for zoneid is - * delayed until all of the other ires have been - * processed. 
- */ - IRB_REFHOLD(ire->ire_bucket); - ire_zone = NULL; - while ((ire_zone = ire_get_next_bcast_ire(ire_zone, - ire)) != NULL) { - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - - UPDATE_IB_PKT_COUNT(ire_zone); - ire_zone->ire_last_used_time = lbolt; - icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, - mctl_present, B_FALSE, ill, - ire_zone->ire_zoneid); - } - IRB_REFRELE(ire->ire_bucket); - } - icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, - 0, mctl_present, B_FALSE, ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", - q, "icmp"); - return; - } - case IPPROTO_IGMP: - if ((mp = igmp_input(q, mp, ill)) == NULL) { - /* Bad packet - discarded by igmp_input */ - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", - q, "igmp_input--bad packet"); - if (mctl_present) - freeb(first_mp); - return; - } - /* - * igmp_input() may have returned the pulled up message. - * So first_mp and ipha need to be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - /* deliver to local raw users */ - break; - case IPPROTO_ENCAP: - /* - * This case is covered by either ip_fanout_proto, or by - * the above security processing for self-tunneled packets. - */ - break; - case IPPROTO_UDP: { - uint16_t *up; - uint32_t ports; - - up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + - UDP_PORTS_OFFSET); - /* Force a 'valid' checksum. */ - up[3] = 0; - - ports = *(uint32_t *)up; - ip_fanout_udp(q, first_mp, ill, ipha, ports, - (ire_type == IRE_BROADCAST), - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE, - ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); - return; - } - case IPPROTO_TCP: { - - /* - * For TCP, discard broadcast packets. 
- */ - if ((ushort_t)ire_type == IRE_BROADCAST) { - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip2dbg(("ip_wput_local: discard broadcast\n")); - return; - } - - if (mp->b_datap->db_type == M_DATA) { - /* - * M_DATA mblk, so init mblk (chain) for no struio(). - */ - mblk_t *mp1 = mp; - - do { - mp1->b_datap->db_struioflag = 0; - } while ((mp1 = mp1->b_cont) != NULL); - } - ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) - <= mp->b_wptr); - ip_fanout_tcp(q, first_mp, ill, ipha, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_SYN_ADDIRE | IP_FF_IPINFO, - mctl_present, B_FALSE, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); - return; - } - case IPPROTO_SCTP: - { - uint32_t ports; - - bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); - ip_fanout_sctp(first_mp, ill, ipha, ports, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_IPINFO, mctl_present, B_FALSE, zoneid); - return; - } - - default: - break; - } - /* - * Find a client for some other protocol. We give - * copies to multiple clients, if more than one is - * bound. - */ - ip_fanout_proto(q, first_mp, ill, ipha, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, - mctl_present, B_FALSE, ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); -#undef rptr -} - -/* - * Update any source route, record route, or timestamp options. + * Update any source route, record route, or timestamp options when + * sending a packet back to ourselves. * Check that we are at end of strict source route. - * The options have been sanity checked by ip_wput_options(). + * The options have been sanity checked by ip_output_options(). 
*/ -static void -ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) +void +ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst) { ipoptp_t opts; uchar_t *opt; @@ -25219,10 +12000,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) uint8_t optlen; ipaddr_t dst; uint32_t ts; - ire_t *ire; timestruc_t now; - ip2dbg(("ip_wput_local_options\n")); for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { @@ -25246,7 +12025,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) * it is a packet with a loose source route which * reaches us before consuming the whole source route */ - ip1dbg(("ip_wput_local_options: not end of SR\n")); + if (optval == IPOPT_SSRR) { return; } @@ -25267,7 +12046,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_wput_forward_options: end of RR\n")); + "ip_output_local_options: end of RR\n")); break; } dst = htonl(INADDR_LOOPBACK); @@ -25285,14 +12064,10 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -25302,8 +12077,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_wput_local_options: " - "unknown IT - bug in ip_wput_options?\n"); + cmn_err(CE_PANIC, "ip_output_local_options: " + "unknown IT - bug in ip_output_options?\n"); return; /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { @@ -25339,1098 +12114,240 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) } /* - * Send out a multicast packet on interface ipif. - * The sender does not have an conn. - * Caller verifies that this isn't a PHYI_LOOPBACK. - */ -void -ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) -{ - ipha_t *ipha; - ire_t *ire; - ipaddr_t dst; - mblk_t *first_mp; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* igmp_sendpkt always allocates a ipsec_out_t */ - ASSERT(mp->b_datap->db_type == M_CTL); - ASSERT(!ipif->ipif_isv6); - ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); - - first_mp = mp; - mp = first_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_DATA); - ipha = (ipha_t *)mp->b_rptr; - - /* - * Find an IRE which matches the destination and the outgoing - * queue (i.e. the outgoing interface.) - */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) - dst = ipif->ipif_pp_dst_addr; - else - dst = ipha->ipha_dst; - /* - * The source address has already been initialized by the - * caller and hence matching on ILL (MATCH_IRE_ILL) would - * be sufficient rather than MATCH_IRE_IPIF. - * - * This function is used for sending IGMP packets. For IPMP, - * we sidestep IGMP snooping issues by sending all multicast - * traffic on a single interface in the IPMP group. - */ - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, - MATCH_IRE_ILL, ipst); - if (!ire) { - /* - * Mark this packet to make it be delivered to - * ip_wput_ire after the new ire has been - * created. 
- */ - mp->b_prev = NULL; - mp->b_next = NULL; - ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, - zoneid, &zero_info); - return; - } - - /* - * Honor the RTF_SETSRC flag; this is the only case - * where we force this addr whatever the current src addr is, - * because this address is set by igmp_sendpkt(), and - * cannot be specified by any user. - */ - if (ire->ire_flags & RTF_SETSRC) { - ipha->ipha_src = ire->ire_src_addr; - } - - ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); -} - -/* - * NOTE : This function does not ire_refrele the ire argument passed in. + * Prepend an M_DATA fastpath header, and if none present prepend a + * DL_UNITDATA_REQ. Frees the mblk on failure. + * + * nce_dlur_mp and nce_fp_mp can not disappear once they have been set. + * If there is a change to them, the nce will be deleted (condemned) and + * a new nce_t will be created when packets are sent. Thus we need no locks + * to access those fields. * - * Copy the link layer header and do IPQoS if needed. Frees the mblk on - * failure. The nce_fp_mp can vanish any time in the case of - * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold - * the ire_lock to access the nce_fp_mp in this case. - * IPQoS assumes that the first M_DATA contains the IP header. So, if we are - * prepending a fastpath message IPQoS processing must precede it, we also set - * the b_band of the fastpath message to that of the mblk returned by IPQoS - * (IPQoS might have set the b_band for CoS marking). - * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing - * must follow it so that IPQoS can mark the dl_priority field for CoS - * marking, if needed. + * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended + * we place b_band in dl_priority.dl_max. 
*/ static mblk_t * -ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, - uint32_t ill_index, ipha_t **iphap) +ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce) { uint_t hlen; - ipha_t *ipha; mblk_t *mp1; - boolean_t qos_done = B_FALSE; - uchar_t *ll_hdr; - ip_stack_t *ipst = ire->ire_ipst; + uint_t priority; + uchar_t *rptr; -#define rptr ((uchar_t *)ipha) + rptr = mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; - hlen = 0; - LOCK_IRE_FP_MP(ire); - if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { - ASSERT(DB_TYPE(mp1) == M_DATA); - /* Initiate IPPF processing */ - if ((proc != 0) && IPP_ENABLED(proc, ipst)) { - UNLOCK_IRE_FP_MP(ire); - ip_process(proc, &mp, ill_index); - if (mp == NULL) - return (NULL); + ASSERT(DB_TYPE(mp) == M_DATA); + priority = mp->b_band; - ipha = (ipha_t *)mp->b_rptr; - LOCK_IRE_FP_MP(ire); - if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { - qos_done = B_TRUE; - goto no_fp_mp; - } - ASSERT(DB_TYPE(mp1) == M_DATA); - } + ASSERT(nce != NULL); + if ((mp1 = nce->nce_fp_mp) != NULL) { hlen = MBLKL(mp1); /* * Check if we have enough room to prepend fastpath * header */ if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { - ll_hdr = rptr - hlen; - bcopy(mp1->b_rptr, ll_hdr, hlen); + rptr -= hlen; + bcopy(mp1->b_rptr, rptr, hlen); /* * Set the b_rptr to the start of the link layer * header */ - mp->b_rptr = ll_hdr; - mp1 = mp; - } else { - mp1 = copyb(mp1); - if (mp1 == NULL) - goto unlock_err; - mp1->b_band = mp->b_band; - mp1->b_cont = mp; - /* - * XXX disable ICK_VALID and compute checksum - * here; can happen if nce_fp_mp changes and - * it can't be copied now due to insufficient - * space. 
(unlikely, fp mp can change, but it - * does not increase in length) - */ + mp->b_rptr = rptr; + return (mp); } - UNLOCK_IRE_FP_MP(ire); - } else { -no_fp_mp: - mp1 = copyb(ire->ire_nce->nce_res_mp); + mp1 = copyb(mp1); if (mp1 == NULL) { -unlock_err: - UNLOCK_IRE_FP_MP(ire); + ill_t *ill = nce->nce_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); return (NULL); } - UNLOCK_IRE_FP_MP(ire); + mp1->b_band = priority; mp1->b_cont = mp; - if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { - ip_process(proc, &mp1, ill_index); - if (mp1 == NULL) - return (NULL); - - if (mp1->b_cont == NULL) - ipha = NULL; - else - ipha = (ipha_t *)mp1->b_cont->b_rptr; - } - } - - *iphap = ipha; - return (mp1); -#undef rptr -} - -/* - * Finish the outbound IPsec processing for an IPv6 packet. This function - * is called from ipsec_out_process() if the IPsec packet was processed - * synchronously, or from {ah,esp}_kcf_callback() if it was processed - * asynchronously. - */ -void -ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, - ire_t *ire_arg) -{ - in6_addr_t *v6dstp; - ire_t *ire; - mblk_t *mp; - ip6_t *ip6h1; - uint_t ill_index; - ipsec_out_t *io; - boolean_t hwaccel; - uint32_t flags = IP6_NO_IPPOLICY; - int match_flags; - zoneid_t zoneid; - boolean_t ill_need_rele = B_FALSE; - boolean_t ire_need_rele = B_FALSE; - ip_stack_t *ipst; - - mp = ipsec_mp->b_cont; - ip6h1 = (ip6_t *)mp->b_rptr; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_ns != NULL); - ipst = io->ipsec_out_ns->netstack_ip; - ill_index = io->ipsec_out_ill_index; - if (io->ipsec_out_reachable) { - flags |= IPV6_REACHABILITY_CONFIRMATION; - } - hwaccel = io->ipsec_out_accelerated; - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - /* Multicast addresses should have non-zero ill_index. 
*/ - v6dstp = &ip6h->ip6_dst; - ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); - ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); - - if (ill == NULL && ill_index != 0) { - ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - ASSERT(mp != NULL); - - if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - boolean_t unspec_src; - ipif_t *ipif; - - /* - * Use the ill_index to get the right ill. - */ - unspec_src = io->ipsec_out_unspec_src; - (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); - if (ipif == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - ire_need_rele = B_TRUE; - } - if (ire != NULL) { - ipif_refrele(ipif); - /* - * XXX Do the multicast forwarding now, as the IPsec - * processing has been done. - */ - goto send; - } - - ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - ipif_refrele(ipif); - return; - } - - ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, - unspec_src, zoneid); - ipif_refrele(ipif); - } else { - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); - ire_need_rele = B_TRUE; - } - if (ire != NULL) - goto send; - /* - * ire disappeared underneath. - * - * What we need to do here is the ip_newroute - * logic to get the ire without doing the IPsec - * processing. Follow the same old path. But this - * time, ip_wput or ire_add_then_send will call us - * directly as all the IPsec operations are done. 
- */ - ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - - ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, - zoneid, ipst); - } - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - return; -send: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - - /* Local delivery */ - if (ire->ire_stq == NULL) { - ill_t *out_ill; - ASSERT(q != NULL); - - /* PFHooks: LOOPBACK_OUT */ - out_ill = ire_to_ill(ire); - + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1); /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. + * XXX disable ICK_VALID and compute checksum + * here; can happen if nce_fp_mp changes and + * it can't be copied now due to insufficient + * space. (unlikely, fp mp can change, but it + * does not increase in length) */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, NULL, ip6_t *, ip6h, int, 1); - - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ip6_t *, ip6h1, mblk_t *, ipsec_mp); - - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); - - if (ipsec_mp != NULL) { - ip_wput_local_v6(RD(q), out_ill, - ip6h, ipsec_mp, ire, 0, zoneid); - } - if (ire_need_rele) - ire_refrele(ire); - return; - } - /* - * Everything is done. Send it out on the wire. 
- * We force the insertion of a fragment header using the - * IPH_FRAG_HDR flag in two cases: - * - after reception of an ICMPv6 "packet too big" message - * with a MTU < 1280 (cf. RFC 2460 section 5) - * - for multirouted IPv6 packets, so that the receiver can - * discard duplicates according to their fragment identifier - */ - /* XXX fix flow control problems. */ - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - if (hwaccel) { - /* - * hardware acceleration does not handle these - * "slow path" cases. - */ - /* IPsec KSTATS: should bump bean counter here. */ - if (ire_need_rele) - ire_refrele(ire); - freemsg(ipsec_mp); - return; - } - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != - (mp->b_cont ? msgdsize(mp) : - mp->b_wptr - (uchar_t *)ip6h)) { - /* IPsec KSTATS: should bump bean counter here. */ - ip0dbg(("Packet length mismatch: %d, %ld\n", - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - msgdsize(mp))); - if (ire_need_rele) - ire_refrele(ire); - freemsg(ipsec_mp); - return; - } - ASSERT(mp->b_prev == NULL); - ip2dbg(("Fragmenting Size = %d, mtu = %d\n", - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN, ire->ire_max_frag)); - ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, - ire->ire_max_frag); - } else { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); + return (mp1); } - if (ire_need_rele) - ire_refrele(ire); - freeb(ipsec_mp); -} + mp1 = copyb(nce->nce_dlur_mp); -void -ipsec_hw_putnext(queue_t *q, mblk_t *mp) -{ - mblk_t *hada_mp; /* attributes M_CTL mblk */ - da_ipsec_t *hada; /* data attributes */ - ill_t *ill = (ill_t *)q->q_ptr; - - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); + if (mp1 == NULL) { + ill_t *ill = nce->nce_ill; - if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { - /* IPsec KSTATS: Bump lose counter here! 
*/ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); - return; + return (NULL); } - - /* - * It's an IPsec packet that must be - * accelerated by the Provider, and the - * outbound ill is IPsec acceleration capable. - * Prepends the mblk with an IPHADA_M_CTL, and ship it - * to the ill. - * IPsec KSTATS: should bump packet counter here. - */ - - hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); - if (hada_mp == NULL) { - /* IPsec KSTATS: should bump packet counter here. */ - freemsg(mp); - return; + mp1->b_cont = mp; + if (priority != 0) { + mp1->b_band = priority; + ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max = + priority; } - - hada_mp->b_datap->db_type = M_CTL; - hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); - hada_mp->b_cont = mp; - - hada = (da_ipsec_t *)hada_mp->b_rptr; - bzero(hada, sizeof (da_ipsec_t)); - hada->da_type = IPHADA_M_CTL; - - putnext(q, hada_mp); + return (mp1); +#undef rptr } /* * Finish the outbound IPsec processing. This function is called from * ipsec_out_process() if the IPsec packet was processed - * synchronously, or from {ah,esp}_kcf_callback() if it was processed + * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed * asynchronously. + * + * This is common to IPv4 and IPv6. 
*/ -void -ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, - ire_t *ire_arg) +int +ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa) { - uint32_t v_hlen_tos_len; - ipaddr_t dst; - ipif_t *ipif = NULL; - ire_t *ire; - ire_t *ire1 = NULL; - mblk_t *next_mp = NULL; - uint32_t max_frag; - boolean_t multirt_send = B_FALSE; - mblk_t *mp; - ipha_t *ipha1; - uint_t ill_index; - ipsec_out_t *io; - int match_flags; - irb_t *irb = NULL; - boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; - zoneid_t zoneid; - ipxmit_state_t pktxmit_state; - ip_stack_t *ipst; - -#ifdef _BIG_ENDIAN -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#else -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#endif + iaflags_t ixaflags = ixa->ixa_flags; + uint_t pktlen; - mp = ipsec_mp->b_cont; - ipha1 = (ipha_t *)mp->b_rptr; - ASSERT(mp != NULL); - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - dst = ipha->ipha_dst; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ill_index = io->ipsec_out_ill_index; - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - ipst = io->ipsec_out_ns->netstack_ip; - ASSERT(io->ipsec_out_ns != NULL); - - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - if (ill == NULL && ill_index != 0) { - ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; + /* AH/ESP don't update ixa_pktlen when they modify the packet */ + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; - ill_need_rele = B_TRUE; - } - - if (CLASSD(dst)) { - boolean_t conn_dontroute; - /* - * Use the ill_index to get the right ipif. 
- */ - conn_dontroute = io->ipsec_out_dontroute; - if (ill_index == 0) - ipif = ipif_lookup_group(dst, zoneid, ipst); - else - (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); - if (ipif == NULL) { - ip1dbg(("ip_wput_ipsec_out: No ipif for" - " multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - freemsg(ipsec_mp); - goto done; - } - /* - * ipha_src has already been intialized with the - * value of the ipif in ip_wput. All we need now is - * an ire to send this downstream. - */ - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, - msg_getlabel(mp), match_flags, ipst); - if (ire != NULL) { - ill_t *ill1; - /* - * Do the multicast forwarding now, as the IPsec - * processing has been done. - */ - if (ipst->ips_ip_g_mrouter && !conn_dontroute && - (ill1 = ire_to_ill(ire))) { - if (ip_mforward(ill1, ipha, mp)) { - freemsg(ipsec_mp); - ip1dbg(("ip_wput_ipsec_out: mforward " - "failed\n")); - ire_refrele(ire); - goto done; - } - } - goto send; - } - - ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - freemsg(ipsec_mp); - goto done; - } - - /* - * We may be using a wrong ipif to create the ire. - * But it is okay as the source address is assigned - * for the packet already. Next outbound packet would - * create the IRE with the right IPIF in ip_wput. - * - * Also handle RTF_MULTIRT routes. - */ - ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, - zoneid, &zero_info); + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + pktlen = ntohs(ipha->ipha_length); } else { - if (ire_arg != NULL) { - ire = ire_arg; - ire_need_rele = B_FALSE; - } else { - ire = ire_cache_lookup(dst, zoneid, - msg_getlabel(mp), ipst); - } - if (ire != NULL) { - goto send; - } - - /* - * ire disappeared underneath. 
- * - * What we need to do here is the ip_newroute - * logic to get the ire without doing the IPsec - * processing. Follow the same old path. But this - * time, ip_wput or ire_add_then_put will call us - * directly as all the IPsec operations are done. - */ - ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - freemsg(ipsec_mp); - goto done; - } - - /* - * Since we're going through ip_newroute() again, we - * need to make sure we don't: - * - * 1.) Trigger the ASSERT() with the ipha_ident - * overloading. - * 2.) Redo transport-layer checksumming, since we've - * already done all that to get this far. - * - * The easiest way not do either of the above is to set - * the ipha_ident field to IP_HDR_INCLUDED. - */ - ipha->ipha_ident = IP_HDR_INCLUDED; - ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL), - zoneid, ipst); - } - goto done; -send: - if (ire->ire_stq == NULL) { - ill_t *out_ill; - /* - * Loopbacks go through ip_wput_local except for one case. - * We come here if we generate a icmp_frag_needed message - * after IPsec processing is over. When this function calls - * ip_wput_ire_fragmentit, ip_wput_frag might end up calling - * icmp_frag_needed. The message generated comes back here - * through icmp_frag_needed -> icmp_pkt -> ip_wput -> - * ipsec_out_process -> ip_wput_ipsec_out. We need to set the - * source address as it is usually set in ip_wput_ire. As - * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process - * and we end up here. We can't enter ip_wput_ire once the - * IPsec processing is over and hence we need to do it here. 
- */ - ASSERT(q != NULL); - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - if (ipha->ipha_src == 0) - ipha->ipha_src = ire->ire_src_addr; - - /* PFHooks: LOOPBACK_OUT */ - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha1, mblk_t *, ipsec_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); - - if (ipsec_mp != NULL) - ip_wput_local(RD(q), out_ill, - ipha, ipsec_mp, ire, 0, zoneid); - if (ire_need_rele) - ire_refrele(ire); - goto done; + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; } - if (ire->ire_max_frag < (unsigned int)LENGTH) { - /* - * We are through with IPsec processing. - * Fragment this and send it on the wire. - */ - if (io->ipsec_out_accelerated) { - /* - * The packet has been accelerated but must - * be fragmented. This should not happen - * since AH and ESP must not accelerate - * packets that need fragmentation, however - * the configuration could have changed - * since the AH or ESP processing. - * Drop packet. - * IPsec KSTATS: bump bean counter here. - */ - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " - "fragmented accelerated packet!\n")); - freemsg(ipsec_mp); - } else { - ip_wput_ire_fragmentit(ipsec_mp, ire, - zoneid, ipst, NULL); - } - if (ire_need_rele) - ire_refrele(ire); - goto done; - } - - ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " - "ipif %p\n", (void *)ipsec_mp, (void *)ire, - (void *)ire->ire_ipif, (void *)ipif)); - /* - * Multiroute the secured packet. 
+ * We release any hard reference on the SAs here to make + * sure the SAs can be garbage collected. ipsr_sa has a soft reference + * on the SAs. + * If in the future we want the hard latching of the SAs in the + * ip_xmit_attr_t then we should remove this. */ - if (ire->ire_flags & RTF_MULTIRT) { - ire_t *first_ire; - irb = ire->ire_bucket; - ASSERT(irb != NULL); - /* - * This ire has been looked up as the one that - * goes through the given ipif; - * make sure we do not omit any other multiroute ire - * that may be present in the bucket before this one. - */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - /* - * Don't change the ire if the packet must - * be fragmented if sent via this new one. - */ - if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { - IRE_REFHOLD(first_ire); - if (ire_need_rele) - ire_refrele(ire); - else - ire_need_rele = B_TRUE; - ire = first_ire; - } - } - IRB_REFRELE(irb); - - multirt_send = B_TRUE; - max_frag = ire->ire_max_frag; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; + } + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - /* - * In most cases, the emission loop below is entered only once. - * Only in the case where the ire holds the RTF_MULTIRT - * flag, we loop to process all RTF_MULTIRT ires in the - * bucket, and send the packet through all crossed - * RTF_MULTIRT routes. - */ - do { - if (multirt_send) { + /* Do we need to fragment? 
*/ + if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) || + pktlen > ixa->ixa_fragsize) { + if (ixaflags & IXAF_IS_IPV4) { + ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR)); /* - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. + * We check for the DF case in ipsec_out_process + * hence this only handles the non-DF case. */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - /* No loopback here */ - if (ire1->ire_stq == NULL) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < (unsigned int)LENGTH) { - ip_multirt_bad_mtu(ire1, max_frag); - continue; - } - - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - if (ire1 != NULL) { - /* - * We are in a multiple send case, need to - * make a copy of the packet. - */ - next_mp = copymsg(ipsec_mp); - if (next_mp == NULL) { - ire_refrele(ire1); - ire1 = NULL; - } + return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags, + pktlen, ixa->ixa_fragsize, + ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn, + &ixa->ixa_cookie)); + } else { + mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa); + if (mp == NULL) { + /* MIB and ip_drop_output already done */ + return (ENOMEM); } - } - /* - * Everything is done. Send it out on the wire - * - * ip_xmit_v4 will call ip_wput_attach_llhdr and then - * either send it on the wire or, in the case of - * HW acceleration, call ipsec_hw_putnext. 
- */ - if (ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE) { - DTRACE_PROBE2(ip__wput__ipsec__bail, - (ire_t *), ire, (mblk_t *), ipsec_mp); - /* - * If ire's link-layer is unresolved (this - * would only happen if the incomplete ire - * was added to cachetable via forwarding path) - * don't bother going to ip_xmit_v4. Just drop the - * packet. - * There is a slight risk here, in that, if we - * have the forwarding path create an incomplete - * IRE, then until the IRE is completed, any - * transmitted IPsec packets will be dropped - * instead of being queued waiting for resolution. - * - * But the likelihood of a forwarding packet and a wput - * packet sending to the same dst at the same time - * and there not yet be an ARP entry for it is small. - * Furthermore, if this actually happens, it might - * be likely that wput would generate multiple - * packets (and forwarding would also have a train - * of packets) for that destination. If this is - * the case, some of them would have been dropped - * anyway, since ARP only queues a few packets while - * waiting for resolution - * - * NOTE: We should really call ip_xmit_v4, - * and let it queue the packet and send the - * ARP query and have ARP come back thus: - * <ARP> ip_wput->ip_output->ip-wput_nondata-> - * ip_xmit_v4->ip_wput_attach_llhdr + ipsec - * hw accel work. But it's too complex to get - * the IPsec hw acceleration approach to fit - * well with ip_xmit_v4 doing ARP without - * doing IPsec simplification. For now, we just - * poke ip_xmit_v4 to trigger the arp resolve, so - * that we can continue with the send on the next - * attempt. 
- * - * XXX THis should be revisited, when - * the IPsec/IP interaction is cleaned up - */ - ip1dbg(("ip_wput_ipsec_out: ire is incomplete" - " - dropping packet\n")); - freemsg(ipsec_mp); - /* - * Call ip_xmit_v4() to trigger ARP query - * in case the nce_state is ND_INITIAL - */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - goto drop_pkt; - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1, - mblk_t *, ipsec_mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, - ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp); - if (ipsec_mp == NULL) - goto drop_pkt; - - ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); - pktxmit_state = ip_xmit_v4(mp, ire, - (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL); - - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { - - freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */ -drop_pkt: - BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib, - ipIfStatsOutDiscards); - if (ire_need_rele) - ire_refrele(ire); - if (ire1 != NULL) { - ire_refrele(ire1); - freemsg(next_mp); + pktlen += sizeof (ip6_frag_t); + if (pktlen > ixa->ixa_fragsize) { + return (ip_fragment_v6(mp, ixa->ixa_nce, + ixa->ixa_flags, pktlen, + ixa->ixa_fragsize, ixa->ixa_xmit_hint, + ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, + ixa->ixa_postfragfn, &ixa->ixa_cookie)); } - goto done; } - - freeb(ipsec_mp); - if (ire_need_rele) - ire_refrele(ire); - - if (ire1 != NULL) { - ire = ire1; - ire_need_rele = B_TRUE; - ASSERT(next_mp); - ipsec_mp = next_mp; - mp = ipsec_mp->b_cont; - ire1 = NULL; - next_mp = NULL; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - } else { - multirt_send = B_FALSE; - } - } while (multirt_send); -done: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - if (ipif != NULL) - ipif_refrele(ipif); + } + return ((ixa->ixa_postfragfn)(mp, 
ixa->ixa_nce, ixa->ixa_flags, + pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, NULL)); } /* - * Get the ill corresponding to the specified ire, and compare its - * capabilities with the protocol and algorithms specified by the - * the SA obtained from ipsec_out. If they match, annotate the - * ipsec_out structure to indicate that the packet needs acceleration. - * - * - * A packet is eligible for outbound hardware acceleration if the - * following conditions are satisfied: - * - * 1. the packet will not be fragmented - * 2. the provider supports the algorithm - * 3. there is no pending control message being exchanged - * 4. snoop is not attached - * 5. the destination address is not a broadcast or multicast address. - * - * Rationale: - * - Hardware drivers do not support fragmentation with - * the current interface. - * - snoop, multicast, and broadcast may result in exposure of - * a cleartext datagram. - * We check all five of these conditions here. + * Finish the inbound IPsec processing. This function is called from + * ipsec_out_process() if the IPsec packet was processed + * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed + * asynchronously. * - * XXX would like to nuke "ire_t *" parameter here; problem is that - * IRE is only way to figure out if a v4 address is a broadcast and - * thus ineligible for acceleration... + * This is common to IPv4 and IPv6. */ -static void -ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) +void +ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira) { - ipsec_out_t *io; - mblk_t *data_mp; - uint_t plen, overhead; - ip_stack_t *ipst; - phyint_t *phyint; - - if ((sa->ipsa_flags & IPSA_F_HW) == 0) - return; - - if (ill == NULL) - return; - ipst = ill->ill_ipst; - phyint = ill->ill_phyint; - - /* - * Destination address is a broadcast or multicast. Punt. 
- */ - if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| - IRE_LOCAL))) - return; - - data_mp = ipsec_mp->b_cont; + iaflags_t iraflags = ira->ira_flags; - if (ill->ill_isv6) { - ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; + /* Length might have changed */ + if (iraflags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) - return; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + ira->ira_pktlen = ntohs(ipha->ipha_length); + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + ira->ira_protocol = ipha->ipha_protocol; - plen = ip6h->ip6_plen; + ip_fanout_v4(mp, ipha, ira); } else { - ipha_t *ipha = (ipha_t *)data_mp->b_rptr; - - if (CLASSD(ipha->ipha_dst)) + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint8_t *nexthdrp; + + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); + ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length, + &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill); + freemsg(mp); return; - - plen = ipha->ipha_length; - } - /* - * Is there a pending DLPI control message being exchanged - * between IP/IPsec and the DLS Provider? If there is, it - * could be a SADB update, and the state of the DLS Provider - * SADB might not be in sync with the SADB maintained by - * IPsec. To avoid dropping packets or using the wrong keying - * material, we do not accelerate this packet. - */ - if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " - "ill_dlpi_pending! don't accelerate packet\n")); - return; - } - - /* - * Is the Provider in promiscous mode? If it does, we don't - * accelerate the packet since it will bounce back up to the - * listeners in the clear. 
- */ - if (phyint->phyint_flags & PHYI_PROMISC) { - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " - "ill in promiscous mode, don't accelerate packet\n")); - return; - } - - /* - * Will the packet require fragmentation? - */ - - /* - * IPsec ESP note: this is a pessimistic estimate, but the same - * as is used elsewhere. - * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) - * + 2-byte trailer - */ - overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : - IPSEC_BASE_ESP_HDR_SIZE(sa); - - if ((plen + overhead) > ill->ill_max_mtu) - return; - - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - /* - * Can the ill accelerate this IPsec protocol and algorithm - * specified by the SA? - */ - if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, - ill->ill_isv6, sa, ipst->ips_netstack)) { - return; + } + ira->ira_protocol = *nexthdrp; + ip_fanout_v6(mp, ip6h, ira); } - - /* - * Tell AH or ESP that the outbound ill is capable of - * accelerating this packet. - */ - io->ipsec_out_is_capab_ill = B_TRUE; } /* * Select which AH & ESP SA's to use (if any) for the outbound packet. * * If this function returns B_TRUE, the requested SA's have been filled - * into the ipsec_out_*_sa pointers. + * into the ixa_ipsec_*_sa pointers. * * If the function returns B_FALSE, the packet has been "consumed", most * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. * * The SA references created by the protocol-specific "select" - * function will be released when the ipsec_mp is freed, thanks to the - * ipsec_out_free destructor -- see spd.c. + * function will be released in ip_output_post_ipsec. 
*/ static boolean_t -ipsec_out_select_sa(mblk_t *ipsec_mp) +ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa) { boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; - ipsec_out_t *io; ipsec_policy_t *pp; ipsec_action_t *ap; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - if (!io->ipsec_out_secure) { - /* - * We came here by mistake. - * Don't bother with ipsec processing - * We should "discourage" this path in the future. - */ - ASSERT(io->ipsec_out_proc_begin == B_FALSE); - return (B_FALSE); - } - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); + ASSERT((ixa->ixa_ipsec_policy != NULL) || + (ixa->ixa_ipsec_action != NULL)); - ASSERT(io->ipsec_out_failed == B_FALSE); - - /* - * IPsec processing has started. - */ - io->ipsec_out_proc_begin = B_TRUE; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - pp = io->ipsec_out_policy; + pp = ixa->ixa_ipsec_policy; ASSERT(pp != NULL); ap = pp->ipsp_act; ASSERT(ap != NULL); @@ -26438,22 +12355,23 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) /* * We have an action. now, let's select SA's. - * (In the future, we can cache this in the conn_t..) + * A side effect of setting ixa_ipsec_*_sa is that it will + * be cached in the conn_t. 
*/ if (ap->ipa_want_esp) { - if (io->ipsec_out_esp_sa == NULL) { - need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, + if (ixa->ixa_ipsec_esp_sa == NULL) { + need_esp_acquire = !ipsec_outbound_sa(mp, ixa, IPPROTO_ESP); } - ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); + ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL); } if (ap->ipa_want_ah) { - if (io->ipsec_out_ah_sa == NULL) { - need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, + if (ixa->ixa_ipsec_ah_sa == NULL) { + need_ah_acquire = !ipsec_outbound_sa(mp, ixa, IPPROTO_AH); } - ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); + ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL); /* * The ESP and AH processing order needs to be preserved * when both protocols are required (ESP should be applied @@ -26471,16 +12389,16 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) * acquire _all_ of the SAs we need. */ if (need_ah_acquire || need_esp_acquire) { - if (io->ipsec_out_ah_sa != NULL) { - IPSA_REFRELE(io->ipsec_out_ah_sa); - io->ipsec_out_ah_sa = NULL; + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - if (io->ipsec_out_esp_sa != NULL) { - IPSA_REFRELE(io->ipsec_out_esp_sa); - io->ipsec_out_esp_sa = NULL; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; } - sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); + sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire); return (B_FALSE); } @@ -26488,110 +12406,64 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) } /* - * Process an IPSEC_OUT message and see what you can - * do with it. - * IPQoS Notes: - * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for - * IPsec. - * XXX would like to nuke ire_t. - * XXX ill_index better be "real" + * Handle IPsec output processing. + * This function is only entered once for a given packet. 
+ * We try to do things synchronously, but if we need to have user-level + * set up SAs, or ESP or AH uses asynchronous kEF, then the operation + * will be completed + * - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish + * - when asynchronous ESP is done it will do AH + * + * In all cases we come back in ip_output_post_ipsec() to fragment and + * send out the packet. */ -void -ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) +int +ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa) { - ipsec_out_t *io; - ipsec_policy_t *pp; - ipsec_action_t *ap; - ipha_t *ipha; - ip6_t *ip6h; - mblk_t *mp; - ill_t *ill; - zoneid_t zoneid; - ipsec_status_t ipsec_rc; - boolean_t ill_need_rele = B_FALSE; - ip_stack_t *ipst; + ill_t *ill = ixa->ixa_nce->nce_ill; + ip_stack_t *ipst = ixa->ixa_ipst; ipsec_stack_t *ipss; + ipsec_policy_t *pp; + ipsec_action_t *ap; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - ipst = io->ipsec_out_ns->netstack_ip; - mp = ipsec_mp->b_cont; - - /* - * Initiate IPPF processing. We do it here to account for packets - * coming here that don't have any policy (i.e. !io->ipsec_out_secure). - * We can check for ipsec_out_proc_begin even for such packets, as - * they will always be false (asserted below). - */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { - ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? - io->ipsec_out_ill_index : ill_index); - if (mp == NULL) { - ip2dbg(("ipsec_out_process: packet dropped "\ - "during IPPF processing\n")); - freeb(ipsec_mp); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - return; - } - } + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); - if (!io->ipsec_out_secure) { - /* - * We came here by mistake. - * Don't bother with ipsec processing - * Should "discourage" this path in the future. 
- */ - ASSERT(io->ipsec_out_proc_begin == B_FALSE); - goto done; - } - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - ASSERT(io->ipsec_out_failed == B_FALSE); + ASSERT((ixa->ixa_ipsec_policy != NULL) || + (ixa->ixa_ipsec_action != NULL)); ipss = ipst->ips_netstack->netstack_ipsec; if (!ipsec_loaded(ipss)) { - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_packet(mp, B_TRUE, ill, DROPPER(ipss, ipds_ip_ipsec_not_loaded), &ipss->ipsec_dropper); - return; + return (ENOTSUP); } - /* - * IPsec processing has started. - */ - io->ipsec_out_proc_begin = B_TRUE; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - pp = io->ipsec_out_policy; + pp = ixa->ixa_ipsec_policy; ASSERT(pp != NULL); ap = pp->ipsp_act; ASSERT(ap != NULL); } - /* - * Save the outbound ill index. When the packet comes back - * from IPsec, we make sure the ill hasn't changed or disappeared - * before sending it the accelerated packet. - */ - if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { - ill = ire_to_ill(ire); - io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; + /* Handle explicit drop action and bypass. */ + switch (ap->ipa_act.ipa_type) { + case IPSEC_ACT_DISCARD: + case IPSEC_ACT_REJECT: + ip_drop_packet(mp, B_FALSE, ill, + DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); + return (EHOSTUNREACH); /* IPsec policy failure */ + case IPSEC_ACT_BYPASS: + return (ip_output_post_ipsec(mp, ixa)); } /* * The order of processing is first insert a IP header if needed. * Then insert the ESP header and then the AH header. 
*/ - if ((io->ipsec_out_se_done == B_FALSE) && - (ap->ipa_want_se)) { + if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) { /* * First get the outer IP header before sending * it to ESP. @@ -26600,19 +12472,16 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) mblk_t *outer_mp, *inner_mp; if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { - (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, + (void) mi_strlog(ill->ill_rq, 0, + SL_ERROR|SL_TRACE|SL_CONSOLE, "ipsec_out_process: " "Self-Encapsulation failed: Out of memory\n"); - freemsg(ipsec_mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - return; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (ENOBUFS); } - inner_mp = ipsec_mp->b_cont; + inner_mp = mp; ASSERT(inner_mp->b_datap->db_type == M_DATA); oipha = (ipha_t *)outer_mp->b_rptr; iipha = (ipha_t *)inner_mp->b_rptr; @@ -26626,139 +12495,51 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) oipha->ipha_hdr_checksum = 0; oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); outer_mp->b_cont = inner_mp; - ipsec_mp->b_cont = outer_mp; + mp = outer_mp; - io->ipsec_out_se_done = B_TRUE; - io->ipsec_out_tunnel = B_TRUE; + ixa->ixa_flags |= IXAF_IPSEC_TUNNEL; } - if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || - (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && - !ipsec_out_select_sa(ipsec_mp)) - return; + /* If we need to wait for a SA then we can't return any errno */ + if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) || + (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) && + !ipsec_out_select_sa(mp, ixa)) + return (0); /* * By now, we know what SA's to use. Toss over to ESP & AH * to do the heavy lifting. 
*/ - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { - ASSERT(io->ipsec_out_esp_sa != NULL); - io->ipsec_out_esp_done = B_TRUE; - /* - * Note that since hw accel can only apply one transform, - * not two, we skip hw accel for ESP if we also have AH - * This is an design limitation of the interface - * which should be revisited. - */ - ASSERT(ire != NULL); - if (io->ipsec_out_ah_sa == NULL) { - ill = (ill_t *)ire->ire_stq->q_ptr; - ipsec_out_is_accelerated(ipsec_mp, - io->ipsec_out_esp_sa, ill, ire); - } + if (ap->ipa_want_esp) { + ASSERT(ixa->ixa_ipsec_esp_sa != NULL); - ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; + mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. 
+ */ + return (0); } } - if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { - ASSERT(io->ipsec_out_ah_sa != NULL); - io->ipsec_out_ah_done = B_TRUE; - if (ire == NULL) { - int idx = io->ipsec_out_capab_ill_index; - ill = ill_lookup_on_ifindex(idx, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - ill_need_rele = B_TRUE; - } else { - ill = (ill_t *)ire->ire_stq->q_ptr; - } - ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, - ire); + if (ap->ipa_want_ah) { + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); - ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - return; + mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return (0); } } /* - * We are done with IPsec processing. Send it over the wire. - */ -done: - mp = ipsec_mp->b_cont; - ipha = (ipha_t *)mp->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, - ire); - } else { - ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, - ire); - } - if (ill != NULL && ill_need_rele) - ill_refrele(ill); -} - -/* ARGSUSED */ -void -ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) -{ - opt_restart_t *or; - int err; - conn_t *connp; - cred_t *cr; - - ASSERT(CONN_Q(q)); - connp = Q_TO_CONN(q); - - ASSERT(first_mp->b_datap->db_type == M_CTL); - or = (opt_restart_t *)first_mp->b_rptr; - /* - * We checked for a db_credp the first time svr4_optcom_req - * was called (from ip_wput_nondata). So we can just ASSERT here. 
+ * We are done with IPsec processing. Send it over + * the wire. */ - cr = msg_getcred(first_mp, NULL); - ASSERT(cr != NULL); - - if (or->or_type == T_SVR4_OPTMGMT_REQ) { - err = svr4_optcom_req(q, first_mp, cr, - &ip_opt_obj, B_FALSE); - } else { - ASSERT(or->or_type == T_OPTMGMT_REQ); - err = tpi_optcom_req(q, first_mp, cr, - &ip_opt_obj, B_FALSE); - } - if (err != EINPROGRESS) { - /* operation is done */ - CONN_OPER_PENDING_DONE(connp); - } + return (ip_output_post_ipsec(mp, ixa)); } /* @@ -26811,6 +12592,11 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin, q, mp, ipip, mp1->b_rptr); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish", + int, ipip->ipi_cmd, + ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill, + ipif_t *, ipsq->ipsq_xop->ipx_current_ipif); + ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); } @@ -26865,12 +12651,16 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ if (ipip->ipi_cmd == SIOCLIFADDIF) { err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } ci.ci_ipif = NULL; - if (ipip->ipi_cmd_type == MISC_CMD) { + switch (ipip->ipi_cmd_type) { + case MISC_CMD: + case MSFILT_CMD: /* * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 
*/ @@ -26883,28 +12673,29 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ci.ci_sin = NULL; ci.ci_sin6 = NULL; ci.ci_lifr = NULL; - } else { - switch (ipip->ipi_cmd_type) { - case IF_CMD: - case LIF_CMD: - extract_funcp = ip_extract_lifreq; - break; + extract_funcp = NULL; + break; - case ARP_CMD: - case XARP_CMD: - extract_funcp = ip_extract_arpreq; - break; + case IF_CMD: + case LIF_CMD: + extract_funcp = ip_extract_lifreq; + break; - case MSFILT_CMD: - extract_funcp = ip_extract_msfilter; - break; + case ARP_CMD: + case XARP_CMD: + extract_funcp = ip_extract_arpreq; + break; - default: - ASSERT(0); - } + default: + ASSERT(0); + } - err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); + if (extract_funcp != NULL) { + err = (*extract_funcp)(q, mp, ipip, &ci); if (err != 0) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish err", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } @@ -26923,8 +12714,17 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); - if (ci.ci_ipif != NULL) + if (ci.ci_ipif != NULL) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish RD", + int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill, + ipif_t *, ci.ci_ipif); ipif_refrele(ci.ci_ipif); + } else { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish RD", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); + } ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } @@ -26932,7 +12732,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ASSERT(ci.ci_ipif != NULL); /* - * If ipsq is non-NULL, we are already being called exclusively. 
+ * If ipsq is non-NULL, we are already being called exclusively */ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); if (ipsq == NULL) { @@ -26944,7 +12744,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) } entered_ipsq = B_TRUE; } - /* * Release the ipif so that ipif_down and friends that wait for * references to go away are not misled about the current ipif_refcnt @@ -26962,6 +12761,10 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", + int, ipip->ipi_cmd, + ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, + ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) @@ -27012,31 +12815,21 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) ipsq_current_finish(ipsq); } -/* Called from ip_wput for all non data messages */ -/* ARGSUSED */ +/* Handles all non data messages */ void -ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +ip_wput_nondata(queue_t *q, mblk_t *mp) { mblk_t *mp1; - ire_t *ire, *fake_ire; - ill_t *ill; struct iocblk *iocp; ip_ioctl_cmd_t *ipip; - cred_t *cr; conn_t *connp; - int err; - nce_t *nce; - ipif_t *ipif; - ip_stack_t *ipst; + cred_t *cr; char *proto_str; - if (CONN_Q(q)) { + if (CONN_Q(q)) connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - } else { + else connp = NULL; - ipst = ILLQ_TO_IPST(q); - } switch (DB_TYPE(mp)) { case M_IOCTL: @@ -27064,17 +12857,10 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { /* - * the ioctl is one we recognise, but is not - * consumed by IP as a module, pass M_IOCDATA - * for processing downstream, but only for - * common Streams ioctls. 
+ * The ioctl is one we recognise, but is not consumed + * by IP as a module and we are a module, so we drop */ - if (ipip->ipi_flags & IPI_PASS_DOWN) { - putnext(q, mp); - return; - } else { - goto nak; - } + goto nak; } /* IOCTL continuation following copyin or copyout. */ @@ -27110,8 +12896,8 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) /* * Refhold the conn, till the ioctl completes. This is * needed in case the ioctl ends up in the pending mp - * list. Every mp in the ill_pending_mp list and - * the ipx_pending_mp must have a refhold on the conn + * list. Every mp in the ipx_pending_mp list + * must have a refhold on the conn * to resume processing. The refhold is released when * the ioctl completes. (normally or abnormally) * In all cases ip_ioctl_finish is called to finish @@ -27119,7 +12905,6 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ if (connp != NULL) { /* This is not a reentry */ - ASSERT(ipsq == NULL); CONN_INC_REF(connp); } else { if (!(ipip->ipi_flags & IPI_MODOK)) { @@ -27128,18 +12913,12 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - ip_process_ioctl(ipsq, q, mp, ipip); + ip_process_ioctl(NULL, q, mp, ipip); } else { mi_copyout(q, mp); } return; -nak: - iocp->ioc_error = EINVAL; - mp->b_datap->db_type = M_IOCNAK; - iocp->ioc_count = 0; - qreply(q, mp); - return; case M_IOCNAK: /* @@ -27147,35 +12926,13 @@ nak: * an IOCTL we sent it. This shouldn't happen. */ (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", + "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x", ((struct iocblk *)mp->b_rptr)->ioc_cmd); freemsg(mp); return; case M_IOCACK: /* /dev/ip shouldn't see this */ - if (CONN_Q(q)) - goto nak; - - /* - * Finish socket ioctls passed through to ARP. We use the - * ioc_cmd values we set in ip_sioctl_arp() to decide whether - * we need to become writer before calling ip_sioctl_iocack(). 
- * Note that qwriter_ip() will release the refhold, and that a - * refhold is OK without ILL_CAN_LOOKUP() since we're on the - * ill stream. - */ - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { - ip_sioctl_iocack(NULL, q, mp, NULL); - return; - } - - ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || - iocp->ioc_cmd == AR_ENTRY_ADD); - ill = q->q_ptr; - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); - return; + goto nak; case M_FLUSH: if (*mp->b_rptr & FLUSHW) flushq(q, FLUSHALL); @@ -27190,117 +12947,17 @@ nak: } freemsg(mp); return; - case IRE_DB_REQ_TYPE: - if (connp == NULL) { - proto_str = "IRE_DB_REQ_TYPE"; - goto protonak; - } - /* An Upper Level Protocol wants a copy of an IRE. */ - ip_ire_req(q, mp); - return; case M_CTL: - if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) - break; - - /* M_CTL messages are used by ARP to tell us things. */ - if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) - break; - switch (((arc_t *)mp->b_rptr)->arc_cmd) { - case AR_ENTRY_SQUERY: - putnext(q, mp); - return; - case AR_CLIENT_NOTIFY: - ip_arp_news(q, mp); - return; - case AR_DLPIOP_DONE: - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - /* qwriter_ip releases the refhold */ - /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); - return; - case AR_ARP_CLOSING: - /* - * ARP (above us) is closing. If no ARP bringup is - * currently pending, ack the message so that ARP - * can complete its close. Also mark ill_arp_closing - * so that new ARP bringups will fail. If any - * ARP bringup is currently in progress, we will - * ack this when the current ARP bringup completes. 
- */ - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - mutex_enter(&ill->ill_lock); - ill->ill_arp_closing = 1; - if (!ill->ill_arp_bringup_pending) { - mutex_exit(&ill->ill_lock); - qreply(q, mp); - } else { - mutex_exit(&ill->ill_lock); - freemsg(mp); - } - return; - case AR_ARP_EXTEND: - /* - * The ARP module above us is capable of duplicate - * address detection. Old ATM drivers will not send - * this message. - */ - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - ill->ill_arp_extend = B_TRUE; - freemsg(mp); - return; - default: - break; - } break; case M_PROTO: case M_PCPROTO: /* - * The only PROTO messages we expect are copies of option - * negotiation acknowledgements, AH and ESP bind requests - * are also expected. + * The only PROTO messages we expect are SNMP-related. */ switch (((union T_primitives *)mp->b_rptr)->type) { - case O_T_BIND_REQ: - case T_BIND_REQ: { - /* Request can get queued in bind */ - if (connp == NULL) { - proto_str = "O_T_BIND_REQ/T_BIND_REQ"; - goto protonak; - } - /* - * The transports except SCTP call ip_bind_{v4,v6}() - * directly instead of a a putnext. SCTP doesn't - * generate any T_BIND_REQ since it has its own - * fanout data structures. However, ESP and AH - * come in for regular binds; all other cases are - * bind retries. - */ - ASSERT(!IPCL_IS_SCTP(connp)); - - /* Don't increment refcnt if this is a re-entry */ - if (ipsq == NULL) - CONN_INC_REF(connp); - - mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, - connp, NULL) : ip_bind_v4(q, mp, connp); - ASSERT(mp != NULL); - - ASSERT(!IPCL_IS_TCP(connp)); - ASSERT(!IPCL_IS_UDP(connp)); - ASSERT(!IPCL_IS_RAWIP(connp)); - ASSERT(!IPCL_IS_IPTUN(connp)); - - /* The case of AH and ESP */ - qreply(q, mp); - CONN_OPER_PENDING_DONE(connp); - return; - } case T_SVR4_OPTMGMT_REQ: - ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", + ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ " + "flags %x\n", ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); if (connp == NULL) { @@ -27324,460 +12981,17 @@ nak: return; } - if (!snmpcom_req(q, mp, ip_snmp_set, - ip_snmp_get, cr)) { - /* - * Call svr4_optcom_req so that it can - * generate the ack. We don't come here - * if this operation is being restarted. - * ip_restart_optmgmt will drop the conn ref. - * In the case of ipsec option after the ipsec - * load is complete conn_restart_ipsec_waiter - * drops the conn ref. - */ - ASSERT(ipsq == NULL); - CONN_INC_REF(connp); - if (ip_check_for_ipsec_opt(q, mp)) - return; - err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, - B_FALSE); - if (err != EINPROGRESS) { - /* Operation is done */ - CONN_OPER_PENDING_DONE(connp); - } - } - return; - case T_OPTMGMT_REQ: - ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); - /* - * Note: No snmpcom_req support through new - * T_OPTMGMT_REQ. - * Call tpi_optcom_req so that it can - * generate the ack. - */ - if (connp == NULL) { - proto_str = "T_OPTMGMT_REQ"; - goto protonak; - } - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); - if (mp != NULL) - qreply(q, mp); - return; - } - ASSERT(ipsq == NULL); - /* - * We don't come here for restart. ip_restart_optmgmt - * will drop the conn ref. 
In the case of ipsec option - * after the ipsec load is complete - * conn_restart_ipsec_waiter drops the conn ref. - */ - CONN_INC_REF(connp); - if (ip_check_for_ipsec_opt(q, mp)) - return; - err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); - if (err != EINPROGRESS) { - /* Operation is done */ - CONN_OPER_PENDING_DONE(connp); - } - return; - case T_UNBIND_REQ: - if (connp == NULL) { - proto_str = "T_UNBIND_REQ"; + if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) { + proto_str = "Bad SNMPCOM request?"; goto protonak; } - ip_unbind(Q_TO_CONN(q)); - mp = mi_tpi_ok_ack_alloc(mp); - qreply(q, mp); return; default: - /* - * Have to drop any DLPI messages coming down from - * arp (such as an info_req which would cause ip - * to receive an extra info_ack if it was passed - * through. - */ - ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", + ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n", (int)*(uint_t *)mp->b_rptr)); freemsg(mp); return; } - /* NOTREACHED */ - case IRE_DB_TYPE: { - nce_t *nce; - ill_t *ill; - in6_addr_t gw_addr_v6; - - /* - * This is a response back from a resolver. It - * consists of a message chain containing: - * IRE_MBLK-->LL_HDR_MBLK->pkt - * The IRE_MBLK is the one we allocated in ip_newroute. - * The LL_HDR_MBLK is the DLPI header to use to get - * the attached packet, and subsequent ones for the - * same destination, transmitted. - */ - if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ - break; - /* - * First, check to make sure the resolution succeeded. - * If it failed, the second mblk will be empty. - * If it is, free the chain, dropping the packet. - * (We must ire_delete the ire; that frees the ire mblk) - * We're doing this now to support PVCs for ATM; it's - * a partial xresolv implementation. When we fully implement - * xresolv interfaces, instead of freeing everything here - * we'll initiate neighbor discovery. 
- * - * For v4 (ARP and other external resolvers) the resolver - * frees the message, so no check is needed. This check - * is required, though, for a full xresolve implementation. - * Including this code here now both shows how external - * resolvers can NACK a resolution request using an - * existing design that has no specific provisions for NACKs, - * and also takes into account that the current non-ARP - * external resolver has been coded to use this method of - * NACKing for all IPv6 (xresolv) cases, - * whether our xresolv implementation is complete or not. - * - */ - ire = (ire_t *)mp->b_rptr; - ill = ire_to_ill(ire); - mp1 = mp->b_cont; /* dl_unitdata_req */ - if (mp1->b_rptr == mp1->b_wptr) { - if (ire->ire_ipversion == IPV6_VERSION) { - /* - * XRESOLV interface. - */ - ASSERT(ill->ill_flags & ILLF_XRESOLV); - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, B_FALSE, - &ire->ire_addr_v6, B_FALSE); - } else { - nce = ndp_lookup_v6(ill, B_FALSE, - &gw_addr_v6, B_FALSE); - } - if (nce != NULL) { - nce_resolv_failed(nce); - ndp_delete(nce); - NCE_REFRELE(nce); - } - } - mp->b_cont = NULL; - freemsg(mp1); /* frees the pkt as well */ - ASSERT(ire->ire_nce == NULL); - ire_delete((ire_t *)mp->b_rptr); - return; - } - - /* - * Split them into IRE_MBLK and pkt and feed it into - * ire_add_then_send. Then in ire_add_then_send - * the IRE will be added, and then the packet will be - * run back through ip_wput. This time it will make - * it to the wire. - */ - mp->b_cont = NULL; - mp = mp1->b_cont; /* now, mp points to pkt */ - mp1->b_cont = NULL; - ip1dbg(("ip_wput_nondata: reply from external resolver \n")); - if (ire->ire_ipversion == IPV6_VERSION) { - /* - * XRESOLV interface. 
Find the nce and put a copy - * of the dl_unitdata_req in nce_res_mp - */ - ASSERT(ill->ill_flags & ILLF_XRESOLV); - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, B_FALSE, - &ire->ire_addr_v6, B_FALSE); - } else { - nce = ndp_lookup_v6(ill, B_FALSE, - &gw_addr_v6, B_FALSE); - } - if (nce != NULL) { - /* - * We have to protect nce_res_mp here - * from being accessed by other threads - * while we change the mblk pointer. - * Other functions will also lock the nce when - * accessing nce_res_mp. - * - * The reason we change the mblk pointer - * here rather than copying the resolved address - * into the template is that, unlike with - * ethernet, we have no guarantee that the - * resolved address length will be - * smaller than or equal to the lla length - * with which the template was allocated, - * (for ethernet, they're equal) - * so we have to use the actual resolved - * address mblk - which holds the real - * dl_unitdata_req with the resolved address. - * - * Doing this is the same behavior as was - * previously used in the v4 ARP case. - */ - mutex_enter(&nce->nce_lock); - if (nce->nce_res_mp != NULL) - freemsg(nce->nce_res_mp); - nce->nce_res_mp = mp1; - mutex_exit(&nce->nce_lock); - /* - * We do a fastpath probe here because - * we have resolved the address without - * using Neighbor Discovery. - * In the non-XRESOLV v6 case, the fastpath - * probe is done right after neighbor - * discovery completes. - */ - if (nce->nce_res_mp != NULL) { - int res; - nce_fastpath_list_add(nce); - res = ill_fastpath_probe(ill, - nce->nce_res_mp); - if (res != 0 && res != EAGAIN) - nce_fastpath_list_delete(nce); - } - - ire_add_then_send(q, ire, mp); - /* - * Now we have to clean out any packets - * that may have been queued on the nce - * while it was waiting for address resolution - * to complete. 
- */ - mutex_enter(&nce->nce_lock); - mp1 = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - while (mp1 != NULL) { - mblk_t *nxt_mp; - queue_t *fwdq = NULL; - ill_t *inbound_ill; - uint_t ifindex; - - nxt_mp = mp1->b_next; - mp1->b_next = NULL; - /* - * Retrieve ifindex stored in - * ip_rput_data_v6() - */ - ifindex = - (uint_t)(uintptr_t)mp1->b_prev; - inbound_ill = - ill_lookup_on_ifindex(ifindex, - B_TRUE, NULL, NULL, NULL, - NULL, ipst); - mp1->b_prev = NULL; - if (inbound_ill != NULL) - fwdq = inbound_ill->ill_rq; - - if (fwdq != NULL) { - put(fwdq, mp1); - ill_refrele(inbound_ill); - } else - put(WR(ill->ill_rq), mp1); - mp1 = nxt_mp; - } - NCE_REFRELE(nce); - } else { /* nce is NULL; clean up */ - ire_delete(ire); - freemsg(mp); - freemsg(mp1); - return; - } - } else { - nce_t *arpce; - /* - * Link layer resolution succeeded. Recompute the - * ire_nce. - */ - ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); - if ((arpce = ndp_lookup_v4(ill, - (ire->ire_gateway_addr != INADDR_ANY ? - &ire->ire_gateway_addr : &ire->ire_addr), - B_FALSE)) == NULL) { - freeb(ire->ire_mp); - freeb(mp1); - freemsg(mp); - return; - } - mutex_enter(&arpce->nce_lock); - arpce->nce_last = TICK_TO_MSEC(lbolt64); - if (arpce->nce_state == ND_REACHABLE) { - /* - * Someone resolved this before us; - * cleanup the res_mp. Since ire has - * not been added yet, the call to ire_add_v4 - * from ire_add_then_send (when a dup is - * detected) will clean up the ire. - */ - freeb(mp1); - } else { - ASSERT(arpce->nce_res_mp == NULL); - arpce->nce_res_mp = mp1; - arpce->nce_state = ND_REACHABLE; - } - mutex_exit(&arpce->nce_lock); - if (ire->ire_marks & IRE_MARK_NOADD) { - /* - * this ire will not be added to the ire - * cache table, so we can set the ire_nce - * here, as there are no atomicity constraints. 
- */ - ire->ire_nce = arpce; - /* - * We are associating this nce with the ire - * so change the nce ref taken in - * ndp_lookup_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } else { - NCE_REFRELE(arpce); - } - ire_add_then_send(q, ire, mp); - } - return; /* All is well, the packet has been sent. */ - } - case IRE_ARPRESOLVE_TYPE: { - - if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ - break; - mp1 = mp->b_cont; /* dl_unitdata_req */ - mp->b_cont = NULL; - /* - * First, check to make sure the resolution succeeded. - * If it failed, the second mblk will be empty. - */ - if (mp1->b_rptr == mp1->b_wptr) { - /* cleanup the incomplete ire, free queued packets */ - freemsg(mp); /* fake ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - /* - * Update any incomplete nce_t found. We search the ctable - * and find the nce from the ire->ire_nce because we need - * to pass the ire to ip_xmit_v4 later, and can find both - * ire and nce in one lookup. - */ - fake_ire = (ire_t *)mp->b_rptr; - - /* - * By the time we come back here from ARP the logical outgoing - * interface of the incomplete ire we added in ire_forward() - * could have disappeared, causing the incomplete ire to also - * disappear. So we need to retreive the proper ipif for the - * ire before looking in ctable. In the case of IPMP, the - * ipif may be on the IPMP ill, so look it up based on the - * ire_ipif_ifindex we stashed back in ire_init_common(). - * Then, we can verify that ire_ipif_seqid still exists. 
- */ - ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip1dbg(("ill for incomplete ire vanished\n")); - freemsg(mp); /* fake ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - /* Get the outgoing ipif */ - mutex_enter(&ill->ill_lock); - ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); - if (ipif == NULL) { - mutex_exit(&ill->ill_lock); - ill_refrele(ill); - ip1dbg(("logical intrf to incomplete ire vanished\n")); - freemsg(mp); /* fake_ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - ill_refrele(ill); - ire = ire_arpresolve_lookup(fake_ire->ire_addr, - fake_ire->ire_gateway_addr, ipif, fake_ire->ire_zoneid, - ipst, ((ill_t *)q->q_ptr)->ill_wq); - ipif_refrele(ipif); - if (ire == NULL) { - /* - * no ire was found; check if there is an nce - * for this lookup; if it has no ire's pointing at it - * cleanup. - */ - if ((nce = ndp_lookup_v4(q->q_ptr, - (fake_ire->ire_gateway_addr != INADDR_ANY ? - &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), - B_FALSE)) != NULL) { - /* - * cleanup: - * We check for refcnt 2 (one for the nce - * hash list + 1 for the ref taken by - * ndp_lookup_v4) to check that there are - * no ire's pointing at the nce. - */ - if (nce->nce_refcnt == 2) - ndp_delete(nce); - NCE_REFRELE(nce); - } - freeb(mp1); /* dl_unitdata response */ - freemsg(mp); /* fake ire */ - return; - } - - nce = ire->ire_nce; - DTRACE_PROBE2(ire__arpresolve__type, - ire_t *, ire, nce_t *, nce); - mutex_enter(&nce->nce_lock); - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state == ND_REACHABLE) { - /* - * Someone resolved this before us; - * our response is not needed any more. 
- */ - mutex_exit(&nce->nce_lock); - freeb(mp1); /* dl_unitdata response */ - } else { - ASSERT(nce->nce_res_mp == NULL); - nce->nce_res_mp = mp1; - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - } - /* - * The cached nce_t has been updated to be reachable; - * Clear the IRE_MARK_UNCACHED flag and free the fake_ire. - */ - fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; - freemsg(mp); - /* - * send out queued packets. - */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - - IRE_REFRELE(ire); - return; - } default: break; } @@ -27787,6 +13001,13 @@ nak: freemsg(mp); return; +nak: + iocp->ioc_error = EINVAL; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); + return; + protonak: cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) @@ -27794,14 +13015,15 @@ protonak: } /* - * Process IP options in an outbound packet. Modify the destination if there - * is a source route option. + * Process IP options in an outbound packet. Verify that the nexthop in a + * strict source route is onlink. * Returns non-zero if something fails in which case an ICMP error has been * sent and mp freed. + * + * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst. 
*/ -static int -ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +int +ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill) { ipoptp_t opts; uchar_t *opt; @@ -27809,14 +13031,11 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, uint8_t optlen; ipaddr_t dst; intptr_t code = 0; - mblk_t *mp; - ire_t *ire = NULL; + ire_t *ire; + ip_stack_t *ipst = ixa->ixa_ipst; + ip_recv_attr_t iras; - ip2dbg(("ip_wput_options\n")); - mp = ipsec_mp; - if (mctl_present) { - mp = ipsec_mp->b_cont; - } + ip2dbg(("ip_output_options\n")); dst = ipha->ipha_dst; for (optval = ipoptp_first(&opts, ipha); @@ -27824,7 +13043,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, optval = ipoptp_next(&opts)) { opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_wput_options: opt %d, len %d\n", + ip2dbg(("ip_output_options: opt %d, len %d\n", optval, optlen)); switch (optval) { uint32_t off; @@ -27832,25 +13051,25 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, case IPOPT_LSRR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; } off = opt[IPOPT_OFFSET]; - ip1dbg(("ip_wput_options: next hop 0x%x\n", + ip1dbg(("ip_output_options: next hop 0x%x\n", ntohl(dst))); /* * For strict: verify that dst is directly * reachable. 
*/ if (optval == IPOPT_SSRR) { - ire = ire_ftable_lookup(dst, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); + ire = ire_ftable_lookup_v4(dst, 0, 0, + IRE_IF_ALL, NULL, ALL_ZONES, ixa->ixa_tsl, + MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst, + NULL); if (ire == NULL) { - ip1dbg(("ip_wput_options: SSRR not" + ip1dbg(("ip_output_options: SSRR not" " directly reachable: 0x%x\n", ntohl(dst))); goto bad_src_route; @@ -27861,7 +13080,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, case IPOPT_RR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -27879,7 +13098,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, } if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; goto param_prob; @@ -27913,33 +13132,31 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) return (0); - ip1dbg(("ip_wput_options: error processing IP options.")); + ip1dbg(("ip_output_options: error processing IP options.")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; param_prob: - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. 
- */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(ipsec_mp); - return (-1); - } - icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_flags = IRAF_IS_IPV4; + + ip_drop_output("ip_output_options", mp, ill); + icmp_param_problem(mp, (uint8_t)code, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); return (-1); bad_src_route: - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. - */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(ipsec_mp); - return (-1); - } - icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_flags = IRAF_IS_IPV4; + + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); return (-1); } @@ -28082,29 +13299,60 @@ conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list) /* * For non streams based sockets assert flow control. */ - if (IPCL_IS_NONSTR(connp)) { - DTRACE_PROBE1(su__txq__full, conn_t *, connp); - (*connp->conn_upcalls->su_txq_full) - (connp->conn_upper_handle, B_TRUE); - } else { - conn_setqfull(connp); - noenable(connp->conn_wq); - } + conn_setqfull(connp, NULL); mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } +static void +conn_idl_remove(conn_t *connp) +{ + idl_t *idl = connp->conn_idl; + + if (idl != NULL) { + /* + * Remove ourself from the drain list, if we did not do + * a putq, or if the conn is closing. + * Note: It is possible that q->q_first is non-null. 
It means + * that these messages landed after we did a enableok() in + * ip_wsrv. Thus STREAMS will call ip_wsrv once again to + * service them. + */ + if (connp->conn_drain_next == connp) { + /* Singleton in the list */ + ASSERT(connp->conn_drain_prev == connp); + idl->idl_conn = NULL; + } else { + connp->conn_drain_prev->conn_drain_next = + connp->conn_drain_next; + connp->conn_drain_next->conn_drain_prev = + connp->conn_drain_prev; + if (idl->idl_conn == connp) + idl->idl_conn = connp->conn_drain_next; + } + } + connp->conn_drain_next = NULL; + connp->conn_drain_prev = NULL; + + conn_clrqfull(connp, NULL); + /* + * For streams based sockets open up flow control. + */ + if (!IPCL_IS_NONSTR(connp)) + enableok(connp->conn_wq); +} + /* * This conn is closing, and we are called from ip_close. OR - * This conn has been serviced by ip_wsrv, and we need to do the tail - * processing. - * If this conn is part of the drain list, we may need to sustain the drain - * process by qenabling the next conn in the drain list. We may also need to - * remove this conn from the list, if it is done. + * this conn is draining because flow-control on the ill has been relieved. + * + * We must also need to remove conn's on this idl from the list, and also + * inform the sockfs upcalls about the change in flow-control. */ static void conn_drain_tail(conn_t *connp, boolean_t closing) { idl_t *idl; + conn_t *next_connp; /* * connp->conn_idl is stable at this point, and no lock is needed @@ -28116,24 +13364,21 @@ conn_drain_tail(conn_t *connp, boolean_t closing) * instance of service trying to call conn_drain_insert on this conn * now. */ - ASSERT(!closing || (connp->conn_idl != NULL)); + ASSERT(!closing || connp == NULL || connp->conn_idl != NULL); /* * If connp->conn_idl is null, the conn has not been inserted into any * drain list even once since creation of the conn. Just return. 
*/ - if (connp->conn_idl == NULL) + if (connp == NULL || connp->conn_idl == NULL) return; - mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); - if (connp->conn_drain_prev == NULL) { /* This conn is currently not in the drain list. */ - mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); return; } idl = connp->conn_idl; - if (idl->idl_conn_draining == connp) { + if (!closing) { /* * This conn is the current drainer. If this is the last conn * in the drain list, we need to do more checks, in the 'if' @@ -28141,186 +13386,45 @@ conn_drain_tail(conn_t *connp, boolean_t closing) * to sustain the draining, and is handled in the 'else' * below. */ - if (connp->conn_drain_next == idl->idl_conn) { - /* - * This conn is the last in this list. This round - * of draining is complete. If idl_repeat is set, - * it means another flow enabling has happened from - * the driver/streams and we need to another round - * of draining. - * If there are more than 2 conns in the drain list, - * do a left rotate by 1, so that all conns except the - * conn at the head move towards the head by 1, and the - * the conn at the head goes to the tail. This attempts - * a more even share for all queues that are being - * drained. - */ - if ((connp->conn_drain_next != connp) && - (idl->idl_conn->conn_drain_next != connp)) { - idl->idl_conn = idl->idl_conn->conn_drain_next; - } - if (idl->idl_repeat) { - qenable(idl->idl_conn->conn_wq); - idl->idl_conn_draining = idl->idl_conn; - idl->idl_repeat = 0; - } else { - idl->idl_conn_draining = NULL; - } - } else { - /* - * If the next queue that we are now qenable'ing, - * is closing, it will remove itself from this list - * and qenable the subsequent queue in ip_close(). - * Serialization is acheived thru idl_lock. - */ - qenable(connp->conn_drain_next->conn_wq); - idl->idl_conn_draining = connp->conn_drain_next; - } - } - if (!connp->conn_did_putbq || closing) { - /* - * Remove ourself from the drain list, if we did not do - * a putbq, or if the conn is closing. 
- * Note: It is possible that q->q_first is non-null. It means - * that these messages landed after we did a enableok() in - * ip_wsrv. Thus STREAMS will call ip_wsrv once again to - * service them. - */ - if (connp->conn_drain_next == connp) { - /* Singleton in the list */ - ASSERT(connp->conn_drain_prev == connp); - idl->idl_conn = NULL; - idl->idl_conn_draining = NULL; - } else { - connp->conn_drain_prev->conn_drain_next = - connp->conn_drain_next; - connp->conn_drain_next->conn_drain_prev = - connp->conn_drain_prev; - if (idl->idl_conn == connp) - idl->idl_conn = connp->conn_drain_next; - ASSERT(idl->idl_conn_draining != connp); - - } - connp->conn_drain_next = NULL; - connp->conn_drain_prev = NULL; + next_connp = connp->conn_drain_next; + while (next_connp != connp) { + conn_t *delconnp = next_connp; - /* - * For non streams based sockets open up flow control. - */ - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (connp->conn_upper_handle, B_FALSE); - } else { - conn_clrqfull(connp); - enableok(connp->conn_wq); + next_connp = next_connp->conn_drain_next; + conn_idl_remove(delconnp); } + ASSERT(connp->conn_drain_next == idl->idl_conn); } + conn_idl_remove(connp); - mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } /* * Write service routine. Shared perimeter entry point. - * ip_wsrv can be called in any of the following ways. - * 1. The device queue's messages has fallen below the low water mark - * and STREAMS has backenabled the ill_wq. We walk thru all the - * the drain lists and backenable the first conn in each list. - * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the - * qenabled non-tcp upper layers. We start dequeing messages and call - * ip_wput for each message. + * The device queue's messages has fallen below the low water mark and STREAMS + * has backenabled the ill_wq. Send sockfs notification about flow-control onx + * each waiting conn. 
*/ - void ip_wsrv(queue_t *q) { - conn_t *connp; ill_t *ill; - mblk_t *mp; - - if (q->q_next) { - ill = (ill_t *)q->q_ptr; - if (ill->ill_state_flags == 0) { - ip_stack_t *ipst = ill->ill_ipst; - /* - * The device flow control has opened up. - * Walk through conn drain lists and qenable the - * first conn in each list. This makes sense only - * if the stream is fully plumbed and setup. - * Hence the if check above. - */ - ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); - } - return; - } - - connp = Q_TO_CONN(q); - ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); + ill = (ill_t *)q->q_ptr; + if (ill->ill_state_flags == 0) { + ip_stack_t *ipst = ill->ill_ipst; - /* - * 1. Set conn_draining flag to signal that service is active. - * - * 2. ip_output determines whether it has been called from service, - * based on the last parameter. If it is IP_WSRV it concludes it - * has been called from service. - * - * 3. Message ordering is preserved by the following logic. - * i. A directly called ip_output (i.e. not thru service) will queue - * the message at the tail, if conn_draining is set (i.e. service - * is running) or if q->q_first is non-null. - * - * ii. If ip_output is called from service, and if ip_output cannot - * putnext due to flow control, it does a putbq. - * - * 4. noenable the queue so that a putbq from ip_wsrv does not reenable - * (causing an infinite loop). - */ - ASSERT(!connp->conn_did_putbq); - - while ((q->q_first != NULL) && !connp->conn_did_putbq) { - connp->conn_draining = 1; - noenable(q); - while ((mp = getq(q)) != NULL) { - ASSERT(CONN_Q(q)); - - DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp); - ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); - if (connp->conn_did_putbq) { - /* ip_wput did a putbq */ - break; - } - } /* - * At this point, a thread coming down from top, calling - * ip_wput, may end up queueing the message. We have not yet - * enabled the queue, so ip_wsrv won't be called again. 
- * To avoid this race, check q->q_first again (in the loop) - * If the other thread queued the message before we call - * enableok(), we will catch it in the q->q_first check. - * If the other thread queues the message after we call - * enableok(), ip_wsrv will be called again by STREAMS. + * The device flow control has opened up. + * Walk through conn drain lists and qenable the + * first conn in each list. This makes sense only + * if the stream is fully plumbed and setup. + * Hence the ill_state_flags check above. */ - connp->conn_draining = 0; - enableok(q); + ip1dbg(("ip_wsrv: walking\n")); + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); + enableok(ill->ill_wq); } - - /* Enable the next conn for draining */ - conn_drain_tail(connp, B_FALSE); - - /* - * conn_direct_blocked is used to indicate blocked - * condition for direct path (ILL_DIRECT_CAPABLE()). - * This is the only place where it is set without - * checking for ILL_DIRECT_CAPABLE() and setting it - * to 0 is ok even if it is not ILL_DIRECT_CAPABLE(). - */ - if (!connp->conn_did_putbq && connp->conn_direct_blocked) { - DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp); - connp->conn_direct_blocked = B_FALSE; - } - - connp->conn_did_putbq = 0; } /* @@ -28369,21 +13473,7 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { idl = &tx_list->txl_drain_list[i]; mutex_enter(&idl->idl_lock); - if (idl->idl_conn == NULL) { - mutex_exit(&idl->idl_lock); - continue; - } - /* - * If this list is not being drained currently by - * an ip_wsrv thread, start the process. - */ - if (idl->idl_conn_draining == NULL) { - ASSERT(idl->idl_repeat == 0); - qenable(idl->idl_conn->conn_wq); - idl->idl_conn_draining = idl->idl_conn; - } else { - idl->idl_repeat = 1; - } + conn_drain_tail(idl->idl_conn, B_FALSE); mutex_exit(&idl->idl_lock); } } @@ -28393,240 +13483,190 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) * "matches" the conn. 
*/ boolean_t -conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, - zoneid_t zoneid) +conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha) { - ill_t *bound_ill; - boolean_t found; - ipif_t *ipif; - ire_t *ire; - ipaddr_t dst, src; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + ill_t *ill = ira->ira_rill; + zoneid_t zoneid = ira->ira_zoneid; + uint_t in_ifindex; + ipaddr_t dst, src; dst = ipha->ipha_dst; src = ipha->ipha_src; /* - * conn_incoming_ill is set by IP_BOUND_IF which limits + * conn_incoming_ifindex is set by IP_BOUND_IF which limits * unicast, broadcast and multicast reception to - * conn_incoming_ill. conn_wantpacket itself is called - * only for BROADCAST and multicast. + * conn_incoming_ifindex. + * conn_wantpacket is called for unicast, broadcast and + * multicast packets. */ - bound_ill = connp->conn_incoming_ill; - if (bound_ill != NULL) { - if (IS_IPMP(bound_ill)) { - if (bound_ill->ill_grp != ill->ill_grp) - return (B_FALSE); - } else { - if (bound_ill != ill) - return (B_FALSE); - } - } + in_ifindex = connp->conn_incoming_ifindex; - if (!CLASSD(dst)) { - if (IPCL_ZONE_MATCH(connp, zoneid)) - return (B_TRUE); - /* - * The conn is in a different zone; we need to check that this - * broadcast address is configured in the application's zone. 
- */ - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) + /* mpathd can bind to the under IPMP interface, which we allow */ + if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) { + if (!IS_UNDER_IPMP(ill)) return (B_FALSE); - ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, - connp->conn_zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); - ipif_refrele(ipif); - if (ire != NULL) { - ire_refrele(ire); - return (B_TRUE); - } else { + + if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill)) return (B_FALSE); - } } - if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && - connp->conn_zoneid == zoneid) { - /* - * Loopback case: the sending endpoint has IP_MULTICAST_LOOP - * disabled, therefore we don't dispatch the multicast packet to - * the sending zone. - */ + if (!IPCL_ZONE_MATCH(connp, zoneid)) return (B_FALSE); - } - if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { - /* - * Multicast packet on the loopback interface: we only match - * conns who joined the group in the specified zone. 
- */ - return (B_FALSE); - } + if (!(ira->ira_flags & IRAF_MULTICAST)) + return (B_TRUE); if (connp->conn_multi_router) { /* multicast packet and multicast router socket: send up */ return (B_TRUE); } - mutex_enter(&connp->conn_lock); - found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); - mutex_exit(&connp->conn_lock); - return (found); + if (ipha->ipha_protocol == IPPROTO_PIM || + ipha->ipha_protocol == IPPROTO_RSVP) + return (B_TRUE); + + return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill)); } -static void -conn_setqfull(conn_t *connp) +void +conn_setqfull(conn_t *connp, boolean_t *flow_stopped) { - queue_t *q = connp->conn_wq; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_TRUE); + if (flow_stopped != NULL) + *flow_stopped = B_TRUE; + } else { + queue_t *q = connp->conn_wq; - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); + ASSERT(q != NULL); if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - mutex_exit(QLOCK(q)); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + /* set flow_stopped to true under QLOCK */ + if (flow_stopped != NULL) + *flow_stopped = B_TRUE; + mutex_exit(QLOCK(q)); + } else { + /* flow_stopped is left unchanged */ + mutex_exit(QLOCK(q)); + } } } } -static void -conn_clrqfull(conn_t *connp) +void +conn_clrqfull(conn_t *connp, boolean_t *flow_stopped) { - queue_t *q = connp->conn_wq; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_FALSE); + if (flow_stopped != NULL) + *flow_stopped = B_FALSE; + } else { + queue_t *q = connp->conn_wq; - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); + ASSERT(q != NULL); if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); 
+ if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + /* set flow_stopped to false under QLOCK */ + if (flow_stopped != NULL) + *flow_stopped = B_FALSE; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + /* flow_stopped is left unchanged */ + mutex_exit(QLOCK(q)); + } } } + connp->conn_direct_blocked = B_FALSE; } /* - * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. + * Return the length in bytes of the IPv4 headers (base header, label, and + * other IP options) that will be needed based on the + * ip_pkt_t structure passed by the caller. + * + * The returned length does not include the length of the upper level + * protocol (ULP) header. + * The caller needs to check that the length doesn't exceed the max for IPv4. */ -/* ARGSUSED */ -static void -ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) +int +ip_total_hdrs_len_v4(const ip_pkt_t *ipp) { - ill_t *ill = (ill_t *)q->q_ptr; - mblk_t *mp1, *mp2; - ipif_t *ipif; - int err = 0; - conn_t *connp = NULL; - ipsq_t *ipsq; - arc_t *arc; - - ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); - - ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); - ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); - - ASSERT(IAM_WRITER_ILL(ill)); - mp2 = mp->b_cont; - mp->b_cont = NULL; + int len; - /* - * We have now received the arp bringup completion message - * from ARP. Mark the arp bringup as done. Also if the arp - * stream has already started closing, send up the AR_ARP_CLOSING - * ack now since ARP is waiting in close for this ack. 
- */ - mutex_enter(&ill->ill_lock); - ill->ill_arp_bringup_pending = 0; - if (ill->ill_arp_closing) { - mutex_exit(&ill->ill_lock); - /* Let's reuse the mp for sending the ack */ - arc = (arc_t *)mp->b_rptr; - mp->b_wptr = mp->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_CLOSING; - qreply(q, mp); - } else { - mutex_exit(&ill->ill_lock); - freeb(mp); + len = IP_SIMPLE_HDR_LENGTH; + if (ipp->ipp_fields & IPPF_LABEL_V4) { + ASSERT(ipp->ipp_label_len_v4 != 0); + /* We need to round up here */ + len += (ipp->ipp_label_len_v4 + 3) & ~3; } - ipsq = ill->ill_phyint->phyint_ipsq; - ipif = ipsq->ipsq_xop->ipx_pending_ipif; - mp1 = ipsq_pending_mp_get(ipsq, &connp); - ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); - if (mp1 == NULL) { - /* bringup was aborted by the user */ - freemsg(mp2); - return; + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + ASSERT(ipp->ipp_ipv4_options_len != 0); + ASSERT((ipp->ipp_ipv4_options_len & 3) == 0); + len += ipp->ipp_ipv4_options_len; } + return (len); +} - /* - * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we - * must have an associated conn_t. Otherwise, we're bringing this - * interface back up as part of handling an asynchronous event (e.g., - * physical address change). - */ - if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { - ASSERT(connp != NULL); - q = CONNP_TO_WQ(connp); - } else { - ASSERT(connp == NULL); - q = ill->ill_rq; - } +/* + * All-purpose routine to build an IPv4 header with options based + * on the abstract ip_pkt_t. + * + * The caller has to set the source and destination address as well as + * ipha_length. The caller has to massage any source route and compensate + * for the ULP pseudo-header checksum due to the source route. + */ +void +ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp, + uint8_t protocol) +{ + ipha_t *ipha = (ipha_t *)buf; + uint8_t *cp; - /* - * If the DL_BIND_REQ fails, it is noted - * in arc_name_offset. 
- */ - err = *((int *)mp2->b_rptr); - if (err == 0) { - if (ipif->ipif_isv6) { - if ((err = ipif_up_done_v6(ipif)) != 0) - ip0dbg(("ip_arp_done: init failed\n")); - } else { - if ((err = ipif_up_done(ipif)) != 0) - ip0dbg(("ip_arp_done: init failed\n")); - } - } else { - ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); - } + /* Initialize IPv4 header */ + ipha->ipha_type_of_service = ipp->ipp_type_of_service; + ipha->ipha_length = 0; /* Caller will set later */ + ipha->ipha_ident = 0; + ipha->ipha_fragment_offset_and_flags = 0; + ipha->ipha_ttl = ipp->ipp_unicast_hops; + ipha->ipha_protocol = protocol; + ipha->ipha_hdr_checksum = 0; - freemsg(mp2); + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + ipha->ipha_src = ipp->ipp_addr_v4; - if ((err == 0) && (ill->ill_up_ipifs)) { - err = ill_up_ipifs(ill, q, mp1); - if (err == EINPROGRESS) - return; + cp = (uint8_t *)&ipha[1]; + if (ipp->ipp_fields & IPPF_LABEL_V4) { + ASSERT(ipp->ipp_label_len_v4 != 0); + bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4); + cp += ipp->ipp_label_len_v4; + /* We need to round up here */ + while ((uintptr_t)cp & 0x3) { + *cp++ = IPOPT_NOP; + } } - /* - * If we have a moved ipif to bring up, and everything has succeeded - * to this point, bring it up on the IPMP ill. Otherwise, leave it - * down -- the admin can try to bring it up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - if (err == 0) { - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) - return; - } + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + ASSERT(ipp->ipp_ipv4_options_len != 0); + ASSERT((ipp->ipp_ipv4_options_len & 3) == 0); + bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len); + cp += ipp->ipp_ipv4_options_len; } + ipha->ipha_version_and_hdr_length = + (uint8_t)((IP_VERSION << 4) + buf_len / 4); - /* - * The operation must complete without EINPROGRESS since - * ipsq_pending_mp_get() has removed the mblk. 
Otherwise, the - * operation will be stuck forever in the ipsq. - */ - ASSERT(err != EINPROGRESS); - if (ipsq->ipsq_xop->ipx_current_ioctl != 0) - ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); - else - ipsq_current_finish(ipsq); + ASSERT((int)(cp - buf) == buf_len); } /* Allocate the private structure */ @@ -28659,47 +13699,43 @@ ip_priv_free(void *buf) * which holds the state information for this packet and invokes the * the classifier (via ipp_packet_process). The classification, depending on * configured filters, results in a list of actions for this packet. Invoking - * an action may cause the packet to be dropped, in which case the resulting - * mblk (*mpp) is NULL. proc indicates the callout position for - * this packet and ill_index is the interface this packet on or will leave + * an action may cause the packet to be dropped, in which case we return NULL. + * proc indicates the callout position for + * this packet and ill is the interface this packet arrived on or will leave * on (inbound and outbound resp.). + * + * We do the processing on the rill (mapped to the upper if ipmp), but MIB + * on the ill corrsponding to the destination IP address. 
*/ -void -ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) +mblk_t * +ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill) { - mblk_t *mp; ip_priv_t *priv; ipp_action_id_t aid; int rc = 0; ipp_packet_t *pp; -#define IP_CLASS "ip" /* If the classifier is not loaded, return */ if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { - return; + return (mp); } - mp = *mpp; ASSERT(mp != NULL); /* Allocate the packet structure */ - rc = ipp_packet_alloc(&pp, IP_CLASS, aid); - if (rc != 0) { - *mpp = NULL; - freemsg(mp); - return; - } + rc = ipp_packet_alloc(&pp, "ip", aid); + if (rc != 0) + goto drop; /* Allocate the private structure */ rc = ip_priv_alloc((void **)&priv); if (rc != 0) { - *mpp = NULL; - freemsg(mp); ipp_packet_free(pp); - return; + goto drop; } priv->proc = proc; - priv->ill_index = ill_index; + priv->ill_index = ill_get_upper_ifindex(rill); + ipp_packet_set_private(pp, priv, ip_priv_free); ipp_packet_set_data(pp, mp); @@ -28708,14 +13744,23 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) if (pp != NULL) { mp = ipp_packet_get_data(pp); ipp_packet_free(pp); - if (rc != 0) { - freemsg(mp); - *mpp = NULL; - } + if (rc != 0) + goto drop; + return (mp); } else { - *mpp = NULL; + /* No mp to trace in ip_drop_input/ip_drop_output */ + mp = NULL; } -#undef IP_CLASS +drop: + if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_process", mp, ill); + } else { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ip_process", mp, ill); + } + freemsg(mp); + return (NULL); } /* @@ -28723,102 +13768,92 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) * all the interfaces crossed by the related multirt routes. * The call is considered successful if the operation succeeds * on at least one interface. 
+ * + * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the + * multicast addresses with the ire argument being the first one. + * We walk the bucket to find all the of those. + * + * Common to IPv4 and IPv6. */ static int -ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, - boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) +ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *), + ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group, + mcast_record_t fmode, const in6_addr_t *v6src) { ire_t *ire_gw; irb_t *irb; + int ifindex; int error = 0; - opt_restart_t *or; + int result; ip_stack_t *ipst = ire->ire_ipst; + ipaddr_t group; + boolean_t isv6; + int match_flags; + + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + IN6_V4MAPPED_TO_IPADDR(v6group, group); + isv6 = B_FALSE; + } else { + isv6 = B_TRUE; + } irb = ire->ire_bucket; ASSERT(irb != NULL); - ASSERT(DB_TYPE(first_mp) == M_CTL); - - or = (opt_restart_t *)first_mp->b_rptr; - IRB_REFHOLD(irb); + result = 0; + irb_refhold(irb); for (; ire != NULL; ire = ire->ire_next) { if ((ire->ire_flags & RTF_MULTIRT) == 0) continue; - if (ire->ire_addr != group) - continue; - ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); - /* No resolver exists for the gateway; skip this ire. 
*/ + /* We handle -ifp routes by matching on the ill if set */ + match_flags = MATCH_IRE_TYPE; + if (ire->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; + + if (isv6) { + if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group)) + continue; + + ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, + 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } else { + if (ire->ire_addr != group) + continue; + + ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr, + 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } + /* No interface route exists for the gateway; skip this ire. */ if (ire_gw == NULL) continue; + if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire_gw); + continue; + } + ASSERT(ire_gw->ire_ill != NULL); /* IRE_INTERFACE */ + ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex; /* - * This function can return EINPROGRESS. If so the operation - * will be restarted from ip_restart_optmgmt which will - * call ip_opt_set and option processing will restart for - * this option. So we may end up calling 'fn' more than once. - * This requires that 'fn' is idempotent except for the - * return value. The operation is considered a success if + * The operation is considered a success if * it succeeds at least once on any one interface. */ - error = fn(connp, checkonly, group, ire_gw->ire_src_addr, - NULL, fmode, src, first_mp); + error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex, + fmode, v6src); if (error == 0) - or->or_private = CGTP_MCAST_SUCCESS; - - if (ip_debug > 0) { - ulong_t off; - char *ksym; - ksym = kobj_getsymname((uintptr_t)fn, &off); - ip2dbg(("ip_multirt_apply_membership: " - "called %s, multirt group 0x%08x via itf 0x%08x, " - "error %d [success %u]\n", - ksym ? 
ksym : "?", - ntohl(group), ntohl(ire_gw->ire_src_addr), - error, or->or_private)); - } + result = CGTP_MCAST_SUCCESS; ire_refrele(ire_gw); - if (error == EINPROGRESS) { - IRB_REFRELE(irb); - return (error); - } } - IRB_REFRELE(irb); + irb_refrele(irb); /* * Consider the call as successful if we succeeded on at least * one interface. Otherwise, return the last encountered error. */ - return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); -} - -/* - * Issue a warning regarding a route crossing an interface with an - * incorrect MTU. Only one message every 'ip_multirt_log_interval' - * amount of time is logged. - */ -static void -ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) -{ - hrtime_t current = gethrtime(); - char buf[INET_ADDRSTRLEN]; - ip_stack_t *ipst = ire->ire_ipst; - - /* Convert interval in ms to hrtime in ns */ - if (ipst->ips_multirt_bad_mtu_last_time + - ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= - current) { - cmn_err(CE_WARN, "ip: ignoring multiroute " - "to %s, incorrect MTU %u (expected %u)\n", - ip_dot_addr(ire->ire_addr, buf), - ire->ire_max_frag, max_frag); - - ipst->ips_multirt_bad_mtu_last_time = current; - } + return (result == CGTP_MCAST_SUCCESS ? 
0 : error); } /* @@ -28882,6 +13917,7 @@ ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, *ip_cgtp_filter_value = (boolean_t)new_value; + ill_set_inputfn_all(ipst); return (0); } @@ -28919,6 +13955,9 @@ ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) } ipst->ips_ip_cgtp_filter_ops = ops; + + ill_set_inputfn_all(ipst); + netstack_rele(ns); return (0); } @@ -28950,6 +13989,9 @@ ip_cgtp_filter_unregister(netstackid_t stackid) return (ENXIO); } ipst->ips_ip_cgtp_filter_ops = NULL; + + ill_set_inputfn_all(ipst); + netstack_rele(ns); return (0); } @@ -28984,7 +14026,7 @@ ip_cgtp_filter_is_registered(netstackid_t stackid) static int ip_squeue_switch(int val) { - int rval = SQ_FILL; + int rval; switch (val) { case IP_SQUEUE_ENTER_NODRAIN: @@ -28993,7 +14035,9 @@ ip_squeue_switch(int val) case IP_SQUEUE_ENTER: rval = SQ_PROCESS; break; + case IP_SQUEUE_FILL: default: + rval = SQ_FILL; break; } return (rval); @@ -29046,52 +14090,45 @@ ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) kstat_t *ksp; ip_stat_t template = { - { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, - { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, - { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_udp_input_err", KSTAT_DATA_UINT64 }, - { "ip_tcppullup", KSTAT_DATA_UINT64 }, - { "ip_tcpoptions", KSTAT_DATA_UINT64 }, - { "ip_multipkttcp", KSTAT_DATA_UINT64 }, - { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, + { "ip_recv_pullup", KSTAT_DATA_UINT64 }, { "ip_db_ref", KSTAT_DATA_UINT64 }, - { "ip_notaligned1", KSTAT_DATA_UINT64 }, - { "ip_notaligned2", KSTAT_DATA_UINT64 }, - { "ip_multimblk3", KSTAT_DATA_UINT64 }, - { "ip_multimblk4", KSTAT_DATA_UINT64 }, - { "ip_ipoptions", KSTAT_DATA_UINT64 }, - { "ip_classify_fail", KSTAT_DATA_UINT64 }, + { 
"ip_notaligned", KSTAT_DATA_UINT64 }, + { "ip_multimblk", KSTAT_DATA_UINT64 }, { "ip_opt", KSTAT_DATA_UINT64 }, - { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, - { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, - { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, - { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, + { "ip_ire_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_ire_reclaim_deleted", KSTAT_DATA_UINT64 }, + { "ip_nce_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_nce_reclaim_deleted", KSTAT_DATA_UINT64 }, + { "ip_dce_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_dce_reclaim_deleted", KSTAT_DATA_UINT64 }, { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, + { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "conn_in_recvdstaddr", KSTAT_DATA_UINT64 }, + { "conn_in_recvopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvif", KSTAT_DATA_UINT64 }, + { "conn_in_recvslla", KSTAT_DATA_UINT64 }, + { "conn_in_recvucred", KSTAT_DATA_UINT64 }, 
+ { "conn_in_recvttl", KSTAT_DATA_UINT64 }, + { "conn_in_recvhopopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 }, + { "conn_in_recvdstopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvrthdrdstopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvrthdr", KSTAT_DATA_UINT64 }, + { "conn_in_recvpktinfo", KSTAT_DATA_UINT64 }, + { "conn_in_recvtclass", KSTAT_DATA_UINT64 }, + { "conn_in_timestamp", KSTAT_DATA_UINT64 }, }; ksp = kstat_create_netstack("ip", 0, "ipstat", "net", @@ -29420,323 +14457,457 @@ icmp_kstat_update(kstat_t *kp, int rw) * a port. This is assured in ipcl_sctp_hash_insert(); */ void -ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, - uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, - zoneid_t zoneid) +ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports, + ip_recv_attr_t *ira) { conn_t *connp; queue_t *rq; - mblk_t *first_mp; boolean_t secure; - ip6_t *ip6h; - ip_stack_t *ipst = recv_ill->ill_ipst; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; - boolean_t sctp_csum_err = B_FALSE; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; - if (flags & IP_FF_SCTP_CSUM_ERR) { - sctp_csum_err = B_TRUE; - flags &= ~IP_FF_SCTP_CSUM_ERR; - } + secure = iraflags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - ip6h = (isv4) ? NULL : (ip6_t *)ipha; - - connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst); + connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h, + ira, ipst); if (connp == NULL) { /* * Although raw sctp is not summed, OOB chunks must be. * Drop the packet here if the sctp checksum failed. 
*/ - if (sctp_csum_err) { + if (iraflags & IRAF_SCTP_CSUM_ERR) { BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); - freemsg(first_mp); + freemsg(mp); return; } - sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present); + ira->ira_ill = ira->ira_rill = NULL; + sctp_ootb_input(mp, ira, ipst); + ira->ira_ill = ill; + ira->ira_rill = rill; return; } rq = connp->conn_rq; - if (!canputnext(rq)) { + if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) { CONN_DEC_REF(connp); - BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); + freemsg(mp); return; } - if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) : - CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - (isv4 ? ipha : NULL), ip6h, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); CONN_DEC_REF(connp); return; } } - /* - * We probably should not send M_CTL message up to - * raw socket. - */ - if (mctl_present) - freeb(first_mp); - /* Initiate IPPF processing here if needed. */ - if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) || - (!isv4 && IP6_IN_IPP(flags, ipst))) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - return; - } + if (iraflags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); + } else { + ill_t *rill = ira->ira_rill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* This is the SOCK_RAW, IPPROTO_SCTP case. 
*/ + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } + CONN_DEC_REF(connp); +} - if (connp->conn_recvif || connp->conn_recvslla || - ((connp->conn_ip_recvpktinfo || - (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && - (flags & IP_FF_IPINFO))) { - int in_flags = 0; +/* + * Free a packet that has the link-layer dl_unitdata_req_t or fast-path + * header before the ip payload. + */ +static void +ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len) +{ + int len = (mp->b_wptr - mp->b_rptr); + mblk_t *ip_mp; - /* - * Since sctp does not support IP_RECVPKTINFO for v4, only pass - * IPF_RECVIF. - */ - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (is_fp_mp || len != fp_mp_len) { + if (len > fp_mp_len) { + /* + * fastpath header and ip header in the first mblk + */ + mp->b_rptr += fp_mp_len; } else { - mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } + /* + * ip_xmit_attach_llhdr had to prepend an mblk to + * attach the fastpath header before ip header. + */ + ip_mp = mp->b_cont; + freeb(mp); + mp = ip_mp; + mp->b_rptr += (fp_mp_len - len); } + } else { + ip_mp = mp->b_cont; + freeb(mp); + mp = ip_mp; } - - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* - * We are sending the IPSEC_IN message also up. Refer - * to comments above this function. - * This is the SOCK_RAW, IPPROTO_SCTP case. 
- */ - (connp->conn_recv)(connp, mp, NULL); - CONN_DEC_REF(connp); + ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill); + freemsg(mp); } -#define UPDATE_IP_MIB_OB_COUNTERS(ill, len) \ -{ \ - BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits); \ - UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len)); \ -} /* - * This function should be called only if all packet processing - * including fragmentation is complete. Callers of this function - * must set mp->b_prev to one of these values: - * {0, IPP_FWD_OUT, IPP_LOCAL_OUT} - * prior to handing over the mp as first argument to this function. + * Normal post fragmentation function. + * + * Send a packet using the passed in nce. This handles both IPv4 and IPv6 + * using the same state machine. * - * If the ire passed by caller is incomplete, this function + * We return an error on failure. In particular we return EWOULDBLOCK + * when the driver flow controls. In that case this ensures that ip_wsrv runs + * (currently by canputnext failure resulting in backenabling from GLD.) + * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an + * indication that they can flow control until ip_wsrv() tells then to restart. + * + * If the nce passed by caller is incomplete, this function * queues the packet and if necessary, sends ARP request and bails. - * If the ire passed is fully resolved, we simply prepend + * If the Neighbor Cache passed is fully resolved, we simply prepend * the link-layer header to the packet, do ipsec hw acceleration * work if necessary, and send the packet out on the wire. - * - * NOTE: IPsec will only call this function with fully resolved - * ires if hw acceleration is involved. - * TODO list : - * a Handle M_MULTIDATA so that - * tcp_multisend->tcp_multisend_data can - * call ip_xmit_v4 directly - * b Handle post-ARP work for fragments so that - * ip_wput_frag can call this function. 
*/ -ipxmit_state_t -ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, - boolean_t flow_ctl_enabled, conn_t *connp) +/* ARGSUSED6 */ +int +ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, + uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie) { - nce_t *arpce; - ipha_t *ipha; - queue_t *q; - int ill_index; - mblk_t *nxt_mp, *first_mp; - boolean_t xmit_drop = B_FALSE; - ip_proc_t proc; - ill_t *out_ill; - int pkt_len; + queue_t *wq; + ill_t *ill = nce->nce_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint64_t delta; + boolean_t isv6 = ill->ill_isv6; + boolean_t fp_mp; + ncec_t *ncec = nce->nce_common; - arpce = ire->ire_nce; - ASSERT(arpce != NULL); + DTRACE_PROBE1(ip__xmit, nce_t *, nce); - DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce); + ASSERT(mp != NULL); + ASSERT(mp->b_datap->db_type == M_DATA); + ASSERT(pkt_len == msgdsize(mp)); - mutex_enter(&arpce->nce_lock); - switch (arpce->nce_state) { - case ND_REACHABLE: - /* If there are other queued packets, queue this packet */ - if (arpce->nce_qd_mp != NULL) { - if (mp != NULL) - nce_queue_mp_common(arpce, mp, B_FALSE); - mp = arpce->nce_qd_mp; + /* + * If we have already been here and are coming back after ARP/ND. + * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs + * in that case since they have seen the packet when it came here + * the first time. 
+ */ + if (ixaflags & IXAF_NO_TRACE) + goto sendit; + + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(!isv6); + ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); + if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) && + !(ixaflags & IXAF_NO_PFHOOK)) { + int error; + + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__physical__out__end, + mblk_t *, mp); + if (mp == NULL) + return (error); + + /* The length could have changed */ + pkt_len = msgdsize(mp); + } + if (ipst->ips_ip4_observe.he_interested) { + /* + * Note that for TX the zoneid is the sending + * zone, whether or not MLP is in play. + * Since the szone argument is the IP zoneid (i.e., + * zero for exclusive-IP zones) and ipobs wants + * the system zoneid, we map it here. + */ + szone = IP_REAL_ZONEID(szone, ipst); + + /* + * On the outbound path the destination zone will be + * unknown as we're sending this packet out on the + * wire. 
+ */ + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, + ill, ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(isv6); + ASSERT(pkt_len == + ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN); + if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) && + !(ixaflags & IXAF_NO_PFHOOK)) { + int error; + + FW_HOOKS6(ipst->ips_ip6_physical_out_event, + ipst->ips_ipv6firewall_physical_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__physical__out__end, + mblk_t *, mp); + if (mp == NULL) + return (error); + + /* The length could have changed */ + pkt_len = msgdsize(mp); + } + if (ipst->ips_ip6_observe.he_interested) { + /* See above */ + szone = IP_REAL_ZONEID(szone, ipst); + + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, + ill, ipst); } - arpce->nce_qd_mp = NULL; - mutex_exit(&arpce->nce_lock); + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, + ipha_t *, NULL, ip6_t *, ip6h, int, 0); + } +sendit: + /* + * We check the state without a lock because the state can never + * move "backwards" to initial or incomplete. + */ + switch (ncec->ncec_state) { + case ND_REACHABLE: + case ND_STALE: + case ND_DELAY: + case ND_PROBE: + mp = ip_xmit_attach_llhdr(mp, nce); + if (mp == NULL) { + /* + * ip_xmit_attach_llhdr has increased + * ipIfStatsOutDiscards and called ip_drop_output() + */ + return (ENOBUFS); + } /* - * Flush the queue. In the common case, where the - * ARP is already resolved, it will go through the - * while loop only once. + * check if nce_fastpath completed and we tagged on a + * copy of nce_fp_mp in ip_xmit_attach_llhdr(). 
*/ - while (mp != NULL) { + fp_mp = (mp->b_datap->db_type == M_DATA); - nxt_mp = mp->b_next; - mp->b_next = NULL; - ASSERT(mp->b_datap->db_type != M_CTL); - pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length); + if (fp_mp && + (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) { + ill_dld_direct_t *idd; + + idd = &ill->ill_dld_capab->idc_direct; /* - * This info is needed for IPQOS to do COS marking - * in ip_wput_attach_llhdr->ip_process. + * Send the packet directly to DLD, where it + * may be queued depending on the availability + * of transmit resources at the media layer. + * Return value should be taken into + * account and flow control the TCP. */ - proc = (ip_proc_t)(uintptr_t)mp->b_prev; - mp->b_prev = NULL; - - /* set up ill index for outbound qos processing */ - out_ill = ire_to_ill(ire); - ill_index = out_ill->ill_phyint->phyint_ifindex; - first_mp = ip_wput_attach_llhdr(mp, ire, proc, - ill_index, &ipha); - if (first_mp == NULL) { - xmit_drop = B_TRUE; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - goto next_mp; - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, + pkt_len); - /* non-ipsec hw accel case */ - if (io == NULL || !io->ipsec_out_accelerated) { - /* send it */ - q = ire->ire_stq; - if (proc == IPP_FWD_OUT) { - UPDATE_IB_PKT_COUNT(ire); - } else { - UPDATE_OB_PKT_COUNT(ire); - } - ire->ire_last_used_time = lbolt; + if (ixaflags & IXAF_NO_DEV_FLOW_CTL) { + (void) idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC); + } else { + uintptr_t cookie; - if (flow_ctl_enabled || canputnext(q)) { - if (proc == IPP_FWD_OUT) { + if ((cookie = idd->idd_tx_df(idd->idd_tx_dh, + mp, (uintptr_t)xmit_hint, 0)) != 0) { + if (ixacookie != NULL) + *ixacookie = cookie; + return (EWOULDBLOCK); + } + } + } else { + wq = ill->ill_wq; + + if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) && + !canputnext(wq)) { + if (ixacookie != NULL) + *ixacookie = 0; + ip_xmit_flowctl_drop(ill, mp, 
fp_mp, + nce->nce_fp_mp != NULL ? + MBLKL(nce->nce_fp_mp) : 0); + return (EWOULDBLOCK); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, + pkt_len); + putnext(wq, mp); + } - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams); + /* + * The rest of this function implements Neighbor Unreachability + * detection. Determine if the ncec is eligible for NUD. + */ + if (ncec->ncec_flags & NCE_F_NONUD) + return (0); - } - UPDATE_IP_MIB_OB_COUNTERS(out_ill, - pkt_len); + ASSERT(ncec->ncec_state != ND_INCOMPLETE); - DTRACE_IP7(send, mblk_t *, first_mp, - conn_t *, NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, - 0); + /* + * Check for upper layer advice + */ + if (ixaflags & IXAF_REACH_CONF) { + timeout_id_t tid; - ILL_SEND_TX(out_ill, - ire, connp, first_mp, 0, connp); - } else { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - xmit_drop = B_TRUE; - freemsg(first_mp); + /* + * It should be o.k. to check the state without + * a lock here, at most we lose an advice. 
+ */ + ncec->ncec_last = TICK_TO_MSEC(lbolt64); + if (ncec->ncec_state != ND_REACHABLE) { + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_REACHABLE; + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + (void) untimeout(tid); + if (ip_debug > 2) { + /* ip1dbg */ + pr_addr_dbg("ip_xmit: state" + " for %s changed to" + " REACHABLE\n", AF_INET6, + &ncec->ncec_addr); } - } else { + } + return (0); + } + + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + ip1dbg(("ip_xmit: delta = %" PRId64 + " ill_reachable_time = %d \n", delta, + ill->ill_reachable_time)); + if (delta > (uint64_t)ill->ill_reachable_time) { + mutex_enter(&ncec->ncec_lock); + switch (ncec->ncec_state) { + case ND_REACHABLE: + ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); + /* FALLTHROUGH */ + case ND_STALE: /* - * Safety Pup says: make sure this - * is going to the right interface! + * ND_REACHABLE is identical to + * ND_STALE in this specific case. If + * reachable time has expired for this + * neighbor (delta is greater than + * reachable time), conceptually, the + * neighbor cache is no longer in + * REACHABLE state, but already in + * STALE state. So the correct + * transition here is to ND_DELAY. 
*/ - ill_t *ill1 = - (ill_t *)ire->ire_stq->q_ptr; - int ifindex = - ill1->ill_phyint->phyint_ifindex; - if (ifindex != - io->ipsec_out_capab_ill_index) { - xmit_drop = B_TRUE; - freemsg(mp); - } else { - UPDATE_IP_MIB_OB_COUNTERS(ill1, - pkt_len); - - DTRACE_IP7(send, mblk_t *, first_mp, - conn_t *, NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, ill1, - ipha_t *, ipha, ip6_t *, NULL, - int, 0); - - ipsec_hw_putnext(ire->ire_stq, mp); + ncec->ncec_state = ND_DELAY; + mutex_exit(&ncec->ncec_lock); + nce_restart_timer(ncec, + ipst->ips_delay_first_probe_time); + if (ip_debug > 3) { + /* ip2dbg */ + pr_addr_dbg("ip_xmit: state" + " for %s changed to" + " DELAY\n", AF_INET6, + &ncec->ncec_addr); } + break; + case ND_DELAY: + case ND_PROBE: + mutex_exit(&ncec->ncec_lock); + /* Timers have already started */ + break; + case ND_UNREACHABLE: + /* + * nce_timer has detected that this ncec + * is unreachable and initiated deleting + * this ncec. + * This is a harmless race where we found the + * ncec before it was deleted and have + * just sent out a packet using this + * unreachable ncec. + */ + mutex_exit(&ncec->ncec_lock); + break; + default: + ASSERT(0); + mutex_exit(&ncec->ncec_lock); } -next_mp: - mp = nxt_mp; - } /* while (mp != NULL) */ - if (xmit_drop) - return (SEND_FAILED); - else - return (SEND_PASSED); + } + return (0); - case ND_INITIAL: case ND_INCOMPLETE: - /* - * While we do send off packets to dests that - * use fully-resolved CGTP routes, we do not - * handle unresolved CGTP routes. + * the state could have changed since we didn't hold the lock. + * Re-verify state under lock. 
*/ - ASSERT(!(ire->ire_flags & RTF_MULTIRT)); - ASSERT(io == NULL || !io->ipsec_out_accelerated); - - if (mp != NULL) { - /* queue the packet */ - nce_queue_mp_common(arpce, mp, B_FALSE); + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) { + mutex_exit(&ncec->ncec_lock); + goto sendit; } + /* queue the packet */ + nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill)); + mutex_exit(&ncec->ncec_lock); + DTRACE_PROBE2(ip__xmit__incomplete, + (ncec_t *), ncec, (mblk_t *), mp); + return (0); - if (arpce->nce_state == ND_INCOMPLETE) { - mutex_exit(&arpce->nce_lock); - DTRACE_PROBE3(ip__xmit__incomplete, - (ire_t *), ire, (mblk_t *), mp, - (ipsec_out_t *), io); - return (LOOKUP_IN_PROGRESS); + case ND_INITIAL: + /* + * State could have changed since we didn't hold the lock, so + * re-verify state. + */ + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) { + mutex_exit(&ncec->ncec_lock); + goto sendit; + } + nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill)); + if (ncec->ncec_state == ND_INITIAL) { + ncec->ncec_state = ND_INCOMPLETE; + mutex_exit(&ncec->ncec_lock); + /* + * figure out the source we want to use + * and resolve it. + */ + ip_ndp_resolve(ncec); + } else { + mutex_exit(&ncec->ncec_lock); } + return (0); - arpce->nce_state = ND_INCOMPLETE; - mutex_exit(&arpce->nce_lock); + case ND_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE", + mp, ill); + freemsg(mp); + return (0); - /* - * Note that ire_add() (called from ire_forward()) - * holds a ref on the ire until ARP is completed. 
- */ - ire_arpresolve(ire); - return (LOOKUP_IN_PROGRESS); default: ASSERT(0); - mutex_exit(&arpce->nce_lock); - return (LLHDR_RESLV_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - ND_other", + mp, ill); + freemsg(mp); + return (ENETUNREACH); } } -#undef UPDATE_IP_MIB_OB_COUNTERS - /* * Return B_TRUE if the buffers differ in length or content. * This is used for comparing extension header buffers. @@ -29803,52 +14974,300 @@ ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, } /* - * Free the storage pointed to by the members of an ip6_pkt_t. + * Free the storage pointed to by the members of an ip_pkt_t. */ void -ip6_pkt_free(ip6_pkt_t *ipp) +ip_pkt_free(ip_pkt_t *ipp) { - ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); + uint_t fields = ipp->ipp_fields; - if (ipp->ipp_fields & IPPF_HOPOPTS) { + if (fields & IPPF_HOPOPTS) { kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); ipp->ipp_hopopts = NULL; ipp->ipp_hopoptslen = 0; } - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; + if (fields & IPPF_RTHDRDSTOPTS) { + kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); + ipp->ipp_rthdrdstopts = NULL; + ipp->ipp_rthdrdstoptslen = 0; } - if (ipp->ipp_fields & IPPF_DSTOPTS) { + if (fields & IPPF_DSTOPTS) { kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); ipp->ipp_dstopts = NULL; ipp->ipp_dstoptslen = 0; } - if (ipp->ipp_fields & IPPF_RTHDR) { + if (fields & IPPF_RTHDR) { kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); ipp->ipp_rthdr = NULL; ipp->ipp_rthdrlen = 0; } - ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | - IPPF_RTHDR); + if (fields & IPPF_IPV4_OPTIONS) { + kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + } + if (fields & IPPF_LABEL_V4) { + kmem_free(ipp->ipp_label_v4, 
ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + } + if (fields & IPPF_LABEL_V6) { + kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6); + ipp->ipp_label_v6 = NULL; + ipp->ipp_label_len_v6 = 0; + } + ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS | + IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6); +} + +/* + * Copy from src to dst and allocate as needed. + * Returns zero or ENOMEM. + * + * The caller must initialize dst to zero. + */ +int +ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag) +{ + uint_t fields = src->ipp_fields; + + /* Start with fields that don't require memory allocation */ + dst->ipp_fields = fields & + ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS | + IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6); + + dst->ipp_addr = src->ipp_addr; + dst->ipp_unicast_hops = src->ipp_unicast_hops; + dst->ipp_hoplimit = src->ipp_hoplimit; + dst->ipp_tclass = src->ipp_tclass; + dst->ipp_type_of_service = src->ipp_type_of_service; + + if (fields & IPPF_HOPOPTS) { + dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag); + if (dst->ipp_hopopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_HOPOPTS; + bcopy(src->ipp_hopopts, dst->ipp_hopopts, + src->ipp_hopoptslen); + dst->ipp_hopoptslen = src->ipp_hopoptslen; + } + if (fields & IPPF_RTHDRDSTOPTS) { + dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen, + kmflag); + if (dst->ipp_rthdrdstopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_RTHDRDSTOPTS; + bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts, + src->ipp_rthdrdstoptslen); + dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen; + } + if (fields & IPPF_DSTOPTS) { + dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag); + if (dst->ipp_dstopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_DSTOPTS; + bcopy(src->ipp_dstopts, dst->ipp_dstopts, + 
src->ipp_dstoptslen); + dst->ipp_dstoptslen = src->ipp_dstoptslen; + } + if (fields & IPPF_RTHDR) { + dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag); + if (dst->ipp_rthdr == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_RTHDR; + bcopy(src->ipp_rthdr, dst->ipp_rthdr, + src->ipp_rthdrlen); + dst->ipp_rthdrlen = src->ipp_rthdrlen; + } + if (fields & IPPF_IPV4_OPTIONS) { + dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len, + kmflag); + if (dst->ipp_ipv4_options == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_IPV4_OPTIONS; + bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options, + src->ipp_ipv4_options_len); + dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len; + } + if (fields & IPPF_LABEL_V4) { + dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag); + if (dst->ipp_label_v4 == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_LABEL_V4; + bcopy(src->ipp_label_v4, dst->ipp_label_v4, + src->ipp_label_len_v4); + dst->ipp_label_len_v4 = src->ipp_label_len_v4; + } + if (fields & IPPF_LABEL_V6) { + dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag); + if (dst->ipp_label_v6 == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_LABEL_V6; + bcopy(src->ipp_label_v6, dst->ipp_label_v6, + src->ipp_label_len_v6); + dst->ipp_label_len_v6 = src->ipp_label_len_v6; + } + if (fields & IPPF_FRAGHDR) { + dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag); + if (dst->ipp_fraghdr == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_FRAGHDR; + bcopy(src->ipp_fraghdr, dst->ipp_fraghdr, + src->ipp_fraghdrlen); + dst->ipp_fraghdrlen = src->ipp_fraghdrlen; + } + return (0); +} + +/* + * Returns INADDR_ANY if no source route + */ +ipaddr_t +ip_pkt_source_route_v4(const ip_pkt_t *ipp) +{ + ipaddr_t nexthop = INADDR_ANY; + ipoptp_t opts; + uchar_t *opt; + uint8_t optval; + uint8_t optlen; + uint32_t totallen; + + 
if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (INADDR_ANY); + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return (INADDR_ANY); + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); + optval != IPOPT_EOL; + optval = ipoptp_next(&opts)) { + opt = opts.ipoptp_cur; + switch (optval) { + uint8_t off; + case IPOPT_SSRR: + case IPOPT_LSRR: + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } + optlen = opts.ipoptp_len; + off = opt[IPOPT_OFFSET]; + off--; + if (optlen < IP_ADDR_LEN || + off > optlen - IP_ADDR_LEN) { + /* End of source route */ + break; + } + bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN); + if (nexthop == htonl(INADDR_LOOPBACK)) { + /* Ignore */ + nexthop = INADDR_ANY; + break; + } + break; + } + } + return (nexthop); +} + +/* + * Reverse a source route. + */ +void +ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp) +{ + ipaddr_t tmp; + ipoptp_t opts; + uchar_t *opt; + uint8_t optval; + uint32_t totallen; + + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return; + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return; + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); + optval != IPOPT_EOL; + optval = ipoptp_next(&opts)) { + uint8_t off1, off2; + + opt = opts.ipoptp_cur; + switch (optval) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } + off1 = IPOPT_MINOFF_SR - 1; + off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; + while (off2 > off1) { + bcopy(opt + off2, &tmp, IP_ADDR_LEN); + bcopy(opt + off1, opt + off2, IP_ADDR_LEN); + bcopy(&tmp, opt + off2, IP_ADDR_LEN); + off2 -= IP_ADDR_LEN; + off1 += IP_ADDR_LEN; + } + opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; + break; + } + } +} + +/* + * Returns NULL if no routing header + */ +in6_addr_t * +ip_pkt_source_route_v6(const ip_pkt_t *ipp) +{ + in6_addr_t *nexthop = NULL; + ip6_rthdr0_t *rthdr; + + if (!(ipp->ipp_fields & IPPF_RTHDR)) + return (NULL); + + rthdr = (ip6_rthdr0_t 
*)ipp->ipp_rthdr; + if (rthdr->ip6r0_segleft == 0) + return (NULL); + + nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); + return (nexthop); } zoneid_t -ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst, +ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira, zoneid_t lookup_zoneid) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ire_t *ire; int ire_flags = MATCH_IRE_TYPE; zoneid_t zoneid = ALL_ZONES; - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) + if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE)) return (ALL_ZONES); if (lookup_zoneid != ALL_ZONES) ire_flags |= MATCH_IRE_ZONEONLY; - ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, NULL, - lookup_zoneid, NULL, ire_flags, ipst); + ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK, + NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL); if (ire != NULL) { zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst); ire_refrele(ire); @@ -29858,24 +15277,23 @@ ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst, zoneid_t ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill, - ip_stack_t *ipst, zoneid_t lookup_zoneid) + ip_recv_attr_t *ira, zoneid_t lookup_zoneid) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ire_t *ire; int ire_flags = MATCH_IRE_TYPE; zoneid_t zoneid = ALL_ZONES; - ipif_t *ipif_arg = NULL; - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) + if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE)) return (ALL_ZONES); - if (IN6_IS_ADDR_LINKLOCAL(addr)) { + if (IN6_IS_ADDR_LINKLOCAL(addr)) ire_flags |= MATCH_IRE_ILL; - ipif_arg = ill->ill_ipif; - } + if (lookup_zoneid != ALL_ZONES) ire_flags |= MATCH_IRE_ZONEONLY; - ire = ire_ctable_lookup_v6(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, - ipif_arg, lookup_zoneid, NULL, ire_flags, ipst); + ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK, + ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL); if (ire != NULL) { 
zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst); ire_refrele(ire); @@ -29964,3 +15382,29 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, imp->b_cont = NULL; freemsg(imp); } + +/* + * Utility routine that checks if `v4srcp' is a valid address on underlying + * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif + * associated with `v4srcp' on success. NOTE: if this is not called from + * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the + * group during or after this lookup. + */ +boolean_t +ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp) +{ + ipif_t *ipif; + + ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst); + if (ipif != NULL) { + if (ipifp != NULL) + *ipifp = ipif; + else + ipif_refrele(ipif); + return (B_TRUE); + } + + ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n", + *v4srcp)); + return (B_FALSE); +} diff --git a/usr/src/uts/common/inet/ip/ip2mac.c b/usr/src/uts/common/inet/ip/ip2mac.c index e232a5bb63..55a17f762a 100644 --- a/usr/src/uts/common/inet/ip/ip2mac.c +++ b/usr/src/uts/common/inet/ip/ip2mac.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -29,7 +30,6 @@ #include <inet/ip2mac.h> #include <inet/ip2mac_impl.h> #include <sys/zone.h> -#include <sys/dlpi.h> #include <inet/ip_ndp.h> #include <inet/ip_if.h> #include <inet/ip6.h> @@ -38,18 +38,18 @@ * dispatch pending callbacks. 
*/ void -nce_cb_dispatch(nce_t *nce) +ncec_cb_dispatch(ncec_t *ncec) { - nce_cb_t *nce_cb = list_head(&nce->nce_cb); + ncec_cb_t *ncec_cb; ip2mac_t ip2m; - mutex_enter(&nce->nce_lock); - if (list_is_empty(&nce->nce_cb)) { - mutex_exit(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + if (list_is_empty(&ncec->ncec_cb)) { + mutex_exit(&ncec->ncec_lock); return; } - nce_ip2mac_response(&ip2m, nce); - nce_cb_refhold_locked(nce); + ncec_ip2mac_response(&ip2m, ncec); + ncec_cb_refhold_locked(ncec); /* * IP does not hold internal locks like nce_lock across calls to * other subsystems for fear of recursive lock entry and lock @@ -58,75 +58,82 @@ nce_cb_dispatch(nce_t *nce) * across calls into another subsystem, especially if calls can * happen in either direction). */ - nce_cb = list_head(&nce->nce_cb); - for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) { - if (nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) + ncec_cb = list_head(&ncec->ncec_cb); + for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) { + if (ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) continue; - nce_cb->nce_cb_flags |= NCE_CB_DISPATCHED; - mutex_exit(&nce->nce_lock); - (*nce_cb->nce_cb_func)(&ip2m, nce_cb->nce_cb_arg); - mutex_enter(&nce->nce_lock); + ncec_cb->ncec_cb_flags |= NCE_CB_DISPATCHED; + mutex_exit(&ncec->ncec_lock); + (*ncec_cb->ncec_cb_func)(&ip2m, ncec_cb->ncec_cb_arg); + mutex_enter(&ncec->ncec_lock); } - nce_cb_refrele(nce); - mutex_exit(&nce->nce_lock); + ncec_cb_refrele(ncec); + mutex_exit(&ncec->ncec_lock); } /* * fill up the ip2m response fields with inforamation from the nce. 
*/ void -nce_ip2mac_response(ip2mac_t *ip2m, nce_t *nce) +ncec_ip2mac_response(ip2mac_t *ip2m, ncec_t *ncec) { - boolean_t isv6 = (nce->nce_ipversion == IPV6_VERSION); + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + sin_t *sin; sin6_t *sin6; struct sockaddr_dl *sdl; - uchar_t *nce_lladdr; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); bzero(ip2m, sizeof (*ip2m)); - if (NCE_ISREACHABLE(nce) && (nce->nce_flags & NCE_F_CONDEMNED) == 0) + if (NCE_ISREACHABLE(ncec) && !NCE_ISCONDEMNED(ncec)) ip2m->ip2mac_err = 0; else ip2m->ip2mac_err = ESRCH; if (isv6) { sin6 = (sin6_t *)&ip2m->ip2mac_pa; sin6->sin6_family = AF_INET6; - sin6->sin6_addr = nce->nce_addr; + sin6->sin6_addr = ncec->ncec_addr; + } else { + sin = (sin_t *)&ip2m->ip2mac_pa; + sin->sin_family = AF_INET; + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &sin->sin_addr); } if (ip2m->ip2mac_err == 0) { sdl = &ip2m->ip2mac_ha; sdl->sdl_family = AF_LINK; - sdl->sdl_type = nce->nce_ill->ill_type; + sdl->sdl_type = ncec->ncec_ill->ill_type; + /* + * should we put ncec_ill->ill_name in there? why? 
+ * likewise for the sdl_index + */ sdl->sdl_nlen = 0; - sdl->sdl_alen = nce->nce_ill->ill_phys_addr_length; - nce_lladdr = nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(nce->nce_ill); - bcopy(nce_lladdr, LLADDR(sdl), sdl->sdl_alen); + sdl->sdl_alen = ncec->ncec_ill->ill_phys_addr_length; + if (ncec->ncec_lladdr != NULL) + bcopy(ncec->ncec_lladdr, LLADDR(sdl), sdl->sdl_alen); } } void -nce_cb_refhold_locked(nce_t *nce) +ncec_cb_refhold_locked(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); - nce->nce_cb_walker_cnt++; + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + ncec->ncec_cb_walker_cnt++; } void -nce_cb_refrele(nce_t *nce) +ncec_cb_refrele(ncec_t *ncec) { - nce_cb_t *nce_cb, *nce_cb_next = NULL; + ncec_cb_t *ncec_cb, *ncec_cb_next = NULL; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - if (--nce->nce_cb_walker_cnt == 0) { - for (nce_cb = list_head(&nce->nce_cb); nce_cb != NULL; - nce_cb = nce_cb_next) { + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + if (--ncec->ncec_cb_walker_cnt == 0) { + for (ncec_cb = list_head(&ncec->ncec_cb); ncec_cb != NULL; + ncec_cb = ncec_cb_next) { - nce_cb_next = list_next(&nce->nce_cb, nce_cb); - if ((nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) == 0) + ncec_cb_next = list_next(&ncec->ncec_cb, ncec_cb); + if ((ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) == 0) continue; - list_remove(&nce->nce_cb, nce_cb); - kmem_free(nce_cb, sizeof (*nce_cb)); + list_remove(&ncec->ncec_cb, ncec_cb); + kmem_free(ncec_cb, sizeof (*ncec_cb)); } } } @@ -136,25 +143,25 @@ nce_cb_refrele(nce_t *nce) * after address resolution succeeds/fails. 
*/ static ip2mac_id_t -nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg) +ncec_add_cb(ncec_t *ncec, ip2mac_callback_t *cb, void *cbarg) { - nce_cb_t *nce_cb; + ncec_cb_t *nce_cb; ip2mac_id_t ip2mid = NULL; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL) return (ip2mid); - nce_cb->nce_cb_func = cb; - nce_cb->nce_cb_arg = cbarg; + nce_cb->ncec_cb_func = cb; + nce_cb->ncec_cb_arg = cbarg; /* - * We identify the nce_cb_t during cancellation by the address + * We identify the ncec_cb_t during cancellation by the address * of the nce_cb_t itself, and, as a short-cut for eliminating - * clear mismatches, only look in the callback list of nce's + * clear mismatches, only look in the callback list of ncec's * whose address is equal to the nce_cb_id. */ - nce_cb->nce_cb_id = nce; /* no refs! just an address */ - list_insert_tail(&nce->nce_cb, nce_cb); - ip2mid = nce; /* this is the id to be used in ip2mac_cancel */ + nce_cb->ncec_cb_id = ncec; /* no refs! just an address */ + list_insert_tail(&ncec->ncec_cb, nce_cb); + ip2mid = ncec; /* this is the id to be used in ip2mac_cancel */ return (nce_cb); } @@ -167,29 +174,24 @@ nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg) * the resolution completes. */ ip2mac_id_t -ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, +ip2mac(uint_t op, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, zoneid_t zoneid) { - nce_t *nce; + ncec_t *ncec; + nce_t *nce = NULL; boolean_t isv6; ill_t *ill; netstack_t *ns; ip_stack_t *ipst; ip2mac_id_t ip2mid = NULL; + sin_t *sin; sin6_t *sin6; int err; uint64_t delta; + boolean_t need_resolve = B_FALSE; isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6); - if (!isv6) { - /* - * IPv4 is not currently supported. 
- */ - ip2m->ip2mac_err = ENOTSUP; - return (NULL); - } - ns = netstack_find_by_zoneid(zoneid); if (ns == NULL) { ip2m->ip2mac_err = EINVAL; @@ -205,8 +207,7 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, /* * find the ill from the ip2m->ip2mac_ifindex */ - ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, NULL, - NULL, NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, ipst); if (ill == NULL) { ip2m->ip2mac_err = ENXIO; netstack_rele(ns); @@ -214,32 +215,39 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, } if (isv6) { sin6 = (sin6_t *)&ip2m->ip2mac_pa; - if (flags == IP2MAC_LOOKUP) { - nce = ndp_lookup_v6(ill, B_FALSE, &sin6->sin6_addr, - B_FALSE); + if (op == IP2MAC_LOOKUP) { + nce = nce_lookup_v6(ill, &sin6->sin6_addr); } else { - err = ndp_lookup_then_add_v6(ill, B_FALSE, NULL, - &sin6->sin6_addr, &ipv6_all_ones, &ipv6_all_zeros, - 0, 0, ND_INCOMPLETE, &nce); + err = nce_lookup_then_add_v6(ill, NULL, + ill->ill_phys_addr_length, + &sin6->sin6_addr, 0, ND_UNCHANGED, &nce); } } else { - ip2m->ip2mac_err = ENOTSUP; /* yet. 
*/ - goto done; + sin = (sin_t *)&ip2m->ip2mac_pa; + if (op == IP2MAC_LOOKUP) { + nce = nce_lookup_v4(ill, &sin->sin_addr.s_addr); + } else { + err = nce_lookup_then_add_v4(ill, NULL, + ill->ill_phys_addr_length, + &sin->sin_addr.s_addr, 0, ND_UNCHANGED, &nce); + } } - if (flags == IP2MAC_LOOKUP) { + if (op == IP2MAC_LOOKUP) { if (nce == NULL) { ip2m->ip2mac_err = ESRCH; goto done; } - mutex_enter(&nce->nce_lock); - if (NCE_ISREACHABLE(nce)) { - nce_ip2mac_response(ip2m, nce); + ncec = nce->nce_common; + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec) && + delta < (uint64_t)ill->ill_reachable_time) { + ncec_ip2mac_response(ip2m, ncec); ip2m->ip2mac_err = 0; } else { ip2m->ip2mac_err = ESRCH; } - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); goto done; } else { if (err != 0 && err != EEXIST) { @@ -247,13 +255,20 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, goto done; } } - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { + ncec = nce->nce_common; + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { ip2m->ip2mac_err = ESRCH; - } else if (!NCE_ISREACHABLE(nce) || - delta > (uint64_t)ill->ill_reachable_time) { - if (NCE_ISREACHABLE(nce)) { + } else { + if (NCE_ISREACHABLE(ncec)) { + if (NCE_MYADDR(ncec) || + delta < (uint64_t)ill->ill_reachable_time) { + ncec_ip2mac_response(ip2m, ncec); + ip2m->ip2mac_err = 0; + mutex_exit(&ncec->ncec_lock); + goto done; + } /* * Since we do not control the packet output * path for ip2mac() callers, we need to verify @@ -268,39 +283,48 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, * so that we can return the stale information but * also update the caller if the lladdr changes. 
*/ - nce->nce_rcnt = ill->ill_xmit_count; - nce->nce_state = ND_PROBE; - err = 0; /* treat this nce as a new one */ + ncec->ncec_rcnt = ill->ill_xmit_count; + ncec->ncec_state = ND_PROBE; + need_resolve = B_TRUE; /* reachable but very old nce */ + } else if (ncec->ncec_state == ND_INITIAL) { + need_resolve = B_TRUE; /* ND_INITIAL nce */ + ncec->ncec_state = ND_INCOMPLETE; } - if (nce->nce_rcnt > 0) { + /* + * NCE not known to be reachable in the recent past. We must + * reconfirm the information before returning it to the caller + */ + if (ncec->ncec_rcnt > 0) { /* - * Still resolving this nce, so we can - * queue the callback information in nce->nce_cb + * Still resolving this ncec, so we can queue the + * callback information in ncec->ncec_cb */ - ip2mid = nce_add_cb(nce, cb, cbarg); + ip2mid = ncec_add_cb(ncec, cb, cbarg); ip2m->ip2mac_err = EINPROGRESS; } else { /* - * Resolution failed. + * No more retransmits allowed -- resolution failed. */ ip2m->ip2mac_err = ESRCH; } - } else { - nce_ip2mac_response(ip2m, nce); - ip2m->ip2mac_err = 0; } - if (ip2m->ip2mac_err == EINPROGRESS && err != EEXIST) - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); done: + /* + * if NCE_ISREACHABLE(ncec) but very old, or if it is ND_INITIAL, + * trigger resolve. + */ + if (need_resolve) + ip_ndp_resolve(ncec); + if (nce != NULL) + nce_refrele(nce); netstack_rele(ns); ill_refrele(ill); return (ip2mid); } /* - * data passed to nce_walk for canceling outstanding callbacks. + * data passed to ncec_walk for canceling outstanding callbacks. */ typedef struct ip2mac_cancel_data_s { ip2mac_id_t ip2m_cancel_id; @@ -308,23 +332,23 @@ typedef struct ip2mac_cancel_data_s { } ip2mac_cancel_data_t; /* - * callback invoked for each active nce. If the ip2mac_id_t corresponds - * to an active nce_cb_t in the nce's callback list, we want to remove + * callback invoked for each active ncec. 
If the ip2mac_id_t corresponds + * to an active nce_cb_t in the ncec's callback list, we want to remove * the callback (if there are no walkers) or return EBUSY to the caller */ static int -ip2mac_cancel_callback(nce_t *nce, void *arg) +ip2mac_cancel_callback(ncec_t *ncec, void *arg) { ip2mac_cancel_data_t *ip2m_wdata = arg; - nce_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id; - nce_cb_t *nce_cb; + ncec_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id; + ncec_cb_t *ncec_cb; - if (ip2m_nce_cb->nce_cb_id != nce) + if (ip2m_nce_cb->ncec_cb_id != ncec) return (0); - mutex_enter(&nce->nce_lock); - if (list_is_empty(&nce->nce_cb)) { - mutex_exit(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + if (list_is_empty(&ncec->ncec_cb)) { + mutex_exit(&ncec->ncec_lock); return (0); } /* @@ -335,22 +359,22 @@ ip2mac_cancel_callback(nce_t *nce, void *arg) * across calls into another subsystem, especially if calls can * happen in either direction). */ - nce_cb = list_head(&nce->nce_cb); - for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) { - if (nce_cb != ip2m_nce_cb) + ncec_cb = list_head(&ncec->ncec_cb); + for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) { + if (ncec_cb != ip2m_nce_cb) continue; /* * If there are no walkers we can remove the nce_cb. * Otherwise the exiting walker will clean up. 
*/ - if (nce->nce_cb_walker_cnt == 0) { - list_remove(&nce->nce_cb, nce_cb); + if (ncec->ncec_cb_walker_cnt == 0) { + list_remove(&ncec->ncec_cb, ncec_cb); } else { ip2m_wdata->ip2m_cancel_err = EBUSY; } break; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return (0); } @@ -379,7 +403,7 @@ ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid) ip2m_wdata.ip2m_cancel_id = ip2mid; ip2m_wdata.ip2m_cancel_err = 0; - ndp_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst); + ncec_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst); /* * We may return EBUSY if a walk to dispatch callbacks is * in progress, in which case the caller needs to synchronize diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 38fe7b2562..ed54c08884 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -53,8 +53,8 @@ #include <sys/vtrace.h> #include <sys/isa_defs.h> #include <sys/atomic.h> -#include <sys/iphada.h> #include <sys/policy.h> +#include <sys/mac.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> @@ -79,9 +79,7 @@ #include <inet/tcp.h> #include <inet/tcp_impl.h> #include <inet/udp_impl.h> -#include <inet/sctp/sctp_impl.h> #include <inet/ipp_common.h> -#include <inet/ilb_ip.h> #include <inet/ip_multi.h> #include <inet/ip_if.h> @@ -89,7 +87,6 @@ #include <inet/ip_rts.h> #include <inet/ip_ndp.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> #include <inet/iptun/iptun_impl.h> @@ -110,8 +107,6 @@ /* Temporary; for CR 6451644 work-around */ #include <sys/ethernet.h> -extern int ip_squeue_flag; - /* * Naming conventions: * These rules should be judiciously applied @@ -179,154 +174,75 @@ const in6_addr_t ipv6_solicited_node_mcast = { 0x000002ffU, 0, 0x01000000U, 0x000000ffU }; #endif /* _BIG_ENDIAN */ -/* Leave room for ip_newroute to tack on the src and target addresses */ -#define OK_RESOLVER_MP_V6(mp) \ - ((mp) && 
((mp)->b_wptr - (mp)->b_rptr) >= (2 * IPV6_ADDR_LEN)) - -#define IP6_MBLK_OK 0 -#define IP6_MBLK_HDR_ERR 1 -#define IP6_MBLK_LEN_ERR 2 - -static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *, - boolean_t, zoneid_t); -static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, - const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); -static void icmp_redirect_v6(queue_t *, mblk_t *, ill_t *ill); -static int ip_bind_connected_v6(conn_t *, mblk_t **, uint8_t, in6_addr_t *, - uint16_t, const in6_addr_t *, ip6_pkt_t *, uint16_t, - boolean_t, boolean_t, cred_t *); -static boolean_t ip_bind_get_ire_v6(mblk_t **, ire_t *, const in6_addr_t *, - iulp_t *, ip_stack_t *); -static int ip_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, - const in6_addr_t *, uint16_t, boolean_t); -static void ip_fanout_proto_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ill_t *, uint8_t, uint_t, uint_t, boolean_t, zoneid_t); -static void ip_fanout_tcp_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ill_t *, uint_t, uint_t, boolean_t, zoneid_t); -static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, - ill_t *, ill_t *, uint_t, boolean_t, zoneid_t); -static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, - uint8_t *, uint_t, uint8_t, ip_stack_t *); -static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *, - ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); +static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *); +static void icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *); +static void icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *, + ip_recv_attr_t *); +static void icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *, + ip_recv_attr_t *); +static void icmp_send_redirect_v6(mblk_t *, in6_addr_t *, + in6_addr_t *, ip_recv_attr_t *); +static void icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *, + ip_recv_attr_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *); -static void 
ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, - conn_t *, int, int, zoneid_t); -static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, - ipif_t **); - -/* - * A template for an IPv6 AR_ENTRY_QUERY - */ -static areq_t ipv6_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t)+(2*IPV6_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ - ETHERTYPE_IPV6, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IPV6_ADDR_LEN, /* target addr_length */ - 0, /* flags */ - sizeof (areq_t) + IPV6_ADDR_LEN, /* sender addr offset */ - IPV6_ADDR_LEN, /* sender addr length */ - 6, /* xmit_count */ - 1000, /* (re)xmit_interval in milliseconds */ - 4 /* max # of requests to buffer */ - /* anything else filled in by the code */ -}; /* - * Handle IPv6 ICMP packets sent to us. Consume the mblk passed in. - * The message has already been checksummed and if needed, - * a copy has been made to be sent any interested ICMP client (conn) - * Note that this is different than icmp_inbound() which does the fanout - * to conn's as well as local processing of the ICMP packets. + * icmp_inbound_v6 deals with ICMP messages that are handled by IP. + * If the ICMP message is consumed by IP, i.e., it should not be delivered + * to any IPPROTO_ICMP raw sockets, then it returns NULL. + * Likewise, if the ICMP error is misformed (too short, etc), then it + * returns NULL. The caller uses this to determine whether or not to send + * to raw sockets. * * All error messages are passed to the matching transport stream. * - * Zones notes: - * The packet is only processed in the context of the specified zone: typically - * only this zone will reply to an echo request. This means that the caller must - * call icmp_inbound_v6() for each relevant zone. + * See comment for icmp_inbound_v4() on how IPsec is handled. 
*/ -static void -icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, - uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid, - mblk_t *dl_mp) +mblk_t * +icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; - ip6_t *ip6h; + ip6_t *ip6h; /* Outer header */ + int ip_hdr_length; /* Outer header length */ boolean_t interested; - in6_addr_t origsrc; - mblk_t *first_mp; - ipsec_in_t *ii; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill != NULL); - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - } + mblk_t *mp_ret = NULL; ip6h = (ip6_t *)mp->b_rptr; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); - if ((mp->b_wptr - mp->b_rptr) < (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("icmp_inbound_v6: pullupmsg failed\n")); - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - if (ipst->ips_icmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - NULL, ip6h, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } + /* Make sure ira_l2src is set for ndp_input */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); - /* - * On a labeled system, we have to check whether the zone itself is - * permitted to receive raw traffic. 
- */ - if (is_system_labeled()) { - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (!tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("icmp_inbound_v6: zone %d can't receive raw", - zoneid)); + ip_hdr_length = ira->ira_ip_hdr_length; + if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) { + if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return (NULL); + } + ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira); + if (ip6h == NULL) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); - freemsg(first_mp); - return; + freemsg(mp); + return (NULL); } } - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); + DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6); ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type, icmp6->icmp6_code)); - interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK); - /* Initiate IPPF processing here */ - if (IP6_IN_IPP(flags, ipst)) { - - /* - * If the ifindex changes due to SIOCSLIFINDEX - * packet may return to IP on the wrong ill. - */ - ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - if (mctl_present) { - freeb(first_mp); - } - return; - } - } + /* + * We will set "interested" to "true" if we should pass a copy to + * the transport i.e., if it is an error message. 
+ */ + interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: @@ -344,9 +260,9 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, break; case ICMP6_PACKET_TOO_BIG: - icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present, - zoneid); - return; + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs); + break; + case ICMP6_ECHO_REQUEST: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos); if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && @@ -362,93 +278,22 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, mblk_t *mp1; mp1 = copymsg(mp); - freemsg(mp); if (mp1 == NULL) { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - if (mctl_present) - freeb(first_mp); - return; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", + mp, ill); + freemsg(mp); + return (NULL); } + freemsg(mp); mp = mp1; ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); } - /* - * Turn the echo into an echo reply. - * Remove any extension headers (do not reverse a source route) - * and clear the flow id (keep traffic class for now). - */ - if (hdr_length != IPV6_HDR_LEN) { - int i; - - for (i = 0; i < IPV6_HDR_LEN; i++) - mp->b_rptr[hdr_length - i - 1] = - mp->b_rptr[IPV6_HDR_LEN - i - 1]; - mp->b_rptr += (hdr_length - IPV6_HDR_LEN); - ip6h = (ip6_t *)mp->b_rptr; - ip6h->ip6_nxt = IPPROTO_ICMPV6; - hdr_length = IPV6_HDR_LEN; - } - ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; icmp6->icmp6_type = ICMP6_ECHO_REPLY; - - ip6h->ip6_plen = - htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN)); - origsrc = ip6h->ip6_src; - /* - * Reverse the source and destination addresses. - * If the return address is a multicast, zero out the source - * (ip_wput_v6 will set an address). 
- */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ip6h->ip6_src = ipv6_all_zeros; - ip6h->ip6_dst = origsrc; - } else { - ip6h->ip6_src = ip6h->ip6_dst; - ip6h->ip6_dst = origsrc; - } - - /* set the hop limit */ - ip6h->ip6_hops = ipst->ips_ipv6_def_hops; - - /* - * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. - */ - icmp6->icmp6_cksum = ip6h->ip6_plen; - - if (!mctl_present) { - /* - * This packet should go out the same way as it - * came in i.e in clear. To make sure that global - * policy will not be applied to this in ip_wput, - * we attach a IPSEC_IN mp and clear ipsec_in_secure. - */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - } - if (!ipsec_in_to_out(first_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - put(WR(q), first_mp); - return; + icmp_send_reply_v6(mp, ip6h, icmp6, ira); + return (NULL); case ICMP6_ECHO_REPLY: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies); @@ -464,343 +309,478 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, case ND_NEIGHBOR_SOLICIT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits); - if (mctl_present) - freeb(first_mp); - /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(inill, mp, dl_mp); - return; + ndp_input(mp, ira); + return (NULL); case ND_NEIGHBOR_ADVERT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborAdvertisements); - if (mctl_present) - freeb(first_mp); - /* XXX may wish to pass first_mp up to ndp_input someday. 
*/ - ndp_input(inill, mp, dl_mp); - return; + ndp_input(mp, ira); + return (NULL); - case ND_REDIRECT: { + case ND_REDIRECT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects); if (ipst->ips_ipv6_ignore_redirect) break; - /* - * As there is no upper client to deliver, we don't - * need the first_mp any more. - */ - if (mctl_present) - freeb(first_mp); - if (!pullupmsg(mp, -1)) { - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - break; - } - icmp_redirect_v6(q, mp, ill); - return; - } + /* We now allow a RAW socket to receive this. */ + interested = B_TRUE; + break; /* * The next three icmp messages will be handled by MLD. * Pass all valid MLD packets up to any process(es) - * listening on a raw ICMP socket. MLD messages are - * freed by mld_input function. + * listening on a raw ICMP socket. */ case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_REDUCTION: - if (mctl_present) - freeb(first_mp); - mld_input(q, mp, ill); - return; + mp = mld_input(mp, ira); + return (mp); default: break; } - if (interested) { - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, - inill, mctl_present, zoneid); - } else { - freemsg(first_mp); - } -} + /* + * See if there is an ICMP client to avoid an extra copymsg/freemsg + * if there isn't one. + */ + if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) { + /* If there is an ICMP client and we want one too, copy it. */ -/* - * Process received IPv6 ICMP Packet too big. - * After updating any IRE it does the fanout to any matching transport streams. - * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. 
- */ -/* ARGSUSED */ -static void -icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, - boolean_t mctl_present, zoneid_t zoneid) -{ - ip6_t *ip6h; - ip6_t *inner_ip6h; - icmp6_t *icmp6; - uint16_t hdr_length; - uint32_t mtu; - ire_t *ire, *first_ire; - mblk_t *first_mp; - ip_stack_t *ipst = ill->ill_ipst; + if (!interested) { + /* Caller will deliver to RAW sockets */ + return (mp); + } + mp_ret = copymsg(mp); + if (mp_ret == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + } + } else if (!interested) { + /* Neither we nor raw sockets are interested. Drop packet now */ + freemsg(mp); + return (NULL); + } - first_mp = mp; - if (mctl_present) - mp = first_mp->b_cont; /* - * We must have exclusive use of the mblk to update the MTU - * in the packet. - * If not, we copy it. - * - * If there's an M_CTL present, we know that allocated first_mp - * earlier in this function, so we know first_mp has refcnt of one. + * ICMP error or redirect packet. Make sure we have enough of + * the header and that db_ref == 1 since we might end up modifying + * the packet. */ - ASSERT(!mctl_present || first_mp->b_datap->db_ref == 1); + if (mp->b_cont != NULL) { + if (ip_pullup(mp, -1, ira) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (mp_ret); + } + } + if (mp->b_datap->db_ref > 1) { mblk_t *mp1; mp1 = copymsg(mp); - freemsg(mp); if (mp1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - if (mctl_present) - freeb(first_mp); - return; + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + freemsg(mp); + return (mp_ret); } + freemsg(mp); mp = mp1; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; } + + /* + * In case mp has changed, verify the message before any further + * processes. 
+ */ ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return (mp_ret); + } - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - ASSERT((size_t)(mp->b_wptr - mp->b_rptr) >= hdr_length + ICMP6_MINLEN); - inner_ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */ - if ((uchar_t *)&inner_ip6h[1] > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)&inner_ip6h[1] - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; + switch (icmp6->icmp6_type) { + case ND_REDIRECT: + icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira); + break; + case ICMP6_PACKET_TOO_BIG: + /* Update DCE and adjust MTU is icmp header if needed */ + icmp_inbound_too_big_v6(icmp6, ira); + /* FALLTHRU */ + default: + icmp_inbound_error_fanout_v6(mp, icmp6, ira); + break; + } + + return (mp_ret); +} + +/* + * Send an ICMP echo reply. + * The caller has already updated the payload part of the packet. + * We handle the ICMP checksum, IP source address selection and feed + * the packet into ip_output_simple. + */ +static void +icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + uint_t ip_hdr_length = ira->ira_ip_hdr_length; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; + in6_addr_t origsrc; + + /* + * Remove any extension headers (do not reverse a source route) + * and clear the flow id (keep traffic class for now). 
+ */ + if (ip_hdr_length != IPV6_HDR_LEN) { + int i; + + for (i = 0; i < IPV6_HDR_LEN; i++) { + mp->b_rptr[ip_hdr_length - i - 1] = + mp->b_rptr[IPV6_HDR_LEN - i - 1]; } + mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN); ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)&mp->b_rptr[hdr_length]; - inner_ip6h = (ip6_t *)&icmp6[1]; + ip6h->ip6_nxt = IPPROTO_ICMPV6; + i = ntohs(ip6h->ip6_plen); + i -= (ip_hdr_length - IPV6_HDR_LEN); + ip6h->ip6_plen = htons(i); + ip_hdr_length = IPV6_HDR_LEN; + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp)); } + ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; + + /* Reverse the source and destination addresses. */ + origsrc = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = origsrc; + + /* set the hop limit */ + ip6h->ip6_hops = ipst->ips_ipv6_def_hops; /* - * For link local destinations matching simply on IRE type is not - * sufficient. Same link local addresses for different ILL's is - * possible. + * Prepare for checksum by putting icmp length in the icmp + * checksum field. 
The checksum is calculated in ip_output */ - if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) { - first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, - IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (first_ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("icmp_inbound_too_big_v6:" - "no ire for dst %s\n", AF_INET6, - &inner_ip6h->ip6_dst); - } - freemsg(first_mp); - return; - } + icmp6->icmp6_cksum = ip6h->ip6_plen; - mtu = ntohl(icmp6->icmp6_mtu); - rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); - for (ire = first_ire; ire != NULL && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &inner_ip6h->ip6_dst); - ire = ire->ire_next) { - mutex_enter(&ire->ire_lock); - if (mtu < IPV6_MIN_MTU) { - ip1dbg(("Received mtu less than IPv6 " - "min mtu %d: %d\n", IPV6_MIN_MTU, mtu)); - mtu = IPV6_MIN_MTU; - /* - * If an mtu less than IPv6 min mtu is received, - * we must include a fragment header in - * subsequent packets. - */ - ire->ire_frag_flag |= IPH_FRAG_HDR; - } - ip1dbg(("Received mtu from router: %d\n", mtu)); - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; - } - /* Record the new max frag size for the ULP. */ - if (ire->ire_frag_flag & IPH_FRAG_HDR) { - /* - * If we need a fragment header in every packet - * (above case or multirouting), make sure the - * ULP takes it into account when computing the - * payload size. 
- */ - icmp6->icmp6_mtu = htonl(ire->ire_max_frag - - sizeof (ip6_frag_t)); - } else { - icmp6->icmp6_mtu = htonl(ire->ire_max_frag); - } - mutex_exit(&ire->ire_lock); - } - rw_exit(&first_ire->ire_bucket->irb_lock); - ire_refrele(first_ire); - } else { - irb_t *irb = NULL; + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* - * for non-link local destinations we match only on the IRE type + * This packet should go out the same way as it + * came in i.e in clear, independent of the IPsec + * policy for transmitting packets. */ - ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, - IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("icmp_inbound_too_big_v6:" - "no ire for dst %s\n", - AF_INET6, &inner_ip6h->ip6_dst); - } - freemsg(first_mp); + ixas.ixa_flags |= IXAF_NO_IPSEC; + } else { + if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note: mp already consumed and ip_drop_packet done */ return; } - irb = ire->ire_bucket; - ire_refrele(ire); - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, - &inner_ip6h->ip6_dst)) { - mtu = ntohl(icmp6->icmp6_mtu); - mutex_enter(&ire->ire_lock); - if (mtu < IPV6_MIN_MTU) { - ip1dbg(("Received mtu less than IPv6" - "min mtu %d: %d\n", - IPV6_MIN_MTU, mtu)); - mtu = IPV6_MIN_MTU; - /* - * If an mtu less than IPv6 min mtu is - * received, we must include a fragment - * header in subsequent packets. 
- */ - ire->ire_frag_flag |= IPH_FRAG_HDR; - } + } - ip1dbg(("Received mtu from router: %d\n", mtu)); - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; - } - /* Record the new max frag size for the ULP. */ - if (ire->ire_frag_flag & IPH_FRAG_HDR) { - /* - * If we need a fragment header in - * every packet (above case or - * multirouting), make sure the ULP - * takes it into account when computing - * the payload size. - */ - icmp6->icmp6_mtu = - htonl(ire->ire_max_frag - - sizeof (ip6_frag_t)); - } else { - icmp6->icmp6_mtu = - htonl(ire->ire_max_frag); - } - mutex_exit(&ire->ire_lock); - } - } - rw_exit(&irb->irb_lock); + /* Was the destination (now source) link-local? Send out same group */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + if (IS_UNDER_IPMP(ill)) + ixas.ixa_scopeid = ill_get_upper_ifindex(ill); + else + ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex; + } + + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + /* + * Not one or our addresses (IRE_LOCALs), thus we let + * ip_output_simple pick the source. + */ + ip6h->ip6_src = ipv6_all_zeros; + ixas.ixa_flags |= IXAF_SET_SOURCE; } - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill, - mctl_present, zoneid); + + /* Should we send using dce_pmtu? */ + if (ipst->ips_ipv6_icmp_return_pmtu) + ixas.ixa_flags |= IXAF_PMTU_DISCOVERY; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + } /* - * Fanout for ICMPv6 errors containing IP-in-IPv6 packets. Returns B_TRUE if a - * tunnel consumed the message, and B_FALSE otherwise. + * Verify the ICMP messages for either for ICMP error or redirect packet. + * The caller should have fully pulled up the message. If it's a redirect + * packet, only basic checks on IP header will be done; otherwise, verify + * the packet by looking at the included ULP header. 
+ * + * Called before icmp_inbound_error_fanout_v6 is called. */ static boolean_t -icmp_inbound_iptun_fanout_v6(mblk_t *first_mp, ip6_t *rip6h, ill_t *ill, - ip_stack_t *ipst) +icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira) { - conn_t *connp; + ill_t *ill = ira->ira_ill; + uint16_t hdr_length; + uint8_t *nexthdrp; + uint8_t nexthdr; + ip_stack_t *ipst = ill->ill_ipst; + conn_t *connp; + ip6_t *ip6h; /* Inner header */ - if ((connp = ipcl_iptun_classify_v6(&rip6h->ip6_src, &rip6h->ip6_dst, - ipst)) == NULL) - return (B_FALSE); + ip6h = (ip6_t *)&icmp6[1]; + if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr) + goto truncated; + + if (icmp6->icmp6_type == ND_REDIRECT) { + hdr_length = sizeof (nd_redirect_t); + } else { + if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION)) + goto discard_pkt; + hdr_length = IPV6_HDR_LEN; + } + + if ((uchar_t *)ip6h + hdr_length > mp->b_wptr) + goto truncated; + + /* + * Stop here for ICMP_REDIRECT. + */ + if (icmp6->icmp6_type == ND_REDIRECT) + return (B_TRUE); + + /* + * ICMP errors only. + */ + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) + goto discard_pkt; + nexthdr = *nexthdrp; + + /* Try to pass the ICMP message to clients who need it */ + switch (nexthdr) { + case IPPROTO_UDP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_TCP: { + tcpha_t *tcpha; + + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + + tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length); + /* + * With IPMP we need to match across group, which we do + * since we have the upper ill from ira_ill. 
+ */ + connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN, + ill->ill_phyint->phyint_ifindex, ipst); + if (connp == NULL) + goto discard_pkt; + + if ((connp->conn_verifyicmp != NULL) && + !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) { + CONN_DEC_REF(connp); + goto discard_pkt; + } + CONN_DEC_REF(connp); + break; + } + case IPPROTO_SCTP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_ESP: + case IPPROTO_AH: + break; + case IPPROTO_ENCAP: + case IPPROTO_IPV6: { + /* Look for self-encapsulated packets that caused an error */ + ip6_t *in_ip6h; + + in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length); + if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ? + sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr) + goto truncated; + break; + } + default: + break; + } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, first_mp, NULL); - CONN_DEC_REF(connp); return (B_TRUE); + +discard_pkt: + /* Bogus ICMP error. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + return (B_FALSE); + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); + return (B_FALSE); } /* - * Fanout received ICMPv6 error packets to the transports. - * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. + * Process received IPv6 ICMP Packet too big. + * The caller is responsible for validating the packet before passing it in + * and also to fanout the ICMP error to any matching transport conns. Assumes + * the message has been fully pulled up. + * + * Before getting here, the caller has called icmp_inbound_verify_v6() + * that should have verified with ULP to prevent undoing the changes we're + * going to make to DCE. 
For example, TCP might have verified that the packet + * which generated error is in the send window. + * + * In some cases modified this MTU in the ICMP header packet; the caller + * should pass to the matching ULP after this returns. */ -void -icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present, - zoneid_t zoneid) +static void +icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira) { - uint16_t *up; /* Pointer to ports in ULP header */ - uint32_t ports; /* reversed ports for fanout */ - ip6_t rip6h; /* With reversed addresses */ - uint16_t hdr_length; - uint8_t *nexthdrp; - uint8_t nexthdr; - mblk_t *first_mp; - ipsec_in_t *ii; - tcpha_t *tcpha; - conn_t *connp; + uint32_t mtu; + dce_t *dce; + ill_t *ill = ira->ira_ill; /* Upper ill if IPMP */ ip_stack_t *ipst = ill->ill_ipst; + int old_max_frag; + in6_addr_t final_dst; + ip6_t *ip6h; /* Inner IP header */ - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); + /* Caller has already pulled up everything. */ + ip6h = (ip6_t *)&icmp6[1]; + final_dst = ip_get_dst_v6(ip6h, NULL, NULL); - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + /* + * For link local destinations matching simply on address is not + * sufficient. Same link local addresses for different ILL's is + * possible. 
+ */ + if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) { + dce = dce_lookup_and_add_v6(&final_dst, + ill->ill_phyint->phyint_ifindex, ipst); } else { - ii = NULL; + dce = dce_lookup_and_add_v6(&final_dst, 0, ipst); + } + if (dce == NULL) { + /* Couldn't add a unique one - ENOMEM */ + if (ip_debug > 2) { + /* ip1dbg */ + pr_addr_dbg("icmp_inbound_too_big_v6:" + "no dce for dst %s\n", AF_INET6, + &final_dst); + } + return; } - hdr_length = (uint16_t)((uchar_t *)icmp6 - (uchar_t *)ip6h); - ASSERT((size_t)(mp->b_wptr - (uchar_t *)icmp6) >= ICMP6_MINLEN); + mtu = ntohl(icmp6->icmp6_mtu); + mutex_enter(&dce->dce_lock); + if (dce->dce_flags & DCEF_PMTU) + old_max_frag = dce->dce_pmtu; + else + old_max_frag = ill->ill_mtu; + + if (mtu < IPV6_MIN_MTU) { + ip1dbg(("Received mtu less than IPv6 " + "min mtu %d: %d\n", IPV6_MIN_MTU, mtu)); + mtu = IPV6_MIN_MTU; + /* + * If an mtu less than IPv6 min mtu is received, + * we must include a fragment header in + * subsequent packets. + */ + dce->dce_flags |= DCEF_TOO_SMALL_PMTU; + } else { + dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; + } + ip1dbg(("Received mtu from router: %d\n", mtu)); + dce->dce_pmtu = MIN(old_max_frag, mtu); + + /* Prepare to send the new max frag size for the ULP. */ + if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) { + /* + * If we need a fragment header in every packet + * (above case or multirouting), make sure the + * ULP takes it into account when computing the + * payload size. + */ + icmp6->icmp6_mtu = htonl(dce->dce_pmtu - sizeof (ip6_frag_t)); + } else { + icmp6->icmp6_mtu = htonl(dce->dce_pmtu); + } + /* We now have a PMTU for sure */ + dce->dce_flags |= DCEF_PMTU; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); /* - * Need to pullup everything in order to use - * ip_hdr_length_nexthdr_v6() + * After dropping the lock the new value is visible to everyone. + * Then we bump the generation number so any cached values reinspect + * the dce_t. 
*/ - if (mp->b_cont != NULL) { - if (!pullupmsg(mp, -1)) { - ip1dbg(("icmp_inbound_error_fanout_v6: " - "pullupmsg failed\n")); - goto drop_pkt; - } - ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - } + dce_increment_generation(dce); + dce_refrele(dce); +} - ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto drop_pkt; +/* + * Fanout received ICMPv6 error packets to the transports. + * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. + * + * The caller must have called icmp_inbound_verify_v6. + */ +void +icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira) +{ + uint16_t *up; /* Pointer to ports in ULP header */ + uint32_t ports; /* reversed ports for fanout */ + ip6_t rip6h; /* With reversed addresses */ + ip6_t *ip6h; /* Inner IP header */ + uint16_t hdr_length; /* Inner IP header length */ + uint8_t *nexthdrp; + uint8_t nexthdr; + tcpha_t *tcpha; + conn_t *connp; + ill_t *ill = ira->ira_ill; /* Upper in the case of IPMP */ + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + + /* Caller has already pulled up everything. */ + ip6h = (ip6_t *)&icmp6[1]; + ASSERT(mp->b_cont == NULL); + ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) goto drop_pkt; nexthdr = *nexthdrp; - - /* Set message type, must be done after pullups */ - mp->b_datap->db_type = M_CTL; + ira->ira_protocol = nexthdr; /* * We need a separate IP header with the source and destination @@ -814,174 +794,128 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, /* Try to pass the ICMP message to clients who need it */ switch (nexthdr) { case IPPROTO_UDP: { - /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * UDP header to get the port information. 
- */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - break; - } /* Attempt to find a client stream based on port. */ up = (uint16_t *)((uchar_t *)ip6h + hdr_length); - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill, - IP6_NO_IPPOLICY, mctl_present, zoneid); + /* Note that we send error to all matches. */ + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_TCP: { /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * the TCP header to get the port information. - */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - break; - } - - /* * Attempt to find a client stream based on port. * Note that we do a reverse lookup since the header is * in the form we sent it out. */ - tcpha = (tcpha_t *)((char *)ip6h + hdr_length); + tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length); + /* + * With IPMP we need to match across group, which we do + * since we have the upper ill from ira_ill. 
+ */ connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst); if (connp == NULL) { goto drop_pkt; } - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, - SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR); + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + ira->ira_flags |= IRAF_ICMP_ERROR; + if (IPCL_IS_TCP(connp)) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recvicmp, connp, ira, SQ_FILL, + SQTAG_TCP6_INPUT_ICMP_ERR); + } else { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + ill_t *rill = ira->ira_rill; + + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + } + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_SCTP: - /* - * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of - * transport header to get the port information. - */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_SCTP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ip6h + hdr_length + - ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) { - goto drop_pkt; - } - } - up = (uint16_t *)((uchar_t *)ip6h + hdr_length); + /* Find a SCTP client stream for this packet. */ ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0, - mctl_present, IP6_NO_IPPOLICY, zoneid); - return; - case IPPROTO_ESP: - case IPPROTO_AH: { - int ipsec_rc; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - /* - * We need a IPSEC_IN in the front to fanout to AH/ESP. 
- * We will re-use the IPSEC_IN if it is already present as - * AH/ESP will not affect any fields in the IPSEC_IN for - * ICMP errors. If there is no IPSEC_IN, allocate a new - * one and attach it in the front. - */ - if (ii != NULL) { - /* - * ip_fanout_proto_again converts the ICMP errors - * that come back from AH/ESP to M_DATA so that - * if it is non-AH/ESP and we do a pullupmsg in - * this function, it would work. Convert it back - * to M_CTL before we send up as this is a ICMP - * error. This could have been generated locally or - * by some router. Validate the inner IPSEC - * headers. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - first_mp->b_cont->b_datap->db_type = M_CTL; - } else { - /* - * IPSEC_IN is not present. We attach a ipsec_in - * message and send up to IPSEC for validating - * and removing the IPSEC headers. Clear - * ipsec_in_secure so that when we return - * from IPSEC, we don't mistakenly think that this - * is a secure packet came from the network. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. 
- */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - ASSERT(ill != NULL); - if (first_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - mp->b_datap->db_type = M_CTL; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - } + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_sctp(mp, NULL, &rip6h, ports, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; + return; + case IPPROTO_ESP: + case IPPROTO_AH: if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); + ip_proto_not_sup(mp, ira); return; } if (nexthdr == IPPROTO_ESP) - ipsec_rc = ipsecesp_icmp_error(first_mp); + mp = ipsecesp_icmp_error(mp, ira); else - ipsec_rc = ipsecah_icmp_error(first_mp); - if (ipsec_rc == IPSEC_STATUS_FAILED) + mp = ipsecah_icmp_error(mp, ira); + if (mp == NULL) return; - ip_fanout_proto_again(first_mp, ill, inill, NULL); - return; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if ((uint8_t *)ip6h + hdr_length + - (nexthdr == IPPROTO_ENCAP ? sizeof (ipha_t) : - sizeof (ip6_t)) > mp->b_wptr) { + /* Just in case ipsec didn't preserve the NULL b_cont */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) + goto drop_pkt; + } + + /* + * If succesful, the mp has been modified to not include + * the ESP/AH header so we can fanout to the ULP's icmp + * error handler. + */ + if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN) goto drop_pkt; + + ip6h = (ip6_t *)mp->b_rptr; + /* Don't call hdr_length_v6() unless you have to. */ + if (ip6h->ip6_nxt != IPPROTO_ICMPV6) + hdr_length = ip_hdr_length_v6(mp, ip6h); + else + hdr_length = IPV6_HDR_LEN; + + /* Verify the modified message before any further processes. 
*/ + icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return; } - if (nexthdr == IPPROTO_ENCAP || - !IN6_ARE_ADDR_EQUAL( - &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_src, - &ip6h->ip6_src) || - !IN6_ARE_ADDR_EQUAL( - &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_dst, - &ip6h->ip6_dst)) { - /* - * For tunnels that have used IPsec protection, - * we need to adjust the MTU to take into account - * the IPsec overhead. - */ - if (ii != NULL) { - icmp6->icmp6_mtu = htonl( - ntohl(icmp6->icmp6_mtu) - - ipsec_in_extra_length(first_mp)); - } - } else { + icmp_inbound_error_fanout_v6(mp, icmp6, ira); + return; + + case IPPROTO_IPV6: { + /* Look for self-encapsulated packets that caused an error */ + ip6_t *in_ip6h; + + in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length); + + if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) && + IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) { /* * Self-encapsulated case. As in the ipv4 case, * we need to strip the 2nd IP header. Since mp @@ -989,126 +923,124 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * the 3rd header + data over the 2nd header. */ uint16_t unused_len; - ip6_t *inner_ip6h = (ip6_t *) - ((uchar_t *)ip6h + hdr_length); /* * Make sure we don't do recursion more than once. */ - if (!ip_hdr_length_nexthdr_v6(mp, inner_ip6h, + if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h, &unused_len, &nexthdrp) || *nexthdrp == IPPROTO_IPV6) { goto drop_pkt; } /* - * We are about to modify the packet. Make a copy if - * someone else has a reference to it. 
- */ - if (DB_REF(mp) > 1) { - mblk_t *mp1; - uint16_t icmp6_offset; - - mp1 = copymsg(mp); - if (mp1 == NULL) { - goto drop_pkt; - } - icmp6_offset = (uint16_t) - ((uchar_t *)icmp6 - mp->b_rptr); - freemsg(mp); - mp = mp1; - - icmp6 = (icmp6_t *)(mp->b_rptr + icmp6_offset); - ip6h = (ip6_t *)&icmp6[1]; - inner_ip6h = (ip6_t *) - ((uchar_t *)ip6h + hdr_length); - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - } - - /* - * Need to set db_type back to M_DATA before - * refeeding mp into this function. - */ - DB_TYPE(mp) = M_DATA; - - /* * Copy the 3rd header + remaining data on top * of the 2nd header. */ - bcopy(inner_ip6h, ip6h, - mp->b_wptr - (uchar_t *)inner_ip6h); + bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h); /* * Subtract length of the 2nd header. */ mp->b_wptr -= hdr_length; + ip6h = (ip6_t *)mp->b_rptr; + /* Don't call hdr_length_v6() unless you have to. */ + if (ip6h->ip6_nxt != IPPROTO_ICMPV6) + hdr_length = ip_hdr_length_v6(mp, ip6h); + else + hdr_length = IPV6_HDR_LEN; + + /* + * Verify the modified message before any further + * processes. + */ + icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return; + } + /* * Now recurse, and see what I _really_ should be * doing here. */ - icmp_inbound_error_fanout_v6(q, first_mp, - (ip6_t *)mp->b_rptr, icmp6, ill, inill, - mctl_present, zoneid); + icmp_inbound_error_fanout_v6(mp, icmp6, ira); return; } - if (icmp_inbound_iptun_fanout_v6(first_mp, &rip6h, ill, ipst)) + /* FALLTHRU */ + } + case IPPROTO_ENCAP: + if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src, + &rip6h.ip6_dst, ipst)) != NULL) { + ira->ira_flags |= IRAF_ICMP_ERROR; + connp->conn_recvicmp(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; + } /* - * No IP tunnel is associated with this error. Perhaps a raw - * socket will want it. 
+ * No IP tunnel is interested, fallthrough and see + * if a raw socket will want it. */ /* FALLTHRU */ default: - ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0, - IP6_NO_IPPOLICY, mctl_present, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ASSERT(ira->ira_protocol == nexthdr); + ip_fanout_proto_v6(mp, &rip6h, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* NOTREACHED */ drop_pkt: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n")); - freemsg(first_mp); + freemsg(mp); } /* * Process received IPv6 ICMP Redirect messages. + * Assumes the caller has verified that the headers are in the pulled up mblk. + * Consumes mp. */ /* ARGSUSED */ static void -icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) +icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd, + ip_recv_attr_t *ira) { - ip6_t *ip6h; - uint16_t hdr_length; - nd_redirect_t *rd; - ire_t *ire; - ire_t *prev_ire; + ire_t *ire, *nire; + ire_t *prev_ire = NULL; ire_t *redir_ire; in6_addr_t *src, *dst, *gateway; nd_opt_hdr_t *opt; nce_t *nce; - int nce_flags = 0; + int ncec_flags = 0; int err = 0; boolean_t redirect_to_router = B_FALSE; int len; int optlen; - iulp_t ulp_info = { 0 }; - ill_t *prev_ire_ill; - ipif_t *ipif; + ill_t *ill = ira->ira_rill; + ill_t *rill = ira->ira_rill; ip_stack_t *ipst = ill->ill_ipst; - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; + /* + * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill + * and make it be the IPMP upper so avoid being confused by a packet + * addressed to a unicast address on a different ill. 
+ */ + if (IS_UNDER_IPMP(rill)) { + rill = ipmp_ill_hold_ipmp_ill(rill); + if (rill == NULL) { + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); + ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill", + mp, ill); + freemsg(mp); + return; + } + ASSERT(rill != ira->ira_rill); + } - rd = (nd_redirect_t *)&mp->b_rptr[hdr_length]; - len = mp->b_wptr - mp->b_rptr - hdr_length; + len = mp->b_wptr - (uchar_t *)rd; src = &ip6h->ip6_src; dst = &rd->nd_rd_dst; gateway = &rd->nd_rd_target; @@ -1121,37 +1053,35 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) (IN6_IS_ADDR_V4MAPPED(dst)) || (IN6_IS_ADDR_MULTICAST(dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill); + goto fail_redirect; } if (!(IN6_IS_ADDR_LINKLOCAL(gateway) || IN6_ARE_ADDR_EQUAL(gateway, dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway", + mp, ill); + goto fail_redirect; } - if (len > sizeof (nd_redirect_t)) { - if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], - len - sizeof (nd_redirect_t))) { + optlen = len - sizeof (nd_redirect_t); + if (optlen != 0) { + if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - options", + mp, ill); + goto fail_redirect; } } if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) { redirect_to_router = B_TRUE; - nce_flags |= NCE_F_ISROUTER; + ncec_flags |= NCE_F_ISROUTER; + } else { + gateway = dst; /* Add nce for dst */ } - /* ipif will be refreleased afterwards */ - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - freemsg(mp); - return; - } /* * Verify that the IP source address of the redirect is @@ -1160,10 +1090,11 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * Also, Make sure we had a route for the dest in question and * that route 
was pointing to the old gateway (the source of the * redirect packet.) + * Note: this merely says that there is some IRE which matches that + * gateway; not that the longest match matches that gateway. */ - - prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES, - NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst); + prev_ire = ire_ftable_lookup_v6(dst, 0, src, 0, rill, + ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL, 0, ipst, NULL); /* * Check that @@ -1171,92 +1102,44 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * old gateway is still directly reachable */ if (prev_ire == NULL || - prev_ire->ire_type == IRE_LOCAL) { + (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || + (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - ipif_refrele(ipif); + ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill); goto fail_redirect; } - prev_ire_ill = ire_to_ill(prev_ire); - ASSERT(prev_ire_ill != NULL); - if (prev_ire_ill->ill_flags & ILLF_NONUD) - nce_flags |= NCE_F_NONUD; - - /* - * Should we use the old ULP info to create the new gateway? From - * a user's perspective, we should inherit the info so that it - * is a "smooth" transition. If we do not do that, then new - * connections going thru the new gateway will have no route metrics, - * which is counter-intuitive to user. From a network point of - * view, this may or may not make sense even though the new gateway - * is still directly connected to us so the route metrics should not - * change much. - * - * But if the old ire_uinfo is not initialized, we do another - * recursive lookup on the dest using the new gateway. There may - * be a route to that. If so, use it to initialize the redirect - * route. - */ - if (prev_ire->ire_uinfo.iulp_set) { - bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - } else if (redirect_to_router) { - /* - * Only do the following if the redirection is really to - * a router. 
- */ - ire_t *tmp_ire; - ire_t *sire; - tmp_ire = ire_ftable_lookup_v6(dst, 0, gateway, 0, NULL, &sire, - ALL_ZONES, 0, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT), - ipst); - if (sire != NULL) { - bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - ASSERT(tmp_ire != NULL); - ire_refrele(tmp_ire); - ire_refrele(sire); - } else if (tmp_ire != NULL) { - bcopy(&tmp_ire->ire_uinfo, &ulp_info, - sizeof (iulp_t)); - ire_refrele(tmp_ire); - } - } + ASSERT(prev_ire->ire_ill != NULL); + if (prev_ire->ire_ill->ill_flags & ILLF_NONUD) + ncec_flags |= NCE_F_NONUD; - optlen = mp->b_wptr - mp->b_rptr - hdr_length - sizeof (nd_redirect_t); opt = (nd_opt_hdr_t *)&rd[1]; opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* don't match across illgrp */ + err = nce_lookup_then_add_v6(rill, (uchar_t *)&opt[1], /* Link layer address */ - gateway, - &ipv6_all_ones, /* prefix mask */ - &ipv6_all_zeros, /* Mapping mask */ - 0, - nce_flags, - ND_STALE, - &nce); + rill->ill_phys_addr_length, + gateway, ncec_flags, ND_STALE, &nce); switch (err) { case 0: - NCE_REFRELE(nce); + nce_refrele(nce); break; case EEXIST: /* * Check to see if link layer address has changed and - * process the nce_state accordingly. + * process the ncec_state accordingly. 
*/ - ndp_process(nce, (uchar_t *)&opt[1], 0, B_FALSE); - NCE_REFRELE(nce); + nce_process(nce->nce_common, + (uchar_t *)&opt[1], 0, B_FALSE); + nce_refrele(nce); break; default: ip1dbg(("icmp_redirect_v6: NCE create failed %d\n", err)); - ipif_refrele(ipif); goto fail_redirect; } } if (redirect_to_router) { - /* icmp_redirect_ok_v6() must have already verified this */ ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway)); /* @@ -1266,65 +1149,68 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) ire = ire_create_v6( dst, &ipv6_all_ones, /* mask */ - &prev_ire->ire_src_addr_v6, /* source addr */ gateway, /* gateway addr */ - &prev_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* no rfq */ - NULL, /* no stq */ IRE_HOST, - prev_ire->ire_ipif, - NULL, - 0, - 0, + prev_ire->ire_ill, + ALL_ZONES, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), - &ulp_info, - NULL, NULL, ipst); } else { - queue_t *stq; - - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; + ipif_t *ipif; + in6_addr_t gw; /* * Just create an on link entry, i.e. interface route. + * The gateway field is our link-local on the ill. */ + mutex_enter(&rill->ill_lock); + for (ipif = rill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) && + IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) + break; + } + if (ipif == NULL) { + /* We have no link-local address! 
*/ + mutex_exit(&rill->ill_lock); + goto fail_redirect; + } + gw = ipif->ipif_v6lcl_addr; + mutex_exit(&rill->ill_lock); + ire = ire_create_v6( dst, /* gateway == dst */ &ipv6_all_ones, /* mask */ - &prev_ire->ire_src_addr_v6, /* source addr */ - &ipv6_all_zeros, /* gateway addr */ - &prev_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* ire rfq */ - stq, /* ire stq */ - ipif->ipif_net_type, /* IF_[NO]RESOLVER */ - prev_ire->ire_ipif, - &ipv6_all_ones, - 0, - 0, + &gw, /* gateway addr */ + rill->ill_net_type, /* IF_[NO]RESOLVER */ + prev_ire->ire_ill, + ALL_ZONES, (RTF_DYNAMIC | RTF_HOST), - &ulp_info, - NULL, NULL, ipst); } - /* Release reference from earlier ipif_get_next_ipif() */ - ipif_refrele(ipif); - if (ire == NULL) goto fail_redirect; - if (ire_add(&ire, NULL, NULL, NULL, B_FALSE) == 0) { + nire = ire_add(ire); + /* Check if it was a duplicate entry */ + if (nire != NULL && nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + nire = NULL; + } + ire = nire; + if (ire != NULL) { + ire_refrele(ire); /* Held in ire_add */ /* tell routing sockets that we received a redirect */ ip_rts_change_v6(RTM_REDIRECT, &rd->nd_rd_dst, &rd->nd_rd_target, - &ipv6_all_ones, 0, &ire->ire_src_addr_v6, + &ipv6_all_ones, 0, src, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst); @@ -1334,10 +1220,9 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * modifying an existing redirect. 
*/ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, - ire->ire_ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); - - ire_refrele(ire); /* Held in ire_add_v6 */ + prev_ire->ire_ill, ALL_ZONES, NULL, + (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, + NULL); if (redir_ire != NULL) { if (redir_ire->ire_flags & RTF_DYNAMIC) @@ -1346,8 +1231,6 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) } } - if (prev_ire->ire_type == IRE_CACHE) - ire_delete(prev_ire); ire_refrele(prev_ire); prev_ire = NULL; @@ -1355,101 +1238,8 @@ fail_redirect: if (prev_ire != NULL) ire_refrele(prev_ire); freemsg(mp); -} - -static ill_t * -ip_queue_to_ill_v6(queue_t *q, ip_stack_t *ipst) -{ - ill_t *ill; - - ASSERT(WR(q) == q); - - if (q->q_next != NULL) { - ill = (ill_t *)q->q_ptr; - if (ILL_CAN_LOOKUP(ill)) - ill_refhold(ill); - else - ill = NULL; - } else { - ill = ill_lookup_on_name(ipif_loopback_name, B_FALSE, B_TRUE, - NULL, NULL, NULL, NULL, NULL, ipst); - } - if (ill == NULL) - ip0dbg(("ip_queue_to_ill_v6: no ill\n")); - return (ill); -} - -/* - * Assigns an appropriate source address to the packet. - * If origdst is one of our IP addresses that use it as the source. - * If the queue is an ill queue then select a source from that ill. - * Otherwise pick a source based on a route lookup back to the origsrc. - * - * src is the return parameter. Returns a pointer to src or NULL if failure. 
- */ -static in6_addr_t * -icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, - in6_addr_t *src, zoneid_t zoneid, ip_stack_t *ipst) -{ - ill_t *ill; - ire_t *ire; - ipif_t *ipif; - - ASSERT(!(wq->q_flag & QREADR)); - if (wq->q_next != NULL) { - ill = (ill_t *)wq->q_ptr; - } else { - ill = NULL; - } - - ire = ire_route_lookup_v6(origdst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), - NULL, NULL, zoneid, NULL, (MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY), - ipst); - if (ire != NULL) { - /* Destined to one of our addresses */ - *src = *origdst; - ire_refrele(ire); - return (src); - } - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - if (ill == NULL) { - /* What is the route back to the original source? */ - ire = ire_route_lookup_v6(origsrc, 0, 0, 0, - NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - return (NULL); - } - ASSERT(ire->ire_ipif != NULL); - ill = ire->ire_ipif->ipif_ill; - ire_refrele(ire); - } - ipif = ipif_select_source_v6(ill, origsrc, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (ipif != NULL) { - *src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - return (src); - } - /* - * Unusual case - can't find a usable source address to reach the - * original source. Use what in the route to the source. - */ - ire = ire_route_lookup_v6(origsrc, 0, 0, 0, - NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - return (NULL); - } - ASSERT(ire != NULL); - *src = ire->ire_src_addr_v6; - ire_refrele(ire); - return (src); + if (rill != ira->ira_rill) + ill_refrele(rill); } /* @@ -1459,17 +1249,12 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, * Note: assumes that icmp_pkt_err_ok_v6 has been called to * verify that an icmp error packet can be sent. 
* - * If q is an ill write side queue (which is the case when packets - * arrive from ip_rput) then ip_wput code will ensure that packets to - * link-local destinations are sent out that ill. - * * If v6src_ptr is set use it as a source. Otherwise select a reasonable * source address (see above function). */ static void -icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, - const in6_addr_t *v6src_ptr, boolean_t mctl_present, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len, + const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira) { ip6_t *ip6h; in6_addr_t v6dst; @@ -1477,98 +1262,82 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, size_t msg_len; mblk_t *mp1; icmp6_t *icmp6; - ill_t *ill; in6_addr_t v6src; - mblk_t *ipsec_mp; - ipsec_out_t *io; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; - ill = ip_queue_to_ill_v6(q, ipst); - if (ill == NULL) { - freemsg(mp); - return; + ip6h = (ip6_t *)mp->b_rptr; + + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + /* + * If the source of the original packet was link-local, then + * make sure we send on the same ill (group) as we received it on. + */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + if (IS_UNDER_IPMP(ill)) + ixas.ixa_scopeid = ill_get_upper_ifindex(ill); + else + ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex; } - if (mctl_present) { + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* - * If it is : - * - * 1) a IPSEC_OUT, then this is caused by outbound - * datagram originating on this host. IPSEC processing - * may or may not have been done. 
Refer to comments above - * icmp_inbound_error_fanout for details. + * Apply IPsec based on how IPsec was applied to + * the packet that had the error. * - * 2) a IPSEC_IN if we are generating a icmp_message - * for an incoming datagram destined for us i.e called - * from ip_fanout_send_icmp. + * If it was an outbound packet that caused the ICMP + * error, then the caller will have setup the IRA + * appropriately. */ - ipsec_info_t *in; - - ipsec_mp = mp; - mp = ipsec_mp->b_cont; - - in = (ipsec_info_t *)ipsec_mp->b_rptr; - ip6h = (ip6_t *)mp->b_rptr; - - ASSERT(in->ipsec_info_type == IPSEC_OUT || - in->ipsec_info_type == IPSEC_IN); - - if (in->ipsec_info_type == IPSEC_IN) { - /* - * Convert the IPSEC_IN to IPSEC_OUT. - */ - if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } - } else { - ASSERT(in->ipsec_info_type == IPSEC_OUT); - io = (ipsec_out_t *)in; - /* - * Clear out ipsec_out_proc_begin, so we do a fresh - * ire lookup. - */ - io->ipsec_out_proc_begin = B_FALSE; + if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; } } else { /* * This is in clear. The icmp message we are building - * here should go out in clear. - */ - ipsec_in_t *ii; - ASSERT(mp->b_datap->db_type == M_DATA); - ipsec_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (ipsec_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - ipsec_mp->b_cont = mp; - ip6h = (ip6_t *)mp->b_rptr; - /* - * Convert the IPSEC_IN to IPSEC_OUT. + * here should go out in clear, independent of our policy. 
*/ - if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } + ixas.ixa_flags |= IXAF_NO_IPSEC; } - io = (ipsec_out_t *)ipsec_mp->b_rptr; + /* + * If the caller specified the source we use that. + * Otherwise, if the packet was for one of our unicast addresses, make + * sure we respond with that as the source. Otherwise + * have ip_output_simple pick the source address. + */ if (v6src_ptr != NULL) { v6src = *v6src_ptr; } else { - if (icmp_pick_source_v6(q, &ip6h->ip6_src, &ip6h->ip6_dst, - &v6src, zoneid, ipst) == NULL) { - freemsg(ipsec_mp); - ill_refrele(ill); - return; + ire_t *ire; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY; + + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) + match_flags |= MATCH_IRE_ILL; + + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, + (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL, + match_flags, 0, ipst, NULL); + if (ire != NULL) { + v6src = ip6h->ip6_dst; + ire_refrele(ire); + } else { + v6src = ipv6_all_zeros; + ixas.ixa_flags |= IXAF_SET_SOURCE; } } v6dst = ip6h->ip6_src; @@ -1577,34 +1346,28 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, if (msg_len > len_needed) { if (!adjmsg(mp, len_needed - msg_len)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); - freemsg(ipsec_mp); - ill_refrele(ill); + freemsg(mp); return; } msg_len = len_needed; } - mp1 = allocb_tmpl(IPV6_HDR_LEN + len, mp); + mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED); if (mp1 == NULL) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); - freemsg(ipsec_mp); - ill_refrele(ill); + freemsg(mp); return; } - ill_refrele(ill); mp1->b_cont = mp; mp = mp1; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL && - io->ipsec_out_type == IPSEC_OUT); - ipsec_mp->b_cont = mp; /* - * Set ipsec_out_icmp_loopback so we can let the ICMP messages this + * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this * node generates be 
accepted in peace by all on-host destinations. * If we do NOT assume that all on-host destinations trust - * self-generated ICMP messages, then rework here, ip.c, and spd.c. - * (Look for ipsec_out_icmp_loopback). + * self-generated ICMP messages, then rework here, ip6.c, and spd.c. + * (Look for IXAF_TRUSTED_ICMP). */ - io->ipsec_out_icmp_loopback = B_TRUE; + ixas.ixa_flags |= IXAF_TRUSTED_ICMP; ip6h = (ip6_t *)mp->b_rptr; mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len); @@ -1624,20 +1387,21 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, bcopy(stuff, (char *)icmp6, len); /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output_wire_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; if (icmp6->icmp6_type == ND_REDIRECT) { ip6h->ip6_hops = IPV6_MAX_HOPS; } - /* Send to V6 writeside put routine */ - put(q, ipsec_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /* * Update the output mib when ICMPv6 packets are sent. */ -static void +void icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs); @@ -1712,14 +1476,19 @@ icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6) * ICMP error packet should be sent. */ static mblk_t * -icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, - boolean_t llbcast, boolean_t mcast_ok, ip_stack_t *ipst) +icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira) { - ip6_t *ip6h; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t llbcast; + ip6_t *ip6h; if (!mp) return (NULL); + /* We view multicast and broadcast as the same.. 
*/ + llbcast = (ira->ira_flags & + (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0; ip6h = (ip6_t *)mp->b_rptr; /* Check if source address uniquely identifies the host */ @@ -1737,17 +1506,8 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, if (mp->b_wptr - mp->b_rptr < len_needed) { if (!pullupmsg(mp, len_needed)) { - ill_t *ill; - - ill = ip_queue_to_ill_v6(q, ipst); - if (ill == NULL) { - BUMP_MIB(&ipst->ips_icmp6_mib, - ipv6IfIcmpInErrors); - } else { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - ill_refrele(ill); - } + BUMP_MIB(ill->ill_icmp6_mib, + ipv6IfIcmpInErrors); freemsg(mp); return (NULL); } @@ -1771,6 +1531,16 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, freemsg(mp); return (NULL); } + /* + * If this is a labeled system, then check to see if we're allowed to + * send a response to this particular sender. If not, then just drop. + */ + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); + freemsg(mp); + return (NULL); + } + if (icmp_err_rate_limit(ipst)) { /* * Only send ICMP error packets every so often. @@ -1784,37 +1554,117 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, } /* + * Called when a packet was sent out the same link that it arrived on. + * Check if it is ok to send a redirect and then send it. + */ +void +ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t *v6targ; + ire_t *src_ire_v6 = NULL; + mblk_t *mp1; + ire_t *nhop_ire = NULL; + + /* + * Don't send a redirect when forwarding a source + * routed packet. 
+ */ + if (ip_source_routed_v6(ip6h, mp, ipst)) + return; + + if (ire->ire_type & IRE_ONLINK) { + /* Target is directly connected */ + v6targ = &ip6h->ip6_dst; + } else { + /* Determine the most specific IRE used to send the packets */ + nhop_ire = ire_nexthop(ire); + if (nhop_ire == NULL) + return; + + /* + * We won't send redirects to a router + * that doesn't have a link local + * address, but will forward. + */ + if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + ire_refrele(nhop_ire); + return; + } + v6targ = &nhop_ire->ire_addr_v6; + } + src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src, + NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL); + + if (src_ire_v6 == NULL) { + if (nhop_ire != NULL) + ire_refrele(nhop_ire); + return; + } + + /* + * The source is directly connected. + */ + mp1 = copymsg(mp); + if (mp1 != NULL) + icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira); + + if (nhop_ire != NULL) + ire_refrele(nhop_ire); + ire_refrele(src_ire_v6); +} + +/* * Generate an ICMPv6 redirect message. * Include target link layer address option if it exits. * Always include redirect header. */ static void -icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, - in6_addr_t *dest, ill_t *ill, boolean_t llbcast) +icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest, + ip_recv_attr_t *ira) { nd_redirect_t *rd; nd_opt_rd_hdr_t *rdh; uchar_t *buf; - nce_t *nce = NULL; + ncec_t *ncec = NULL; nd_opt_hdr_t *opt; int len; int ll_opt_len = 0; int max_redir_hdr_data_len; int pkt_len; in6_addr_t *srcp; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We are called from ip_rput where we could - * not have attached an IPSEC_IN. 
- */ - ASSERT(mp->b_datap->db_type == M_DATA); + ill_t *ill; + boolean_t need_refrele; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst); + mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira); if (mp == NULL) return; - nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE); - if (nce != NULL && nce->nce_state != ND_INCOMPLETE) { + + if (IS_UNDER_IPMP(ira->ira_ill)) { + ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill); + if (ill == NULL) { + ill = ira->ira_ill; + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); + ip_drop_output("no IPMP ill for sending redirect", + mp, ill); + freemsg(mp); + return; + } + need_refrele = B_TRUE; + } else { + ill = ira->ira_ill; + need_refrele = B_FALSE; + } + + ncec = ncec_lookup_illgrp_v6(ill, targetp); + if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE && + ncec->ncec_lladdr != NULL) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; } @@ -1822,8 +1672,10 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, ASSERT(len % 4 == 0); buf = kmem_alloc(len, KM_NOSLEEP); if (buf == NULL) { - if (nce != NULL) - NCE_REFRELE(nce); + if (ncec != NULL) + ncec_refrele(ncec); + if (need_refrele) + ill_refrele(ill); freemsg(mp); return; } @@ -1836,15 +1688,14 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, rd->nd_rd_dst = *dest; opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t)); - if (nce != NULL && ll_opt_len != 0) { + if (ncec != NULL && ll_opt_len != 0) { opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; opt->nd_opt_len = ll_opt_len/8; - bcopy((char *)nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), &opt[1], + bcopy((char *)ncec->ncec_lladdr, &opt[1], ill->ill_phys_addr_length); } - if (nce != NULL) - NCE_REFRELE(nce); + if (ncec != NULL) + ncec_refrele(ncec); rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len); rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER; /* max_redir_hdr_data_len and nd_opt_rh_len must be 
multiple of 8 */ @@ -1862,321 +1713,136 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, } rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; - /* ipif_v6src_addr contains the link-local source address */ - srcp = &ill->ill_ipif->ipif_v6src_addr; + /* ipif_v6lcl_addr contains the link-local source address */ + srcp = &ill->ill_ipif->ipif_v6lcl_addr; /* Redirects sent by router, and router is global zone */ - icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst); + ASSERT(ira->ira_zoneid == ALL_ZONES); + ira->ira_zoneid = GLOBAL_ZONEID; + icmp_pkt_v6(mp, buf, len, srcp, ira); kmem_free(buf, len); + if (need_refrele) + ill_refrele(ill); } /* Generate an ICMP time exceeded message. (May be called as writer.) */ void -icmp_time_exceeded_v6(queue_t *q, mblk_t *mp, uint8_t code, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_TIME_EXCEEDED; icmp6.icmp6_code = code; - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP unreachable message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. 
*/ void -icmp_unreachable_v6(queue_t *q, mblk_t *mp, uint8_t code, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_DST_UNREACH; icmp6.icmp6_code = code; - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP pkt too big message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -static void -icmp_pkt2big_v6(queue_t *q, mblk_t *mp, uint32_t mtu, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, ip_stack_t *ipst) +void +icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - mblk_t *first_mp; - boolean_t mctl_present; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG; icmp6.icmp6_code = 0; icmp6.icmp6_mtu = htonl(mtu); - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP parameter problem message. (May be called as writer.) * 'offset' is the offset from the beginning of the packet in error. 
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ static void -icmp_param_problem_v6(queue_t *q, mblk_t *mp, uint8_t code, - uint32_t offset, boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset, + boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero((char *)&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PARAM_PROB; icmp6.icmp6_code = code; icmp6.icmp6_pptr = htonl(offset); - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } -/* - * This code will need to take into account the possibility of binding - * to a link local address on a multi-homed host, in which case the - * outgoing interface (from the conn) will need to be used when getting - * an ire for the dst. Going through proper outgoing interface and - * choosing the source address corresponding to the outgoing interface - * is necessary when the destination address is a link-local address and - * IPV6_BOUND_IF or IPV6_PKTINFO or scope_id has been set. - * This can happen when active connection is setup; thus ipp pointer - * is passed here from tcp_connect_*() routines, in non-TCP cases NULL - * pointer is passed as ipp pointer. 
- */ -mblk_t * -ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) +void +icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok, + ip_recv_attr_t *ira) { - ssize_t len; - int protocol; - struct T_bind_req *tbr; - sin6_t *sin6; - ipa6_conn_t *ac6; - in6_addr_t *v6srcp; - in6_addr_t *v6dstp; - uint16_t lport; - uint16_t fport; - uchar_t *ucp; - int error = 0; - boolean_t local_bind; - ipa6_conn_x_t *acx6; - boolean_t verify_dst; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - error = EINVAL; - goto bad_addr; - } - - ASSERT(connp->conn_af_isv6); - len = mp->b_wptr - mp->b_rptr; - if (len < (sizeof (*tbr) + 1)) { - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_bind_v6: bogus msg, len %ld", len); - goto bad_addr; - } - /* Back up and extract the protocol identifier. */ - mp->b_wptr--; - tbr = (struct T_bind_req *)mp->b_rptr; - /* Reset the message type in preparation for shipping it back. */ - mp->b_datap->db_type = M_PCPROTO; - - protocol = *mp->b_wptr & 0xFF; - connp->conn_ulp = (uint8_t)protocol; - - /* - * Check for a zero length address. This is from a protocol that - * wants to register to receive all packets of its type. - */ - if (tbr->ADDR_length == 0) { - if ((protocol == IPPROTO_TCP || protocol == IPPROTO_SCTP || - protocol == IPPROTO_ESP || protocol == IPPROTO_AH) && - ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head != - NULL) { - /* - * TCP, SCTP, AH, and ESP have single protocol fanouts. - * Do not allow others to bind to these. 
- */ - goto bad_addr; - } - - /* - * - * The udp module never sends down a zero-length address, - * and allowing this on a labeled system will break MLP - * functionality. - */ - if (is_system_labeled() && protocol == IPPROTO_UDP) - goto bad_addr; - - /* Allow ipsec plumbing */ - if ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && - (protocol != IPPROTO_AH) && (protocol != IPPROTO_ESP)) - goto bad_addr; - - connp->conn_srcv6 = ipv6_all_zeros; - ipcl_proto_insert_v6(connp, protocol); - - tbr->PRIM_type = T_BIND_ACK; - return (mp); - } - - /* Extract the address pointer from the message. */ - ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, - tbr->ADDR_length); - if (ucp == NULL) { - ip1dbg(("ip_bind_v6: no address\n")); - goto bad_addr; - } - if (!OK_32PTR(ucp)) { - ip1dbg(("ip_bind_v6: unaligned address\n")); - goto bad_addr; - } - - switch (tbr->ADDR_length) { - default: - ip1dbg(("ip_bind_v6: bad address length %d\n", - (int)tbr->ADDR_length)); - goto bad_addr; - - case IPV6_ADDR_LEN: - /* Verification of local address only */ - v6srcp = (in6_addr_t *)ucp; - lport = 0; - local_bind = B_TRUE; - break; - - case sizeof (sin6_t): - sin6 = (sin6_t *)ucp; - v6srcp = &sin6->sin6_addr; - lport = sin6->sin6_port; - local_bind = B_TRUE; - break; - - case sizeof (ipa6_conn_t): - /* - * Verify that both the source and destination addresses - * are valid. - */ - ac6 = (ipa6_conn_t *)ucp; - v6srcp = &ac6->ac6_laddr; - v6dstp = &ac6->ac6_faddr; - fport = ac6->ac6_fport; - /* For raw socket, the local port is not set. */ - lport = ac6->ac6_lport != 0 ? ac6->ac6_lport : - connp->conn_lport; - local_bind = B_FALSE; - /* Always verify destination reachability. */ - verify_dst = B_TRUE; - break; - - case sizeof (ipa6_conn_x_t): - /* - * Verify that the source address is valid. 
- */ - acx6 = (ipa6_conn_x_t *)ucp; - ac6 = &acx6->ac6x_conn; - v6srcp = &ac6->ac6_laddr; - v6dstp = &ac6->ac6_faddr; - fport = ac6->ac6_fport; - lport = ac6->ac6_lport; - local_bind = B_FALSE; - /* - * Client that passed ipa6_conn_x_t to us specifies whether to - * verify destination reachability. - */ - verify_dst = (acx6->ac6x_flags & ACX_VERIFY_DST) != 0; - break; - } - if (local_bind) { - error = ip_proto_bind_laddr_v6(connp, &mp->b_cont, protocol, - v6srcp, lport, tbr->ADDR_length != IPV6_ADDR_LEN); - } else { - error = ip_proto_bind_connected_v6(connp, &mp->b_cont, protocol, - v6srcp, lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr); - } + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint16_t hdr_length; + uint8_t *nexthdrp; + uint32_t offset; + ill_t *ill = ira->ira_ill; - if (error == 0) { - /* Send it home. */ - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - return (mp); + /* Determine the offset of the bad nexthdr value */ + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; } -bad_addr: - ASSERT(error != EINPROGRESS); - if (error > 0) - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - else - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - return (mp); + offset = nexthdrp - mp->b_rptr; + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset, + mcast_ok, ira); } /* - * Here address is verified to be a valid local address. - * If the IRE_DB_REQ_TYPE mp is present, a multicast - * address is also considered a valid local address. + * Verify whether or not the IP address is a valid local address. + * Could be a unicast, including one for a down interface. + * If allow_mcbc then a multicast or broadcast address is also + * acceptable. 
+ * * In the case of a multicast address, however, the * upper protocol is expected to reset the src address - * to 0 if it sees an ire with IN6_IS_ADDR_MULTICAST returned so that + * to zero when we return IPVL_MCAST so that * no packets are emitted with multicast address as * source address. * The addresses valid for bind are: @@ -2193,855 +1859,418 @@ bad_addr: * When the address is loopback or multicast, there might be many matching IREs * so bind has to look up based on the zone. */ -/* - * Verify the local IP address. Does not change the conn_t except - * conn_fully_bound and conn_policy_cached. - */ -static int -ip_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - const in6_addr_t *v6src, uint16_t lport, boolean_t fanout_insert) +ip_laddr_t +ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid) { - int error = 0; - ire_t *src_ire = NULL; - zoneid_t zoneid; - mblk_t *mp = NULL; - boolean_t ire_requested; - boolean_t ipsec_policy_set; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - if (mpp) - mp = *mpp; - - ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (mp != NULL && DB_TYPE(mp) == IPSEC_POLICY_SET); - - /* - * If it was previously connected, conn_fully_bound would have - * been set. - */ - connp->conn_fully_bound = B_FALSE; - - zoneid = IPCL_ZONEID(connp); + ire_t *src_ire; + uint_t match_flags; + ill_t *ill = NULL; - if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { - src_ire = ire_route_lookup_v6(v6src, 0, 0, - 0, NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - /* - * If an address other than in6addr_any is requested, - * we verify that it is a valid address for bind - * Note: Following code is in if-else-if form for - * readability compared to a condition check. 
- */ - ASSERT(src_ire == NULL || !(src_ire->ire_type & IRE_BROADCAST)); - /* LINTED - statement has no consequent */ - if (IRE_IS_LOCAL(src_ire)) { - /* - * (2) Bind to address of local UP interface - */ - } else if (IN6_IS_ADDR_MULTICAST(v6src)) { - ipif_t *multi_ipif = NULL; - ire_t *save_ire; - /* - * (4) bind to multicast address. - * Fake out the IRE returned to upper - * layer to be a broadcast IRE in - * ip_bind_insert_ire_v6(). - * Pass other information that matches - * the ipif (e.g. the source address). - * conn_multicast_ill is only used for - * IPv6 packets - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill != NULL) { - (void) ipif_lookup_zoneid( - connp->conn_multicast_ill, zoneid, 0, - &multi_ipif); - } else { - /* - * Look for default like - * ip_wput_v6 - */ - multi_ipif = ipif_lookup_group_v6( - &ipv6_unspecified_group, zoneid, ipst); - } - mutex_exit(&connp->conn_lock); - save_ire = src_ire; - src_ire = NULL; - if (multi_ipif == NULL || !ire_requested || - (src_ire = ipif_to_ire_v6(multi_ipif)) == NULL) { - src_ire = save_ire; - error = EADDRNOTAVAIL; - } else { - ASSERT(src_ire != NULL); - if (save_ire != NULL) - ire_refrele(save_ire); - } - if (multi_ipif != NULL) - ipif_refrele(multi_ipif); - } else { - if (!ip_addr_exists_v6(v6src, zoneid, ipst)) { - /* - * Not a valid address for bind - */ - error = EADDRNOTAVAIL; - } - } + ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src)); + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src)); - if (error != 0) { - /* Red Alert! Attempting to be a bogon! 
*/ - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_laddr_v6: bad src" - " address %s\n", AF_INET6, v6src); - } - goto bad_addr; - } + match_flags = MATCH_IRE_ZONEONLY; + if (scopeid != 0) { + ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst); + if (ill == NULL) + return (IPVL_BAD); + match_flags |= MATCH_IRE_ILL; } + src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0, + ill, zoneid, NULL, match_flags, 0, ipst, NULL); + if (ill != NULL) + ill_refrele(ill); + /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. + * If an address other than in6addr_any is requested, + * we verify that it is a valid address for bind + * Note: Following code is in if-else-if form for + * readability compared to a condition check. */ - connp->conn_policy_cached = B_FALSE; - - /* If not fanout_insert this was just an address verification */ - if (fanout_insert) { + if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) { /* - * The addresses have been verified. Time to insert in - * the correct fanout list. 
+ * (2) Bind to address of local UP interface */ - connp->conn_srcv6 = *v6src; - connp->conn_remv6 = ipv6_all_zeros; - connp->conn_lport = lport; - connp->conn_fport = 0; - error = ipcl_bind_insert_v6(connp, protocol, v6src, lport); - } - if (error == 0) { - if (ire_requested) { - if (!ip_bind_get_ire_v6(mpp, src_ire, v6src, NULL, - ipst)) { - error = -1; - goto bad_addr; - } - mp = *mpp; - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - goto bad_addr; - } - } - } -bad_addr: - if (error != 0) { - if (connp->conn_anon_port) { - (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), - B_FALSE); - } - connp->conn_mlp_type = mlptSingle; - } - - if (src_ire != NULL) ire_refrele(src_ire); + return (IPVL_UNICAST_UP); + } else if (IN6_IS_ADDR_MULTICAST(v6src)) { + /* (4) bind to multicast address. */ + if (src_ire != NULL) + ire_refrele(src_ire); - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. + * Note: caller should take IPV6_MULTICAST_IF + * into account when selecting a real source address. */ - *mpp = NULL; - } - - return (error); -} -int -ip_proto_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - const in6_addr_t *v6srcp, uint16_t lport, boolean_t fanout_insert) -{ - int error; - boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = protocol; + if (allow_mcbc) + return (IPVL_MCAST); + else + return (IPVL_BAD); + } else { + ipif_t *ipif; - if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) { - /* Bind to IPv4 address */ - ipaddr_t v4src; + /* + * (3) Bind to address of local DOWN interface? 
+ * (ipif_lookup_addr() looks up all interfaces + * but we do not get here for UP interfaces + * - case (2) above) + */ + if (src_ire != NULL) + ire_refrele(src_ire); - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); + ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst); + if (ipif == NULL) + return (IPVL_BAD); - error = ip_bind_laddr_v4(connp, mpp, protocol, v4src, lport, - fanout_insert); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_FALSE; - } else { - if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - error = 0; - goto bad_addr; + /* Not a useful source? */ + if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) { + ipif_refrele(ipif); + return (IPVL_BAD); } - error = ip_bind_laddr_v6(connp, mpp, protocol, v6srcp, - lport, fanout_insert); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; + ipif_refrele(ipif); + return (IPVL_UNICAST_DOWN); } - - if (orig_pkt_isv6 != connp->conn_pkt_isv6) - ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); - return (0); - -bad_addr: - if (error < 0) - error = -TBADADDR; - return (error); } /* - * Verify that both the source and destination addresses - * are valid. If verify_dst, then destination address must also be reachable, - * i.e. have a route. Protocols like TCP want this. Tunnels do not. - * It takes ip6_pkt_t * as one of the arguments to determine correct - * source address when IPV6_PKTINFO or scope_id is set along with a link-local - * destination address. Note that parameter ipp is only useful for TCP connect - * when scope_id is set or IPV6_PKTINFO option is set with an ifindex. For all - * non-TCP cases, it is NULL and for all other tcp cases it is not useful. + * Verify that both the source and destination addresses are valid. If + * IPDF_VERIFY_DST is not set, then the destination address may be unreachable, + * i.e. have no route to it. Protocols like TCP want to verify destination + * reachability, while tunnels do not. 
+ * + * Determine the route, the interface, and (optionally) the source address + * to use to reach a given destination. + * Note that we allow connect to broadcast and multicast addresses when + * IPDF_ALLOW_MCBC is set. + * first_hop and dst_addr are normally the same, but if source routing + * they will differ; in that case the first_hop is what we'll use for the + * routing lookup but the dce and label checks will be done on dst_addr, + * + * If uinfo is set, then we fill in the best available information + * we have for the destination. This is based on (in priority order) any + * metrics and path MTU stored in a dce_t, route metrics, and finally the + * ill_mtu. + * + * Tsol note: If we have a source route then dst_addr != firsthop. But we + * always do the label check on dst_addr. * + * Assumes that the caller has set ixa_scopeid for link-local communication. */ int -ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - in6_addr_t *v6src, uint16_t lport, const in6_addr_t *v6dst, - ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, - boolean_t verify_dst, cred_t *cr) +ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr, + const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo, + uint32_t flags, uint_t mac_mode) { - ire_t *src_ire; - ire_t *dst_ire; + ire_t *ire; int error = 0; - ire_t *sire = NULL; - ire_t *md_dst_ire = NULL; - ill_t *md_ill = NULL; - ill_t *dst_ill = NULL; - ipif_t *src_ipif = NULL; - zoneid_t zoneid; - boolean_t ill_held = B_FALSE; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - ts_label_t *tsl = NULL; - cred_t *effective_cred = NULL; - - if (mpp) - mp = *mpp; - - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } - - src_ire = dst_ire = NULL; - /* - * If we never got a disconnect before, clear it now. 
- */ - connp->conn_fully_bound = B_FALSE; + in6_addr_t setsrc; /* RTF_SETSRC */ + zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */ + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + uint_t pmtu; + uint_t ifindex; + uint_t generation; + nce_t *nce; + ill_t *ill = NULL; + boolean_t multirt = B_FALSE; + + ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr)); - zoneid = connp->conn_zoneid; + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); /* - * Check whether Trusted Solaris policy allows communication with this - * host, and pretend that the destination is unreachable if not. - * - * This is never a problem for TCP, since that transport is known to - * compute the label properly as part of the tcp_rput_other T_BIND_ACK - * handling. If the remote is unreachable, it will be detected at that - * point, so there's no reason to check it here. - * - * Note that for sendto (and other datagram-oriented friends), this - * check is done as part of the data path label computation instead. - * The check here is just to make non-TCP connect() report the right - * error. + * We never send to zero; the ULPs map it to the loopback address. + * We can't allow it since we use zero to mean unitialized in some + * places. */ - if (is_system_labeled() && !IPCL_IS_TCP(connp)) { - if ((error = tsol_check_dest(cr, v6dst, IPV6_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) { - if (ip_debug > 2) { - pr_addr_dbg( - "ip_bind_connected: no label for dst %s\n", - AF_INET6, v6dst); - } - goto bad_addr; - } + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr)); - /* - * tsol_check_dest() may have created a new cred with - * a modified security label. Use that cred if it exists - * for ire lookups. 
- */ - if (effective_cred == NULL) { - tsl = crgetlabel(cr); - } else { - tsl = crgetlabel(effective_cred); + if (is_system_labeled()) { + ts_label_t *tsl = NULL; + + error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION, + mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl); + if (error != 0) + return (error); + if (tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, tsl); } } - if (IN6_IS_ADDR_MULTICAST(v6dst)) { - ipif_t *ipif; + setsrc = ipv6_all_zeros; + /* + * Select a route; For IPMP interfaces, we would only select + * a "hidden" route (i.e., going through a specific under_ill) + * if ixa_ifindex has been specified. + */ + ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) + goto bad_addr; + /* + * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set. + * If IPDF_VERIFY_DST is set, the destination must be reachable. + * Otherwise the destination needn't be reachable. + * + * If we match on a reject or black hole, then we've got a + * local failure. May as well fail out the connect() attempt, + * since it's never going to succeed. + */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { /* - * Use an "emulated" IRE_BROADCAST to tell the transport it - * is a multicast. - * Pass other information that matches - * the ipif (e.g. the source address). + * If we're verifying destination reachability, we always want + * to complain here. 
* - * conn_multicast_ill is only used for IPv6 packets - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill != NULL) { - (void) ipif_lookup_zoneid(connp->conn_multicast_ill, - zoneid, 0, &ipif); - } else { - /* Look for default like ip_wput_v6 */ - ipif = ipif_lookup_group_v6(v6dst, zoneid, ipst); - } - mutex_exit(&connp->conn_lock); - if (ipif == NULL || ire_requested || - (dst_ire = ipif_to_ire_v6(ipif)) == NULL) { - if (ipif != NULL) - ipif_refrele(ipif); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad " - "connected multicast %s\n", AF_INET6, - v6dst); - } - error = ENETUNREACH; - goto bad_addr; - } - if (ipif != NULL) - ipif_refrele(ipif); - } else { - dst_ire = ire_route_lookup_v6(v6dst, NULL, NULL, 0, - NULL, &sire, zoneid, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR, - ipst); - /* - * We also prevent ire's with src address INADDR_ANY to - * be used, which are created temporarily for - * sending out packets from endpoints that have - * conn_unspec_src set. + * If we're not verifying destination reachability but the + * destination has a route, we still want to fail on the + * temporary address and broadcast address tests. + * + * In both cases do we let the code continue so some reasonable + * information is returned to the caller. That enables the + * caller to use (and even cache) the IRE. conn_ip_ouput will + * use the generation mismatch path to check for the unreachable + * case thereby avoiding any specific check in the main path. */ - if (dst_ire == NULL || - (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || - IN6_IS_ADDR_UNSPECIFIED(&dst_ire->ire_src_addr_v6)) { + ASSERT(generation == IRE_GENERATION_VERIFY); + if (flags & IPDF_VERIFY_DST) { /* - * When verifying destination reachability, we always - * complain. - * - * When not verifying destination reachability but we - * found an IRE, i.e. 
the destination is reachable, - * then the other tests still apply and we complain. + * Set errno but continue to set up ixa_ire to be + * the RTF_REJECT|RTF_BLACKHOLE IRE. + * That allows callers to use ip_output to get an + * ICMP error back. */ - if (verify_dst || (dst_ire != NULL)) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad" - " connected dst %s\n", AF_INET6, - v6dst); - } - if (dst_ire == NULL || - !(dst_ire->ire_type & IRE_HOST)) { - error = ENETUNREACH; - } else { - error = EHOSTUNREACH; - } - goto bad_addr; - } + if (!(ire->ire_type & IRE_HOST)) + error = ENETUNREACH; + else + error = EHOSTUNREACH; } } - /* - * If the app does a connect(), it means that it will most likely - * send more than 1 packet to the destination. It makes sense - * to clear the temporary flag. - */ - if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb_t *irb = dst_ire->ire_bucket; - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * We need to recheck for IRE_MARK_TEMPORARY after acquiring - * the lock in order to guarantee irb_tmp_ire_cnt. 
- */ - if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { - dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) && + !(flags & IPDF_ALLOW_MCBC)) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + generation = IRE_GENERATION_VERIFY; + error = ENETUNREACH; } - ASSERT(dst_ire == NULL || dst_ire->ire_ipversion == IPV6_VERSION); + /* Cache things */ + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = generation; /* - * See if we should notify ULP about MDT; we do this whether or not - * ire_requested is TRUE, in order to handle active connects; MDT - * eligibility tests for passive connects are handled separately - * through tcp_adapt_ire(). We do this before the source address - * selection, because dst_ire may change after a call to - * ipif_select_source_v6(). This is a best-effort check, as the - * packet for this connection may not actually go through - * dst_ire->ire_stq, and the exact IRE can only be known after - * calling ip_newroute_v6(). This is why we further check on the - * IRE during Multidata packet transmission in tcp_multisend(). + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v6 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. 
*/ - if (ipst->ips_ip_multidata_outbound && !ipsec_policy_set && - dst_ire != NULL && - !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && - (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && - ILL_MDT_CAPABLE(md_ill)) { - md_dst_ire = dst_ire; - IRE_REFHOLD(md_dst_ire); - } - - if (dst_ire != NULL && - dst_ire->ire_type == IRE_LOCAL && - dst_ire->ire_zoneid != zoneid && - dst_ire->ire_zoneid != ALL_ZONES) { - src_ire = ire_ftable_lookup_v6(v6dst, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - if (src_ire == NULL) { - error = EHOSTUNREACH; - goto bad_addr; - } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - if (!(src_ire->ire_type & IRE_HOST)) - error = ENETUNREACH; - else - error = EHOSTUNREACH; - goto bad_addr; - } - if (IN6_IS_ADDR_UNSPECIFIED(v6src)) { - src_ipif = src_ire->ire_ipif; - ipif_refhold(src_ipif); - *v6src = src_ipif->ipif_v6lcl_addr; - } - ire_refrele(src_ire); - src_ire = NULL; - } else if (IN6_IS_ADDR_UNSPECIFIED(v6src) && dst_ire != NULL) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - *v6src = sire->ire_src_addr_v6; - ire_refrele(dst_ire); - dst_ire = sire; - sire = NULL; - } else if (dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_flags & RTF_SETSRC)) { - ASSERT(dst_ire->ire_zoneid == zoneid || - dst_ire->ire_zoneid == ALL_ZONES); - *v6src = dst_ire->ire_src_addr_v6; + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Get an nce to cache. */ + nce = ire_to_nce(ire, NULL, firsthop); + if (nce == NULL) { + /* Allocation failure? */ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } else { - /* - * Pick a source address so that a proper inbound load - * spreading would happen. 
Use dst_ill specified by the - * app. when socket option or scopeid is set. - */ - int err; - - if (ipp != NULL && ipp->ipp_ifindex != 0) { - uint_t if_index; - - /* - * Scope id or IPV6_PKTINFO - */ - - if_index = ipp->ipp_ifindex; - dst_ill = ill_lookup_on_ifindex( - if_index, B_TRUE, NULL, NULL, NULL, NULL, - ipst); - if (dst_ill == NULL) { - ip1dbg(("ip_bind_connected_v6:" - " bad ifindex %d\n", if_index)); - error = EADDRNOTAVAIL; - goto bad_addr; - } - ill_held = B_TRUE; - } else if (connp->conn_outgoing_ill != NULL) { - /* - * For IPV6_BOUND_IF socket option, - * conn_outgoing_ill should be set - * already in TCP or UDP/ICMP. - */ - dst_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_bind_connected_v6:" - "no ill for bound_if\n")); - error = EADDRNOTAVAIL; - goto bad_addr; - } - ill_held = B_TRUE; - } else if (dst_ire->ire_stq != NULL) { - /* No need to hold ill here */ - dst_ill = (ill_t *)dst_ire->ire_stq->q_ptr; - } else { - /* No need to hold ill here */ - dst_ill = dst_ire->ire_ipif->ipif_ill; - } - if (ip6_asp_can_lookup(ipst)) { - src_ipif = ipif_select_source_v6(dst_ill, - v6dst, B_FALSE, connp->conn_src_preferences, - zoneid); - ip6_asp_table_refrele(ipst); - if (src_ipif == NULL) { - pr_addr_dbg("ip_bind_connected_v6: " - "no usable source address for " - "connection to %s\n", - AF_INET6, v6dst); - error = EADDRNOTAVAIL; - goto bad_addr; - } - *v6src = src_ipif->ipif_v6lcl_addr; - } else { - error = EADDRNOTAVAIL; - goto bad_addr; - } + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; } } /* - * We do ire_route_lookup_v6() here (and not an interface lookup) - * as we assert that v6src should only come from an - * UP interface for hard binding. + * We use use ire_nexthop_ill to avoid the under ipmp + * interface for source address selection. 
Note that for ipmp + probe packets, ixa_ifindex would have been specified, and + the ip_select_route() invocation would have picked an ire + with ire_ill pointing at an under interface. */ - src_ire = ire_route_lookup_v6(v6src, 0, 0, 0, NULL, - NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - - /* src_ire must be a local|loopback */ - if (!IRE_IS_LOCAL(src_ire)) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad " - "connected src %s\n", AF_INET6, v6src); - } - error = EADDRNOTAVAIL; - goto bad_addr; - } + ill = ire_nexthop_ill(ire); /* * If the source address is a loopback address, the * destination had best be local or multicast. - * The transports that can't handle multicast will reject - * those addresses. + * If we are sending to an IRE_LOCAL using a loopback source then + * it had better be the same zoneid. */ - if (src_ire->ire_type == IRE_LOOPBACK && - !(IRE_IS_LOCAL(dst_ire) || IN6_IS_ADDR_MULTICAST(v6dst) || - IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst))) { - ip1dbg(("ip_bind_connected_v6: bad connected loopback\n")); - error = -1; - goto bad_addr; + if (IN6_IS_ADDR_LOOPBACK(src_addrp)) { + if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } } - /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. - */ - connp->conn_policy_cached = B_FALSE; /* - * The addresses have been verified. Initialize the conn - * before calling the policy as they expect the conns - * initialized. + * Does the caller want us to pick a source address?
 */ - connp->conn_srcv6 = *v6src; - connp->conn_remv6 = *v6dst; - connp->conn_lport = lport; - connp->conn_fport = fport; - - ASSERT(!(ipsec_policy_set && ire_requested)); - if (ire_requested) { - iulp_t *ulp_info = NULL; + if (flags & IPDF_SELECT_SRC) { + in6_addr_t src_addr; + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src_addr = ipv6_loopback; + /* Make sure we look for a better source address */ + generation = SRC_GENERATION_VERIFY; + } else { + error = ip_select_source_v6(ill, &setsrc, dst_addr, + zoneid, ipst, B_FALSE, ixa->ixa_src_preferences, + &src_addr, &generation, NULL); + if (error != 0) { + ire = NULL; /* Stored in ixa_ire */ + goto bad_addr; + } + } /* - * Note that sire will not be NULL if this is an off-link - * connection and there is not cache for that dest yet. - * - * XXX Because of an existing bug, if there are multiple - * default routes, the IRE returned now may not be the actual - * default route used (default routes are chosen in a - * round robin fashion). So if the metrics for different - * default routes are different, we may return the wrong - * metrics. This will not be a problem if the existing - * bug is fixed. + * We allow the source address to be down. + * However, we check that we don't use the loopback address + * as a source when sending out on the wire. */ - if (sire != NULL) - ulp_info = &(sire->ire_uinfo); - - if (!ip_bind_get_ire_v6(mpp, dst_ire, v6dst, ulp_info, - ipst)) { - error = -1; - goto bad_addr; - } - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; + if (IN6_IS_ADDR_LOOPBACK(&src_addr) && + !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) && + !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; goto bad_addr; } + + *src_addrp = src_addr; + ixa->ixa_src_generation = generation; } /* - * Cache IPsec policy in this conn.
If we have per-socket policy, - * we'll cache that. If we don't, we'll inherit global policy. - * - * We can't insert until the conn reflects the policy. Note that - * conn_policy_cached is set by ipsec_conn_cache_policy() even for - * connections where we don't have a policy. This is to prevent - * global policy lookups in the inbound path. - * - * If we insert before we set conn_policy_cached, - * CONN_INBOUND_POLICY_PRESENT_V6() check can still evaluate true - * because global policy cound be non-empty. We normally call - * ipsec_check_policy() for conn_policy_cached connections only if - * conn_in_enforce_policy is set. But in this case, - * conn_policy_cached can get set anytime since we made the - * CONN_INBOUND_POLICY_PRESENT_V6() check and ipsec_check_policy() - * is called, which will make the above assumption false. Thus, we - * need to insert after we set conn_policy_cached. + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. */ - if ((error = ipsec_conn_cache_policy(connp, B_FALSE)) != 0) - goto bad_addr; + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } - /* If not fanout_insert this was just an address verification */ - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. - */ - error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst, - connp->conn_ports, - IPCL_IS_TCP(connp) ? 
connp->conn_tcp->tcp_bound_if : 0); + + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) { + /* If we are creating a DCE we'd better have an ifindex */ + if (ill != NULL) + ifindex = ill->ill_phyint->phyint_ifindex; + else + flags &= ~IPDF_UNIQUE_DCE; } - if (error == 0) { - connp->conn_fully_bound = B_TRUE; - /* - * Our initial checks for MDT have passed; the IRE is not - * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to - * be supporting MDT. Pass the IRE, IPC and ILL into - * ip_mdinfo_return(), which performs further checks - * against them and upon success, returns the MDT info - * mblk which we will attach to the bind acknowledgment. - */ - if (md_dst_ire != NULL) { - mblk_t *mdinfo_mp; - - ASSERT(md_ill != NULL); - ASSERT(md_ill->ill_mdt_capab != NULL); - if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) { - if (mp == NULL) { - *mpp = mdinfo_mp; - } else { - linkb(mp, mdinfo_mp); - } - } + + if (flags & IPDF_UNIQUE_DCE) { + /* Fallback to the default dce if allocation fails */ + dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst); + if (dce != NULL) { + generation = dce->dce_generation; + } else { + dce = dce_lookup_v6(dst_addr, ifindex, ipst, + &generation); } + } else { + dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation); } -bad_addr: - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); - /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. 
- */ - *mpp = NULL; - } -refrele_and_quit: - if (src_ire != NULL) - IRE_REFRELE(src_ire); - if (dst_ire != NULL) - IRE_REFRELE(dst_ire); - if (sire != NULL) - IRE_REFRELE(sire); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (md_dst_ire != NULL) - IRE_REFRELE(md_dst_ire); - if (ill_held && dst_ill != NULL) - ill_refrele(dst_ill); - if (effective_cred != NULL) - crfree(effective_cred); - return (error); -} - -/* ARGSUSED */ -int -ip_proto_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - in6_addr_t *v6srcp, uint16_t lport, const in6_addr_t *v6dstp, - ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, - boolean_t verify_dst, cred_t *cr) -{ - int error = 0; - boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = protocol; + ASSERT(dce != NULL); + if (ixa->ixa_dce != NULL) + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = generation; - /* For raw socket, the local port is not set. */ - lport = lport != 0 ? lport : connp->conn_lport; + /* + * Note that IPv6 multicast supports PMTU discovery unlike IPv4 + * multicast. But pmtu discovery is only enabled for connected + * sockets in general. + */ /* - * Bind to local and remote address. Local might be - * unspecified in which case it will be extracted from - * ire_src_addr_v6 + * Set initial value for fragmentation limit. Either conn_ip_output + * or ULP might update it when there are routing changes. + * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT. */ - if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) { - /* Connect to IPv4 address */ - ipaddr_t v4src; - ipaddr_t v4dst; - - /* Is the source unspecified or mapped?
*/ - if (!IN6_IS_ADDR_V4MAPPED(v6srcp) && - !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { - ip1dbg(("ip_proto_bind_connected_v6: " - "dst is mapped, but not the src\n")); - goto bad_addr; - } - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); - IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst); + pmtu = ip_get_pmtu(ixa); + ixa->ixa_fragsize = pmtu; + /* Make sure ixa_fragsize and ixa_pmtu remain identical */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) + ixa->ixa_pmtu = pmtu; - /* Always verify destination reachability. */ - error = ip_bind_connected_v4(connp, mpp, protocol, &v4src, - lport, v4dst, fport, B_TRUE, B_TRUE, cr); - if (error != 0) - goto bad_addr; - IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp); - connp->conn_pkt_isv6 = B_FALSE; - } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - ip1dbg(("ip_proto_bind_connected_v6: " - "src is mapped, but not the dst\n")); - goto bad_addr; - } else { - error = ip_bind_connected_v6(connp, mpp, protocol, v6srcp, - lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; - } + /* + * Extract information useful for some transports. + * First we look for DCE metrics. Then we take what we have in + * the metrics in the route, where the offlink is used if we have + * one. + */ + if (uinfo != NULL) { + bzero(uinfo, sizeof (*uinfo)); - if (orig_pkt_isv6 != connp->conn_pkt_isv6) - ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); + if (dce->dce_flags & DCEF_UINFO) + *uinfo = dce->dce_uinfo; - /* Send it home. */ - return (0); + rts_merge_metrics(uinfo, &ire->ire_metrics); -bad_addr: - if (error == 0) - error = -TBADADDR; - return (error); -} + /* Allow ire_metrics to decrease the path MTU from above */ + if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu) + uinfo->iulp_mtu = pmtu; -/* - * Get the ire in *mpp. Returns false if it fails (due to lack of space). - * Makes the IRE be IRE_BROADCAST if dst is a multicast address. 
- */ -/* ARGSUSED4 */ -static boolean_t -ip_bind_get_ire_v6(mblk_t **mpp, ire_t *ire, const in6_addr_t *dst, - iulp_t *ulp_info, ip_stack_t *ipst) -{ - mblk_t *mp = *mpp; - ire_t *ret_ire; + uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0; + uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0; + uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0; + } - ASSERT(mp != NULL); + if (ill != NULL) + ill_refrele(ill); - if (ire != NULL) { - /* - * mp initialized above to IRE_DB_REQ_TYPE - * appended mblk. Its <upper protocol>'s - * job to make sure there is room. - */ - if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) - return (B_FALSE); + return (error); - mp->b_datap->db_type = IRE_DB_TYPE; - mp->b_wptr = mp->b_rptr + sizeof (ire_t); - bcopy(ire, mp->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp->b_rptr; - if (IN6_IS_ADDR_MULTICAST(dst) || - IN6_IS_ADDR_V4MAPPED_CLASSD(dst)) { - ret_ire->ire_type = IRE_BROADCAST; - ret_ire->ire_addr_v6 = *dst; - } - if (ulp_info != NULL) { - bcopy(ulp_info, &(ret_ire->ire_uinfo), - sizeof (iulp_t)); - } - ret_ire->ire_mp = mp; - } else { - /* - * No IRE was found. Remove IRE mblk. - */ - *mpp = mp->b_cont; - freeb(mp); - } - return (B_TRUE); -} +bad_addr: + if (ire != NULL) + ire_refrele(ire); -/* - * Add an ip6i_t header to the front of the mblk. - * Inline if possible else allocate a separate mblk containing only the ip6i_t. - * Returns NULL if allocation fails (and frees original message). - * Used in outgoing path when going through ip_newroute_*v6(). - * Used in incoming path to pass ifindex to transports. 
- */ -mblk_t * -ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) -{ - mblk_t *mp1; - ip6i_t *ip6i; - ip6_t *ip6h; + if (ill != NULL) + ill_refrele(ill); - ip6h = (ip6_t *)mp->b_rptr; - ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t)); - if ((uchar_t *)ip6i < mp->b_datap->db_base || - mp->b_datap->db_ref > 1) { - mp1 = allocb(sizeof (ip6i_t), BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return (NULL); - } - mp1->b_wptr = mp1->b_rptr = mp1->b_datap->db_lim; - mp1->b_cont = mp; - mp = mp1; - ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t)); - } - mp->b_rptr = (uchar_t *)ip6i; - ip6i->ip6i_vcf = ip6h->ip6_vcf; - ip6i->ip6i_nxt = IPPROTO_RAW; - if (ill != NULL) { - ip6i->ip6i_flags = IP6I_IFINDEX; - /* - * If `ill' is in an IPMP group, make sure we use the IPMP - * interface index so that e.g. IPV6_RECVPKTINFO will get the - * IPMP interface index and not an underlying interface index. - */ - if (IS_UNDER_IPMP(ill)) - ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill); - else - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - } else { - ip6i->ip6i_flags = 0; + /* + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. + */ + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } - ip6i->ip6i_nexthop = *dst; - return (mp); + + return (error); } /* @@ -3051,53 +2280,29 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) * of any incoming packets. * * Zones notes: - * Packets will be distributed to streams in all zones. This is really only + * Packets will be distributed to conns in all zones. This is really only * useful for ICMPv6 as only applications in the global zone can create raw * sockets for other protocols. 
*/ -static void -ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, - ill_t *inill, uint8_t nexthdr, uint_t nexthdr_offset, uint_t flags, - boolean_t mctl_present, zoneid_t zoneid) +void +ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) { - queue_t *rq; - mblk_t *mp1, *first_mp1; - in6_addr_t dst = ip6h->ip6_dst; - in6_addr_t src = ip6h->ip6_src; - mblk_t *first_mp = mp; - boolean_t secure, shared_addr; - conn_t *connp, *first_connp, *next_connp; - connf_t *connfp; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * We don't allow multilevel ports for raw IP, so no need to - * check for that here. - */ - zoneid = tsol_packet_to_zoneid(mp); - } + mblk_t *mp1; + in6_addr_t laddr = ip6h->ip6_dst; + conn_t *connp, *first_connp, *next_connp; + connf_t *connfp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - connfp = &ipst->ips_ipcl_proto_fanout_v6[nexthdr]; + connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill, flags, - zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp))) + /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ + if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; } @@ -3108,96 +2313,52 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, * unclaimed datagrams? 
*/ mutex_exit(&connfp->connf_lock); - if (ip_fanout_send_icmp_v6(q, first_mp, flags, - ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, - nexthdr_offset, mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); - } - + ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB, + ICMP6_PARAMPROB_NEXTHEADER, ira); return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); CONN_INC_REF(connp); first_connp = connp; /* * XXX: Fix the multiple protocol listeners case. We should not - * be walking the conn->next list here. + * be walking the conn->conn_next list here. */ connp = connp->conn_next; for (;;) { while (connp != NULL) { - if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, - shared_addr, connp))) + /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ + if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, + ira, connp))) break; connp = connp->conn_next; } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more intested clients or memory - * allocation failed - */ + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; + break; + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; + CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. - */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp1 = ip_add_info_v6(mp1, inill, &dst); - } - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else if ( - (IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(ill->ill_ip_mib, - rawipIfStatsInOverflows); - } else { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInOverflows); - } - freemsg(mp1); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); + ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr, + ira); - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || - secure) { - first_mp1 = ipsec_check_inbound_policy( - first_mp1, connp, NULL, ip6h, mctl_present); - } - if (first_mp1 != NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp1, NULL); - } - } mutex_enter(&connfp->connf_lock); /* Follow the next pointer before releasing the conn. */ next_connp = connp->conn_next; @@ -3208,105 +2369,33 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, /* Last one. Send it upstream. */ mutex_exit(&connfp->connf_lock); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; - - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - } - - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - - rq = connp->conn_rq; - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - - if (flags & IP_FF_RAWIP) { - BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); - } else { - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows); - } - - freemsg(first_mp); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); + ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira); - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp, NULL); - if (mctl_present) - freeb(first_mp); - } CONN_DEC_REF(connp); } /* - * Send an ICMP error after patching up the packet appropriately. Returns - * non-zero if the appropriate MIB should be bumped; zero otherwise. + * Called when it is conceptually a ULP that would sent the packet + * e.g., port unreachable and nexthdr unknown. Check that the packet + * would have passed the IPsec global policy before sending the error. + * + * Send an ICMP error after patching up the packet appropriately. + * Uses ip_drop_input and bumps the appropriate MIB. + * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use. 
*/ -int -ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags, - uint_t icmp_type, uint8_t icmp_code, uint_t nexthdr_offset, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +void +ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code, + ip_recv_attr_t *ira) { - ip6_t *ip6h; - mblk_t *first_mp; - boolean_t secure; - unsigned char db_type; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ip6_t *ip6h; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - /* - * If this is an ICMP error being reported - which goes - * up as M_CTLs, we need to convert them to M_DATA till - * we finish checking with global policy because - * ipsec_check_global_policy() assumes M_DATA as clear - * and M_CTL as secure. - */ - db_type = mp->b_datap->db_type; - mp->b_datap->db_type = M_DATA; - secure = B_FALSE; - } /* * We are generating an icmp error for some inbound packet. * Called from all ip_fanout_(udp, tcp, proto) functions. 
@@ -3316,572 +2405,155 @@ ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags, */ ip6h = (ip6_t *)mp->b_rptr; if (secure || ipss->ipsec_inbound_v6_policy_present) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - NULL, ip6h, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return (0); - } - - if (!mctl_present) - mp->b_datap->db_type = db_type; - - if (flags & IP_FF_SEND_ICMP) { - if (flags & IP_FF_HDR_COMPLETE) { - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - freemsg(first_mp); - return (1); - } - } - switch (icmp_type) { - case ICMP6_DST_UNREACH: - icmp_unreachable_v6(WR(q), first_mp, icmp_code, - B_FALSE, B_FALSE, zoneid, ipst); - break; - case ICMP6_PARAM_PROB: - icmp_param_problem_v6(WR(q), first_mp, icmp_code, - nexthdr_offset, B_FALSE, B_FALSE, zoneid, ipst); - break; - default: -#ifdef DEBUG - panic("ip_fanout_send_icmp_v6: wrong type"); - /*NOTREACHED*/ -#else - freemsg(first_mp); - break; -#endif - } - } else { - freemsg(first_mp); - return (0); - } - - return (1); -} - -/* - * Fanout for TCP packets - * The caller puts <fport, lport> in the ports parameter. - */ -static void -ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, - uint_t flags, uint_t hdr_len, boolean_t mctl_present, zoneid_t zoneid) -{ - mblk_t *first_mp; - boolean_t secure; - conn_t *connp; - tcph_t *tcph; - boolean_t syn_present = B_FALSE; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - connp = ipcl_classify_v6(mp, IPPROTO_TCP, hdr_len, zoneid, ipst); - - if (connp == NULL || - !conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid)) { - /* - * No hard-bound match. Send Reset. 
- */ - dblk_t *dp = mp->b_datap; - uint32_t ill_index; - - ASSERT((dp->db_struioflag & STRUIO_IP) == 0); - - /* Initiate IPPf processing, if needed. */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && - (flags & IP6_NO_IPPOLICY)) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) { - if (connp != NULL) - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (connp != NULL) { - ip_xmit_reset_serialize(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - } else { - tcp_xmit_listeners_reset(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - } - - return; - } - - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - squeue_t *sqp; - - /* - * If the queue belongs to a conn, and fused tcp - * loopback is enabled, assign the eager's squeue - * to be that of the active connect's. - */ - if ((flags & IP_FF_LOOPBACK) && do_tcp_fusion && - CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) && - !CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) && - !secure && - !IP6_IN_IPP(flags, ipst)) { - ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); - sqp = Q_TO_CONN(q)->conn_sqp; - } else { - sqp = IP_SQUEUE_GET(lbolt); - } - - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - - /* - * db_cksumstuff is unused in the incoming - * path; Thus store the ifindex here. It will - * be cleared in tcp_conn_create_v6(). 
- */ - DB_CKSUMSTUFF(mp) = - (intptr_t)ill->ill_phyint->phyint_ifindex; - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); + mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns); + if (mp == NULL) return; - } + } - CONN_DEC_REF(connp); - freemsg(first_mp); + /* We never send errors for protocols that we do implement */ + if (ira->ira_protocol == IPPROTO_ICMPV6) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_fanout_send_icmp_v6", mp, ill); + freemsg(mp); return; } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= - ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. Also, if someone is bound to IPPROTO_TCP - * over raw IP, they don't expect to see a M_CTL. 
- */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } + switch (icmp_type) { + case ICMP6_DST_UNREACH: + ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; + BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); + ip_drop_input("ipIfStatsNoPorts", mp, ill); - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; - } else if (mctl_present) { - /* - * ip_add_info_v6 might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira); + break; + case ICMP6_PARAM_PROB: + ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER); - /* - * For link-local always add ifindex so that TCP can bind to that - * interface. Avoid it for ICMP error fanout. 
- */ - if (!syn_present && ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) && - (flags & IP_FF_IPINFO))) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); + ip_drop_input("ipIfStatsInUnknownProtos", mp, ill); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, - connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); - } else { - /* SOCK_RAW, IPPROTO_TCP case */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); + /* Let the system determine the offset for this one */ + icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira); + break; + default: +#ifdef DEBUG + panic("ip_fanout_send_icmp_v6: wrong type"); + /*NOTREACHED*/ +#else + freemsg(mp); + break; +#endif } } /* + * Fanout for UDP packets that are multicast or ICMP errors. + * (Unicast fanout is handled in ip_input_v6.) + * + * If SO_REUSEADDR is set all multicast packets + * will be delivered to all conns bound to the same port. + * * Fanout for UDP packets. * The caller puts <fport, lport> in the ports parameter. * ire_type must be IRE_BROADCAST for multicast and broadcast packets. * * If SO_REUSEADDR is set all multicast and broadcast packets - * will be delivered to all streams bound to the same port. + * will be delivered to all conns bound to the same port. * * Zones notes: - * Multicast packets will be distributed to streams in all zones. + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. 
+ * This means that here we can match the zoneid with SO_ALLZONES being special. */ -static void -ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, - ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present, - zoneid_t zoneid) +void +ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport, + ip_recv_attr_t *ira) { - uint32_t dstport, srcport; - in6_addr_t dst; - mblk_t *first_mp; - boolean_t secure; + in6_addr_t laddr; conn_t *connp; connf_t *connfp; - conn_t *first_conn; - conn_t *next_conn; - mblk_t *mp1, *first_mp1; - in6_addr_t src; - boolean_t shared_addr; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } + in6_addr_t faddr; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); - dst = ip6h->ip6_dst; - src = ip6h->ip6_src; + ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR)); - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * That will also return ALL_ZONES on failure, but - * we never allow conn_zoneid to be set to ALL_ZONES. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - } + laddr = ip6h->ip6_dst; + faddr = ip6h->ip6_src; /* Attempt to find a client stream based on destination port. 
*/ - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!IN6_IS_ADDR_MULTICAST(&dst)) { - /* - * Not multicast. Send to the one (first) client we find. - */ - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, - src) && IPCL_ZONE_MATCH(connp, zoneid) && - conn_wantpacket_v6(connp, ill, ip6h, - flags, zoneid)) { - break; - } - connp = connp->conn_next; - } - if (connp == NULL || connp->conn_upq == NULL) - goto notfound; - - if (is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp)) - goto notfound; - - /* Found a client */ - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(first_mp); - CONN_DEC_REF(connp); - return; - } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, - connp, NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - } - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; - - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - } - /* - * For link-local always add ifindex so that - * transport can set sin6_scope_id. Avoid it for - * ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - - IP6_STAT(ipst, ip6_udp_fannorm); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - while (connp != NULL) { - if ((IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, src)) && - conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp))) + if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) && + conn_wantpacket_v6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || connp->conn_upq == NULL) + if (connp == NULL) goto notfound; - first_conn = connp; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, - src) && conn_wantpacket_v6(connp, ill, ip6h, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, - shared_addr, connp))) - break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ - connp = first_conn; - break; - } - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * For link-local always add ifindex so that transport - * can set sin6_scope_id. Avoid it for ICMP error - * fanout. - */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp1 = ip_add_info_v6(mp1, inill, &dst); - } - /* mp1 could have changed */ - if (mctl_present) - first_mp1->b_cont = mp1; - else - first_mp1 = mp1; - if (mp1 == NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto next_one; - } - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(first_mp1); - goto next_one; - } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp1 = ipsec_check_inbound_policy - (first_mp1, connp, NULL, ip6h, - mctl_present); - } - if (first_mp1 != NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - /* Send it upstream */ - (connp->conn_recv)(connp, mp1, NULL); - } -next_one: - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. 
*/ - next_conn = connp->conn_next; - IP6_STAT(ipst, ip6_udp_fanmb); - CONN_DEC_REF(connp); - connp = next_conn; - } + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH_V6(connp, lport, laddr, + fport, faddr) && + conn_wantpacket_v6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; + break; + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + connp = first_connp; + break; + } - /* Last one. Send it upstream. */ - mutex_exit(&connfp->connf_lock); + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; + IP6_STAT(ipst, ip6_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, NULL, + (ip6_t *)mp1->b_rptr, ira); - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn. */ + next_connp = connp->conn_next; + IP6_STAT(ipst, ip6_udp_fanmb); CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; + connp = next_connp; } } - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(mp); - } else { - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, - connp, NULL, ip6h, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* Last one. Send it upstream. */ + mutex_exit(&connfp->connf_lock); - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - } IP6_STAT(ipst, ip6_udp_fanmb); + ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira); CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); return; notfound: @@ -3892,28 +2564,26 @@ notfound: * unclaimed datagrams? 
*/ if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) { - ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill, IPPROTO_UDP, - 0, flags | IP_FF_RAWIP | IP_FF_IPINFO, mctl_present, - zoneid); + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v6(mp, ip6h, ira); } else { - if (ip_fanout_send_icmp_v6(q, first_mp, flags, - ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0, - mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); - } + ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_NOPORT, ira); } } /* * int ip_find_hdr_v6() * - * This routine is used by the upper layer protocols and the IP tunnel - * module to: + * This routine is used by the upper layer protocols, iptun, and IPsec: * - Set extension header pointers to appropriate locations * - Determine IPv6 header length and return it * - Return a pointer to the last nexthdr value * * The caller must initialize ipp_fields. + * The upper layer protocols normally set label_separate which makes the + * routine put the TX label in ipp_label_v6. If this is not set then + * the hop-by-hop options including the label are placed in ipp_hopopts. * * NOTE: If multiple extension headers of the same type are present, * ip_find_hdr_v6() will set the respective extension header pointers @@ -3923,7 +2593,8 @@ notfound: * malformed part. 
*/ int -ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) +ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp, + uint8_t *nexthdrp) { uint_t length, ehdrlen; uint8_t nexthdr; @@ -3933,6 +2604,11 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) ip6_hbh_t *tmphopopts; ip6_frag_t *tmpfraghdr; + ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR; + ipp->ipp_hoplimit = ip6h->ip6_hops; + ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow); + ipp->ipp_addr = ip6h->ip6_dst; + length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; @@ -3944,19 +2620,48 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) goto done; switch (nexthdr) { - case IPPROTO_HOPOPTS: + case IPPROTO_HOPOPTS: { + /* We check for any CIPSO */ + uchar_t *secopt; + boolean_t hbh_needed; + uchar_t *after_secopt; + tmphopopts = (ip6_hbh_t *)whereptr; ehdrlen = 8 * (tmphopopts->ip6h_len + 1); if ((uchar_t *)tmphopopts + ehdrlen > endptr) goto done; nexthdr = tmphopopts->ip6h_nxt; + + if (!label_separate) { + secopt = NULL; + after_secopt = whereptr; + } else { + /* + * We have dropped packets with bad options in + * ip6_input. No need to check return value + * here. 
+ */ + (void) tsol_find_secopt_v6(whereptr, ehdrlen, + &secopt, &after_secopt, &hbh_needed); + } + if (secopt != NULL && after_secopt - whereptr > 0) { + ipp->ipp_fields |= IPPF_LABEL_V6; + ipp->ipp_label_v6 = secopt; + ipp->ipp_label_len_v6 = after_secopt - whereptr; + } else { + ipp->ipp_label_len_v6 = 0; + after_secopt = whereptr; + hbh_needed = B_TRUE; + } /* return only 1st hbh */ - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) { + if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) { ipp->ipp_fields |= IPPF_HOPOPTS; - ipp->ipp_hopopts = tmphopopts; - ipp->ipp_hopoptslen = ehdrlen; + ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt; + ipp->ipp_hopoptslen = ehdrlen - + ipp->ipp_label_len_v6; } break; + } case IPPROTO_DSTOPTS: tmpdstopts = (ip6_dest_t *)whereptr; ehdrlen = 8 * (tmpdstopts->ip6d_len + 1); @@ -3993,10 +2698,10 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) */ if (ipp->ipp_fields & IPPF_DSTOPTS) { ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_fields |= IPPF_RTDSTOPTS; - ipp->ipp_rtdstopts = ipp->ipp_dstopts; + ipp->ipp_fields |= IPPF_RTHDRDSTOPTS; + ipp->ipp_rthdrdstopts = ipp->ipp_dstopts; ipp->ipp_dstopts = NULL; - ipp->ipp_rtdstoptslen = ipp->ipp_dstoptslen; + ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen; ipp->ipp_dstoptslen = 0; } break; @@ -4025,25 +2730,6 @@ done: return (length); } -int -ip_hdr_complete_v6(ip6_t *ip6h, zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ire = ire_lookup_local_v6(zoneid, ipst); - if (ire == NULL) { - ip1dbg(("ip_hdr_complete_v6: no source IRE\n")); - return (1); - } - ip6h->ip6_src = ire->ire_addr_v6; - ire_refrele(ire); - } - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6h->ip6_hops = ipst->ips_ipv6_def_hops; - return (0); -} - /* * Try to determine where and what are the IPv6 header length and * pointer to nexthdr value for the upper layer protocol (or an @@ -4066,7 +2752,7 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, 
uint16_t *hdr_length_ptr, ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; - ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION); + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; @@ -4151,1905 +2837,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) } /* - * IPv6 - - * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need - * to send out a packet to a destination address for which we do not have - * specific routing information. - * - * Handle non-multicast packets. If ill is non-NULL the match is done - * for that ill. - * - * When a specific ill is specified (using IPV6_PKTINFO, - * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match - * on routing entries (ftable and ctable) that have a matching - * ire->ire_ipif->ipif_ill. Thus this can only be used - * for destinations that are on-link for the specific ill - * and that can appear on multiple links. Thus it is useful - * for multicast destinations, link-local destinations, and - * at some point perhaps for site-local destinations (if the - * node sits at a site boundary). - * We create the cache entries in the regular ctable since - * it can not "confuse" things for other destinations. - * - * NOTE : These are the scopes of some of the variables that point at IRE, - * which needs to be followed while making any future modifications - * to avoid memory leaks. - * - * - ire and sire are the entries looked up initially by - * ire_ftable_lookup_v6. - * - ipif_ire is used to hold the interface ire associated with - * the new cache ire. But it's scope is limited, so we always REFRELE - * it before branching out to error paths. - * - save_ire is initialized before ire_create, so that ire returned - * by ire_create will not over-write the ire. We REFRELE save_ire - * before breaking out of the switch. 
- * - * Thus on failures, we have to REFRELE only ire and sire, if they - * are not NULL. - */ -/* ARGSUSED */ -void -ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, - const in6_addr_t *v6srcp, ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) -{ - in6_addr_t v6gw; - in6_addr_t dst; - ire_t *ire = NULL; - ipif_t *src_ipif = NULL; - ill_t *dst_ill = NULL; - ire_t *sire = NULL; - ire_t *save_ire; - ip6_t *ip6h; - int err = 0; - mblk_t *first_mp; - ipsec_out_t *io; - ushort_t ire_marks = 0; - int match_flags; - ire_t *first_sire = NULL; - mblk_t *copy_mp = NULL; - mblk_t *xmit_mp = NULL; - in6_addr_t save_dst; - uint32_t multirt_flags = - MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; - boolean_t multirt_is_resolvable; - boolean_t multirt_resolve_next; - boolean_t need_rele = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - tsol_ire_gw_secattr_t *attrp = NULL; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - - ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp)); - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - io = NULL; - } - - ip6h = (ip6_t *)mp->b_rptr; - - if (IN6_IS_ADDR_LOOPBACK(v6dstp)) { - ip1dbg(("ip_newroute_v6: dst with loopback addr\n")); - goto icmp_err_ret; - } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) { - ip1dbg(("ip_newroute_v6: src with loopback addr\n")); - goto icmp_err_ret; - } - - /* - * If this IRE is created for forwarding or it is not for - * TCP traffic, mark it as temporary. - * - * Is it sufficient just to check the next header?? - */ - if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt)) - ire_marks |= IRE_MARK_TEMPORARY; - - /* - * Get what we can from ire_ftable_lookup_v6 which will follow an IRE - * chain until it gets the most specific information available. 
- * For example, we know that there is no IRE_CACHE for this dest, - * but there may be an IRE_OFFSUBNET which specifies a gateway. - * ire_ftable_lookup_v6 will look up the gateway, etc. - */ - - if (ill == NULL) { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR; - ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, - NULL, &sire, zoneid, 0, msg_getlabel(mp), - match_flags, ipst); - } else { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; - match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to an underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, use the packet's source address to - * determine whether the traffic is test traffic, and set - * MATCH_IRE_MARK_TESTHIDDEN if so. - */ - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { - if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - } - - ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif, - &sire, zoneid, 0, msg_getlabel(mp), match_flags, ipst); - } - - ip3dbg(("ip_newroute_v6: ire_ftable_lookup_v6() " - "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); - - /* - * We enter a loop that will be run only once in most cases. - * The loop is re-entered in the case where the destination - * can be reached through multiple RTF_MULTIRT-flagged routes. - * The intention is to compute multiple routes to a single - * destination in a single ip_newroute_v6 call. - * The information is contained in sire->ire_flags. 
- */ - do { - multirt_resolve_next = B_FALSE; - - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) { - ip3dbg(("ip_newroute_v6: starting new resolution " - "with first_mp %p, tag %d\n", - (void *)first_mp, MULTIRT_DEBUG_TAGGED(first_mp))); - - /* - * We check if there are trailing unresolved routes for - * the destination contained in sire. - */ - multirt_is_resolvable = ire_multirt_lookup_v6(&ire, - &sire, multirt_flags, msg_getlabel(mp), ipst); - - ip3dbg(("ip_newroute_v6: multirt_is_resolvable %d, " - "ire %p, sire %p\n", - multirt_is_resolvable, (void *)ire, (void *)sire)); - - if (!multirt_is_resolvable) { - /* - * No more multirt routes to resolve; give up - * (all routes resolved or no more resolvable - * routes). - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - } else { - ASSERT(sire != NULL); - ASSERT(ire != NULL); - /* - * We simply use first_sire as a flag that - * indicates if a resolvable multirt route has - * already been found during the preceding - * loops. If it is not the case, we may have - * to send an ICMP error to report that the - * destination is unreachable. We do not - * IRE_REFHOLD first_sire. - */ - if (first_sire == NULL) { - first_sire = sire; - } - } - } - if ((ire == NULL) || (ire == sire)) { - /* - * either ire == NULL (the destination cannot be - * resolved) or ire == sire (the gateway cannot be - * resolved). At this point, there are no more routes - * to resolve for the destination, thus we exit. 
- */ - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: " - "can't resolve %s\n", AF_INET6, v6dstp); - } - ip3dbg(("ip_newroute_v6: " - "ire %p, sire %p, first_sire %p\n", - (void *)ire, (void *)sire, (void *)first_sire)); - - if (sire != NULL) { - ire_refrele(sire); - sire = NULL; - } - - if (first_sire != NULL) { - /* - * At least one multirt route has been found - * in the same ip_newroute() call; there is no - * need to report an ICMP error. - * first_sire was not IRE_REFHOLDed. - */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ip_rts_change_v6(RTM_MISS, v6dstp, 0, 0, 0, 0, 0, 0, - RTA_DST, ipst); - goto icmp_err_ret; - } - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - /* - * Verify that the returned IRE does not have either the - * RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) - goto icmp_err_ret; - - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) - */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - if (sire != NULL) { - mutex_enter(&sire->ire_lock); - v6gw = sire->ire_gateway_addr_v6; - mutex_exit(&sire->ire_lock); - ASSERT((sire->ire_type & (IRE_CACHETABLE | - IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } else { - v6gw = ipv6_all_zeros; - } - - /* - * We have a route to reach the destination. Find the - * appropriate ill, then get a source address that matches the - * right scope via ipif_select_source_v6(). 
- * - * If we are here trying to create an IRE_CACHE for an offlink - * destination and have an IRE_CACHE entry for VNI, then use - * ire_stq instead since VNI's queue is a black hole. - * - * Note: While we pick a dst_ill we are really only interested - * in the ill for load spreading. The source ipif is - * determined by source address selection below. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) { - dst_ill = ire->ire_stq->q_ptr; - ill_refhold(dst_ill); - } else { - ill_t *ill = ire->ire_ipif->ipif_ill; - - if (IS_IPMP(ill)) { - dst_ill = - ipmp_illgrp_hold_next_ill(ill->ill_grp); - } else { - dst_ill = ill; - ill_refhold(dst_ill); - } - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_v6 : no dst " - "ill for dst %s\n", AF_INET6, v6dstp); - } - goto icmp_err_ret; - } - - if (ill != NULL && dst_ill != ill && - !IS_IN_SAME_ILLGRP(dst_ill, ill)) { - /* - * We should have found a route matching "ill" - * as we called ire_ftable_lookup_v6 with - * MATCH_IRE_ILL. Rather than asserting when - * there is a mismatch, we just drop the packet. - */ - ip0dbg(("ip_newroute_v6: BOUND_IF failed: " - "dst_ill %s ill %s\n", dst_ill->ill_name, - ill->ill_name)); - goto icmp_err_ret; - } - - /* - * Pick a source address which matches the scope of the - * destination address. - * For RTF_SETSRC routes, the source address is imposed by the - * parent ire (sire). - */ - ASSERT(src_ipif == NULL); - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to the underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, see if the packet's source address is a - * test address, and if so use that test address's ipif for - * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in - * ire_add_v6() can work properly. 
- */ - if (ill != NULL && IS_UNDER_IPMP(ill)) - (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); - - if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER && - !IN6_IS_ADDR_UNSPECIFIED(&v6gw) && - ip6_asp_can_lookup(ipst)) { - /* - * The ire cache entry we're adding is for the - * gateway itself. The source address in this case - * is relative to the gateway's address. - */ - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, &v6gw, - B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid); - if (src_ipif != NULL) - ire_marks |= IRE_MARK_USESRC_CHECK; - } else if (src_ipif == NULL) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * Check that the ipif matching the requested - * source address still exists. - */ - src_ipif = ipif_lookup_addr_v6( - &sire->ire_src_addr_v6, NULL, zoneid, - NULL, NULL, NULL, NULL, ipst); - } - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, - v6dstp, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (src_ipif != NULL) - ire_marks |= IRE_MARK_USESRC_CHECK; - } - } - - if (src_ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_v6: no src for " - "dst %s\n", AF_INET6, v6dstp); - printf("ip_newroute_v6: interface name %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: first hop %s\n", - AF_INET6, &v6gw); - } - ip2dbg(("\tire type %s (%d)\n", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - - /* - * At this point in ip_newroute_v6(), ire is either the - * IRE_CACHE of the next-hop gateway for an off-subnet - * destination or an IRE_INTERFACE type that should be used - * to resolve an on-subnet destination or an on-subnet - * next-hop gateway. - * - * In the IRE_CACHE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. 
This - * means packets using this IRE_CACHE will go out on dst_ill. - * - * 3) The IRE sire will point to the prefix that is the longest - * matching route for the destination. These prefix types - * include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. - * - * The newly created IRE_CACHE entry for the off-subnet - * destination is tied to both the prefix route and the - * interface route used to resolve the next-hop gateway - * via the ire_phandle and ire_ihandle fields, respectively. - * - * In the IRE_INTERFACE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE that - * was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is off-link - * and we will first create the IRE_CACHE for the gateway. - * Next time through ip_newroute_v6, we will create the - * IRE_CACHE for the final destination as described above. - */ - save_ire = ire; - switch (ire->ire_type) { - case IRE_CACHE: { - ire_t *ipif_ire; - - ASSERT(sire != NULL); - if (IN6_IS_ADDR_UNSPECIFIED(&v6gw)) { - mutex_enter(&ire->ire_lock); - v6gw = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - } - /* - * We need 3 ire's to create a new cache ire for an - * off-link destination from the cache ire of the - * gateway. - * - * 1. The prefix ire 'sire' - * 2. The cache ire of the gateway 'ire' - * 3. The interface ire 'ipif_ire' - * - * We have (1) and (2). We lookup (3) below. - * - * If there is no interface route to the gateway, - * it is a race condition, where we found the cache - * but the inteface route has been deleted. 
- */ - ipif_ire = ire_ihandle_lookup_offlink_v6(ire, sire); - if (ipif_ire == NULL) { - ip1dbg(("ip_newroute_v6:" - "ire_ihandle_lookup_offlink_v6 failed\n")); - goto icmp_err_ret; - } - - /* - * Note: the new ire inherits RTF_SETSRC - * and RTF_MULTIRT to propagate these flags from prefix - * to cache. - */ - - /* - * Check cached gateway IRE for any security - * attributes; if found, associate the gateway - * credentials group to the destination IRE. - */ - if ((attrp = save_ire->ire_gw_secattr) != NULL) { - mutex_enter(&attrp->igsa_lock); - if ((gcgrp = attrp->igsa_gcgrp) != NULL) - GCGRP_REFHOLD(gcgrp); - mutex_exit(&attrp->igsa_lock); - } - - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &sire->ire_mask_v6, /* Parent mask */ - sire->ire_phandle, /* Parent handle */ - ipif_ire->ire_ihandle, /* Interface handle */ - sire->ire_flags & /* flags if any */ - (RTF_SETSRC | RTF_MULTIRT), - &(sire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - ire_refrele(ipif_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* - * Prevent sire and ipif_ire from getting deleted. The - * newly created ire is tied to both of them via the - * phandle and ihandle respectively. - */ - IRB_REFHOLD(sire->ire_bucket); - /* Has it been removed already ? */ - if (sire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - IRB_REFHOLD(ipif_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(ipif_ire->ire_bucket); - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - if (ire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire_refrele(save_ire); - - /* Assert that sire is not deleted yet. */ - ASSERT(sire->ire_ptpn != NULL); - IRB_REFRELE(sire->ire_bucket); - - /* Assert that ipif_ire is not deleted yet. */ - ASSERT(ipif_ire->ire_ptpn != NULL); - IRB_REFRELE(ipif_ire->ire_bucket); - ire_refrele(ipif_ire); - - if (copy_mp != NULL) { - /* - * Search for the next unresolved - * multirt route. - */ - copy_mp = NULL; - ipif_ire = NULL; - ire = NULL; - /* re-enter the loop */ - multirt_resolve_next = B_TRUE; - continue; - } - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - case IRE_IF_NORESOLVER: - /* - * We have what we need to build an IRE_CACHE. - * - * handle the Gated case, where we create - * a NORESOLVER route for loopback. - */ - if (dst_ill->ill_net_type != IRE_IF_NORESOLVER) - break; - /* - * TSol note: We are creating the ire cache for the - * destination 'dst'. If 'dst' is offlink, going - * through the first hop 'gw', the security attributes - * of 'dst' must be set to point to the gateway - * credentials of gateway 'gw'. If 'dst' is onlink, it - * is possible that 'dst' is a potential gateway that is - * referenced by some route that has some security - * attributes. Thus in the former case, we need to do a - * gcgrp_lookup of 'gw' while in the latter case we - * need to do gcgrp_lookup of 'dst' itself. 
- */ - ga.ga_af = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) - ga.ga_addr = v6gw; - else - ga.ga_addr = *v6dstp; - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - /* - * Note: the new ire inherits sire flags RTF_SETSRC - * and RTF_MULTIRT to propagate those rules from prefix - * to cache. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - (sire != NULL) ? /* Parent handle */ - sire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? /* flags if any */ - sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT) : 0, - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) - dst = v6gw; - else - dst = *v6dstp; - err = ndp_noresolver(dst_ill, &dst); - if (err != 0) { - ire_refrele(save_ire); - break; - } - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - /* - * In case of MULTIRT, a copy of the current packet - * to send is made to further re-enter the - * loop and attempt another route resolution - */ - if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - - if (copy_mp != NULL) { - /* - * If we found a (no)resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. This ensures that we do not - * omit any (no)resolver despite the priority - * in this call. - * IRE_CACHE, if any, will be processed - * by another thread entering ip_newroute(), - * (on resolver response, for example). - * We use this to force multiple parallel - * resolution as soon as a packet needs to be - * sent. The result is, after one packet - * emission all reachable routes are generally - * resolved. - * Otherwise, complete resolution of MULTIRT - * routes would require several emissions as - * side effect. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - - /* - * Search for the next unresolved multirt - * route. - */ - copy_mp = NULL; - save_ire = NULL; - ire = NULL; - /* re-enter the loop */ - multirt_resolve_next = B_TRUE; - continue; - } - - /* Don't need sire anymore */ - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least we - * found a resolver that can help. 
- */ - dst = *v6dstp; - - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. When - * ire_add_then_send() tries to put the IP dg to dst, - * it will reenter ip_newroute() at which time we will - * find the IRE_CACHE for the gw and create another - * IRE_CACHE above (for dst itself). - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) { - save_dst = dst; - dst = v6gw; - v6gw = ipv6_all_zeros; - } - if (dst_ill->ill_flags & ILLF_XRESOLV) { - /* - * Ask the external resolver to do its thing. - * Make an mblk chain in the following form: - * ARQ_REQ_MBLK-->IRE_MBLK-->packet - */ - mblk_t *ire_mp; - mblk_t *areq_mp; - areq_t *areq; - in6_addr_t *addrp; - - ip1dbg(("ip_newroute_v6:ILLF_XRESOLV\n")); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire = ire_create_mp_v6( - &dst, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, - /* source address */ - &v6gw, /* gateway address */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - 0, - save_ire->ire_ihandle, - /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - ire_refrele(save_ire); - if (ire == NULL) { - ip1dbg(("ip_newroute_v6:" - "ire is NULL\n")); - break; - } - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - /* - * processing a copy of the packet to - * send for further resolution loops - */ - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - ire->ire_marks |= ire_marks; - ire_mp = ire->ire_mp; - /* - * Now create or find an nce for this interface. - * The hw addr will need to to be set from - * the reply to the AR_ENTRY_QUERY that - * we're about to send. 
This will be done in - * ire_add_v6(). - */ - err = ndp_resolver(dst_ill, &dst, mp, zoneid); - switch (err) { - case 0: - /* - * New cache entry created. - * Break, then ask the external - * resolver. - */ - break; - case EINPROGRESS: - /* - * Resolution in progress; - * packet has been queued by - * ndp_resolver(). - */ - ire_delete(ire); - ire = NULL; - /* - * Check if another multirt - * route must be resolved. - */ - if (copy_mp != NULL) { - /* - * If we found a resolver, we - * ignore any trailing top - * priority IRE_CACHE in - * further loops. The reason is - * the same as for noresolver. - */ - multirt_flags &= - ~MULTIRT_CACHEGW; - /* - * Search for the next - * unresolved multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == - M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = - B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - /* - * Transient error; packet will be - * freed. - */ - ire_delete(ire); - ire = NULL; - break; - } - if (err != 0) - break; - /* - * Now set up the AR_ENTRY_QUERY and send it. - */ - areq_mp = ill_arp_alloc(dst_ill, - (uchar_t *)&ipv6_areq_template, - (caddr_t)&dst); - if (areq_mp == NULL) { - ip1dbg(("ip_newroute_v6:" - "areq_mp is NULL\n")); - freemsg(ire_mp); - break; - } - areq = (areq_t *)areq_mp->b_rptr; - addrp = (in6_addr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - addrp = (in6_addr_t *)((char *)areq + - areq->areq_sender_addr_offset); - *addrp = src_ipif->ipif_v6src_addr; - /* - * link the chain, then send up to the resolver. - */ - linkb(areq_mp, ire_mp); - linkb(areq_mp, mp); - ip1dbg(("ip_newroute_v6:" - "putnext to resolver\n")); - putnext(dst_ill->ill_rq, areq_mp); - /* - * Check if another multirt route - * must be resolved. 
- */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - /* - * Non-external resolver case. - * - * TSol note: Please see the note above the - * IRE_IF_NORESOLVER case. - */ - ga.ga_af = AF_INET6; - ga.ga_addr = dst; - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - ire = ire_create_v6( - &dst, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - 0, - save_ire->ire_ihandle, /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - - ire->ire_marks |= ire_marks; - err = ndp_resolver(dst_ill, &dst, first_mp, zoneid); - switch (err) { - case 0: - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - /* - * We have a resolved cache entry, - * add in the IRE. - */ - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - /* - * Check if another multirt route - * must be resolved. - */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - - case EINPROGRESS: - /* - * mp was consumed - presumably queued. - * No need for ire, presumably resolution is - * in progress, and ire will be added when the - * address is resolved. - */ - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ASSERT(ire->ire_nce == NULL); - ire_delete(ire); - ire_refrele(save_ire); - /* - * Check if another multirt route - * must be resolved. - */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. 
- */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - /* Some transient error */ - ASSERT(ire->ire_nce == NULL); - ire_refrele(save_ire); - break; - } - break; - default: - break; - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - } while (multirt_resolve_next); - -err_ret: - ip1dbg(("ip_newroute_v6: dropped\n")); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) { - need_rele = B_TRUE; - ill = dst_ill; - } - if (ill != NULL) { - if (mp->b_prev != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } - - if (need_rele) - ill_refrele(ill); - } else { - if (mp->b_prev != NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - } - /* Did this packet originate externally? */ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - } - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return; - -icmp_err_ret: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) { - need_rele = B_TRUE; - ill = dst_ill; - } - ip1dbg(("ip_newroute_v6: no route\n")); - if (sire != NULL) - ire_refrele(sire); - /* - * We need to set sire to NULL to avoid double freeing if we - * ever goto err_ret from below. 
- */ - sire = NULL; - ip6h = (ip6_t *)mp->b_rptr; - /* Skip ip6i_t header if present */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* Make sure the IPv6 header is present */ - if ((mp->b_wptr - (uchar_t *)ip6h) < - sizeof (ip6i_t) + IPV6_HDR_LEN) { - if (!pullupmsg(mp, sizeof (ip6i_t) + IPV6_HDR_LEN)) { - ip1dbg(("ip_newroute_v6: pullupmsg failed\n")); - goto err_ret; - } - } - mp->b_rptr += sizeof (ip6i_t); - ip6h = (ip6_t *)mp->b_rptr; - } - /* Did this packet originate externally? */ - if (mp->b_prev) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInNoRoutes); - } - mp->b_next = NULL; - mp->b_prev = NULL; - q = WR(q); - } else { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - } - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - /* Failed */ - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (need_rele) - ill_refrele(ill); - return; - } - } - - if (need_rele) - ill_refrele(ill); - - /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP6_DST_UNREACH_NOROUTE if RTF_BLACKHOLE is set. 
- */ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: no route to %s\n", - AF_INET6, v6dstp); - } - icmp_unreachable_v6(WR(q), first_mp, ICMP6_DST_UNREACH_NOROUTE, - B_FALSE, B_FALSE, zoneid, ipst); -} - -/* - * ip_newroute_ipif_v6 is called by ip_wput_v6 and ip_wput_ipsec_out_v6 whenever - * we need to send out a packet to a destination address for which we do not - * have specific routing information. It is only used for multicast packets. - * - * If unspec_src we allow creating an IRE with source address zero. - * ire_send_v6() will delete it after the packet is sent. - */ -void -ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, - const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src, - zoneid_t zoneid) -{ - ire_t *ire = NULL; - ipif_t *src_ipif = NULL; - int err = 0; - ill_t *dst_ill = NULL; - ire_t *save_ire; - ipsec_out_t *io; - ill_t *ill; - mblk_t *first_mp; - ire_t *fire = NULL; - mblk_t *copy_mp = NULL; - const in6_addr_t *ire_v6srcp; - boolean_t probe = B_FALSE; - boolean_t multirt_resolve_next; - boolean_t ipif_held = B_FALSE; - boolean_t ill_held = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. 
- */ - do { - multirt_resolve_next = B_FALSE; - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - ASSERT(ipif != NULL); - ill = ipif->ipif_ill; - - ASSERT(!IN6_IS_ADDR_V4MAPPED(v6dstp)); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: v6dst %s\n", - AF_INET6, v6dstp); - printf("ip_newroute_ipif_v6: if %s, v6 %d\n", - ill->ill_name, ipif->ipif_isv6); - } - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - io = NULL; - } - - /* - * If the interface is a pt-pt interface we look for an - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the - * local_address and the pt-pt destination address. - * Otherwise we just match the local address. - */ - if (!(ill->ill_flags & ILLF_MULTICAST)) { - goto err_ret; - } - - /* - * We check if an IRE_OFFSUBNET for the addr that goes through - * ipif exists. We need it to determine if the RTF_SETSRC and/or - * RTF_MULTIRT flags must be honored. - */ - fire = ipif_lookup_multi_ire_v6(ipif, v6dstp); - ip2dbg(("ip_newroute_ipif_v6: " - "ipif_lookup_multi_ire_v6(" - "ipif %p, dst %08x) = fire %p\n", - (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))), - (void *)fire)); - - ASSERT(src_ipif == NULL); - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to the underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, see if the packet's source address is a - * test address, and if so use that test address's ipif for - * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in - * ire_add_v6() can work properly. - */ - if (IS_UNDER_IPMP(ill)) - probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); - - /* - * Determine the outbound (destination) ill for this route. 
- * If IPMP is not in use, that's the same as our ill. If IPMP - * is in-use and we're on the IPMP interface, or we're on an - * underlying ill but sending data traffic, use a suitable - * destination ill from the group. The latter case covers a - * subtle edge condition with multicast: when we bring up an - * IPv6 data address, we will create an NCE on an underlying - * interface, and send solitications to ff02::1, which would - * take us through here, and cause us to create an IRE for - * ff02::1. To meet our defined semantics for multicast (and - * ensure there aren't unexpected echoes), that IRE needs to - * use the IPMP group's nominated multicast interface. - * - * Note: the source ipif is determined by source address - * selection later. - */ - if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) { - ill_t *ipmp_ill; - ipmp_illgrp_t *illg; - - if (IS_UNDER_IPMP(ill)) { - ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); - } else { - ipmp_ill = ill; - ill_refhold(ipmp_ill); /* for symmetry */ - } - - if (ipmp_ill == NULL) - goto err_ret; - - illg = ipmp_ill->ill_grp; - if (IN6_IS_ADDR_MULTICAST(v6dstp)) - dst_ill = ipmp_illgrp_hold_cast_ill(illg); - else - dst_ill = ipmp_illgrp_hold_next_ill(illg); - - ill_refrele(ipmp_ill); - } else { - dst_ill = ill; - ill_refhold(dst_ill); /* for symmetry */ - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif_v6: " - "no dst ill for dst %s\n", - AF_INET6, v6dstp); - } - goto err_ret; - } - - /* - * Pick a source address which matches the scope of the - * destination address. - * For RTF_SETSRC routes, the source address is imposed by the - * parent ire (fire). - */ - - if (src_ipif == NULL && fire != NULL && - (fire->ire_flags & RTF_SETSRC)) { - /* - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6, - NULL, zoneid, NULL, NULL, NULL, NULL, ipst); - } - - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, v6dstp, - B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); - } - - if (src_ipif == NULL) { - if (!unspec_src) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n", - AF_INET6, v6dstp); - printf(" through interface %s\n", - dst_ill->ill_name); - } - goto err_ret; - } - ire_v6srcp = &ipv6_all_zeros; - src_ipif = ipif; - ipif_refhold(src_ipif); - } else { - ire_v6srcp = &src_ipif->ipif_v6src_addr; - } - - ire = ipif_to_ire_v6(ipif); - if (ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: v6src %s\n", - AF_INET6, &ipif->ipif_v6lcl_addr); - printf("ip_newroute_ipif_v6: " - "if %s\n", dst_ill->ill_name); - } - goto err_ret; - } - if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) - goto err_ret; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - ip1dbg(("ip_newroute_ipif_v6: interface type %s (%d),", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg(" address %s\n", - AF_INET6, &ire->ire_src_addr_v6); - } - save_ire = ire; - ip2dbg(("ip_newroute_ipif: ire %p, ipif %p\n", - (void *)ire, (void *)ipif)); - - if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) { - /* - * an IRE_OFFSUBET was looked up - * on that interface. - * this ire has RTF_MULTIRT flag, - * so the resolution loop - * will be re-entered to resolve - * additional routes on other - * interfaces. For that purpose, - * a copy of the packet is - * made at this point. - */ - fire->ire_last_used_time = lbolt; - copy_mp = copymsg(first_mp); - if (copy_mp) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: { - /* - * We have what we need to build an IRE_CACHE. 
- * - * handle the Gated case, where we create - * a NORESOLVER route for loopback. - */ - if (dst_ill->ill_net_type != IRE_IF_NORESOLVER) - break; - /* - * The newly created ire will inherit the flags of the - * parent ire, if any. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - ire_v6srcp, /* source address */ - NULL, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - NULL, - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (fire != NULL) ? - (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - err = ndp_noresolver(dst_ill, v6dstp); - if (err != 0) { - ire_refrele(save_ire); - break; - } - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that uses - * ire_ftable_lookup_v6() to find - * an IRE_INTERFACE for the group. 
- * In the multirt case, - * ire_lookup_multi_v6() then invokes - * ire_multirt_lookup_v6() to find - * the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6(v6dstp, - zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, ipif %p\n", - ntohl(V4_PART_OF_V6((*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - } - case IRE_IF_RESOLVER: { - - ASSERT(dst_ill->ill_isv6); - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - */ - /* - * the newly created ire will inherit the flags of the - * parent ire, if any. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - ire_v6srcp, /* source address */ - NULL, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - NULL, - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (fire != NULL) ? - (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - /* Resolve and add ire to the ctable */ - err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid); - switch (err) { - case 0: - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - /* - * We have a resolved cache entry, - * add in the IRE. - */ - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that - * uses ire_ftable_lookup_v6() - * to find an IRE_INTERFACE for - * the group. In the multirt - * case, ire_lookup_multi_v6() - * then invokes - * ire_multirt_lookup_v6() to - * find the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6( - v6dstp, zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, " - "ipif %p\n", - ntohl(V4_PART_OF_V6( - (*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - - case EINPROGRESS: - /* - * mp was consumed - presumably queued. - * No need for ire, presumably resolution is - * in progress, and ire will be added when the - * address is resolved. 
- */ - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire_delete(ire); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that - * uses ire_ftable_lookup_v6() - * to find an IRE_INTERFACE for - * the group. In the multirt - * case, ire_lookup_multi_v6() - * then invokes - * ire_multirt_lookup_v6() to - * find the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6( - v6dstp, zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, " - "ipif %p\n", - ntohl(V4_PART_OF_V6( - (*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - default: - /* Some transient error */ - ire_refrele(save_ire); - break; - } - break; - } - default: - break; - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - } while (multirt_resolve_next); - -err_ret: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (ire != NULL) - ire_refrele(ire); - if (fire != NULL) - ire_refrele(fire); - if (ipif != NULL && ipif_held) - ipif_refrele(ipif); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - - /* Multicast - no point in trying to 
generate ICMP error */ - if (dst_ill != NULL) { - ill = dst_ill; - ill_held = B_TRUE; - } - if (mp->b_prev || mp->b_next) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } - ip1dbg(("ip_newroute_ipif_v6: dropped\n")); - mp->b_next = NULL; - mp->b_prev = NULL; - freemsg(first_mp); - if (ill_held) - ill_refrele(ill); -} - -/* * Parse and process any hop-by-hop or destination options. * * Assumes that q is an ill read queue so that ICMP errors for link-local @@ -6067,23 +2854,16 @@ err_ret: * Current code checks for each opt_type (other than pads) if it is in * the expected nexthdr (hbh or dest) */ -static int -ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_stack_t *ipst) +int +ip_process_options_v6(mblk_t *mp, ip6_t *ip6h, + uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira) { uint8_t opt_type; uint_t optused; int ret = 0; - mblk_t *first_mp; const char *errtype; - zoneid_t zoneid; - ill_t *ill = q->q_ptr; - ipif_t *ipif; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; while (optlen != 0) { opt_type = *optptr; @@ -6178,13 +2958,9 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * around (i.e. before AH processing). * If we've done AH... stop now. 
*/ - if (first_mp != mp) { - ipsec_in_t *ii; - - ii = (ipsec_in_t *)first_mp->b_rptr; - if (ii->ipsec_in_ah_sa != NULL) - break; - } + if ((ira->ira_flags & IRAF_IPSEC_SECURE) && + ira->ira_ipsec_ah_sa != NULL) + break; oh = (struct ip6_opt_home_address *)optptr; /* Check total length and alignment */ @@ -6217,8 +2993,6 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, /* FALLTHROUGH */ opt_error: /* Determine which zone should send error */ - zoneid = ipif_lookup_addr_zoneid_v6( - &ip6h->ip6_dst, ill, ipst); switch (IP6OPT_TYPE(opt_type)) { case IP6OPT_TYPE_SKIP: optused = 2 + optptr[1]; @@ -6232,48 +3006,33 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip1dbg(("ip_process_options_v6: %s " "opt 0x%x; packet dropped\n", errtype, opt_type)); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + freemsg(mp); return (-1); case IP6OPT_TYPE_ICMP: - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - return (-1); - } - icmp_param_problem_v6(WR(q), first_mp, + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_FALSE, zoneid, ipst); + B_FALSE, ira); return (-1); case IP6OPT_TYPE_FORCEICMP: - /* - * If we don't have a zone and the dst - * addr is multicast, then pick a zone - * based on the inbound interface. 
- */ - if (zoneid == ALL_ZONES && - IN6_IS_ADDR_MULTICAST( - &ip6h->ip6_dst)) { - ipif = ipif_select_source_v6( - ill, &ip6h->ip6_src, - B_TRUE, - IPV6_PREFER_SRC_DEFAULT, - ALL_ZONES); - if (ipif != NULL) { - zoneid = - ipif->ipif_zoneid; - ipif_refrele(ipif); - } - } - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - return (-1); - } - icmp_param_problem_v6(WR(q), first_mp, + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_TRUE, zoneid, ipst); + B_TRUE, ira); return (-1); default: ASSERT(0); @@ -6287,14 +3046,10 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, bad_opt: /* Determine which zone should send error */ - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - } else { - icmp_param_problem_v6(WR(q), first_mp, ICMP6_PARAMPROB_OPTION, - (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_FALSE, zoneid, ipst); - } + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, + (uint32_t)(optptr - (uint8_t *)ip6h), + B_FALSE, ira); return (-1); } @@ -6302,10 +3057,11 @@ bad_opt: * Process a routing header that is not yet empty. * Because of RFC 5095, we now reject all route headers. */ -static void -ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, - ill_t *ill, mblk_t *hada_mp) +void +ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, + ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; ASSERT(rth->ip6r_segleft != 0); @@ -6314,19 +3070,15 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, /* XXX Check for source routed out same interface? 
*/ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); - freemsg(hada_mp); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); freemsg(mp); return; } - if (hada_mp != NULL) { - freemsg(hada_mp); - freemsg(mp); - return; - } - /* Sent by forwarding path, and router is global zone */ - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, - (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), B_FALSE, - B_FALSE, GLOBAL_ZONEID, ipst); + + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, + (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), + B_FALSE, ira); } /* @@ -6335,21 +3087,10 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, void ip_rput_v6(queue_t *q, mblk_t *mp) { - mblk_t *first_mp; - mblk_t *hada_mp = NULL; - ip6_t *ip6h; - boolean_t ll_multicast = B_FALSE; - boolean_t mctl_present = B_FALSE; ill_t *ill; - struct iocblk *iocp; - uint_t flags = 0; - mblk_t *dl_mp; - ip_stack_t *ipst; - int check; ill = (ill_t *)q->q_ptr; - ipst = ill->ill_ipst; - if (ill->ill_state_flags & ILL_CONDEMNED) { + if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { union DL_primitives *dl; dl = (union DL_primitives *)mp->b_rptr; @@ -6367,241 +3108,14 @@ ip_rput_v6(queue_t *q, mblk_t *mp) return; } } + if (DB_TYPE(mp) == M_DATA) { + struct mac_header_info_s mhi; - dl_mp = NULL; - switch (mp->b_datap->db_type) { - case M_DATA: { - int hlen; - uchar_t *ucp; - struct ether_header *eh; - dl_unitdata_ind_t *dui; - - /* - * This is a work-around for CR 6451644, a bug in Nemo. It - * should be removed when that problem is fixed. 
- */ - if (ill->ill_mactype == DL_ETHER && - (hlen = MBLKHEAD(mp)) >= sizeof (struct ether_header) && - (ucp = mp->b_rptr)[-1] == (ETHERTYPE_IPV6 & 0xFF) && - ucp[-2] == (ETHERTYPE_IPV6 >> 8)) { - if (hlen >= sizeof (struct ether_vlan_header) && - ucp[-5] == 0 && ucp[-6] == 0x81) - ucp -= sizeof (struct ether_vlan_header); - else - ucp -= sizeof (struct ether_header); - /* - * If it's a group address, then fabricate a - * DL_UNITDATA_IND message. - */ - if ((ll_multicast = (ucp[0] & 1)) != 0 && - (dl_mp = allocb(DL_UNITDATA_IND_SIZE + 16, - BPRI_HI)) != NULL) { - eh = (struct ether_header *)ucp; - dui = (dl_unitdata_ind_t *)dl_mp->b_rptr; - DB_TYPE(dl_mp) = M_PROTO; - dl_mp->b_wptr = (uchar_t *)(dui + 1) + 16; - dui->dl_primitive = DL_UNITDATA_IND; - dui->dl_dest_addr_length = 8; - dui->dl_dest_addr_offset = DL_UNITDATA_IND_SIZE; - dui->dl_src_addr_length = 8; - dui->dl_src_addr_offset = DL_UNITDATA_IND_SIZE + - 8; - dui->dl_group_address = 1; - ucp = (uchar_t *)(dui + 1); - if (ill->ill_sap_length > 0) - ucp += ill->ill_sap_length; - bcopy(&eh->ether_dhost, ucp, 6); - bcopy(&eh->ether_shost, ucp + 8, 6); - ucp = (uchar_t *)(dui + 1); - if (ill->ill_sap_length < 0) - ucp += 8 + ill->ill_sap_length; - bcopy(&eh->ether_type, ucp, 2); - bcopy(&eh->ether_type, ucp + 8, 2); - } - } - break; - } - - case M_PROTO: - case M_PCPROTO: - if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive != - DL_UNITDATA_IND) { - /* Go handle anything other than data elsewhere. */ - ip_rput_dlpi(q, mp); - return; - } - ll_multicast = ip_get_dlpi_mbcast(ill, mp); - - /* Save the DLPI header. 
*/ - dl_mp = mp; - mp = mp->b_cont; - dl_mp->b_cont = NULL; - break; - case M_BREAK: - panic("ip_rput_v6: got an M_BREAK"); - /*NOTREACHED*/ - case M_IOCACK: - iocp = (struct iocblk *)mp->b_rptr; - switch (iocp->ioc_cmd) { - case DL_IOC_HDR_INFO: - ill = (ill_t *)q->q_ptr; - ill_fastpath_ack(ill, mp); - return; - default: - putnext(q, mp); - return; - } - /* FALLTHRU */ - case M_ERROR: - case M_HANGUP: - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - freemsg(mp); - return; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); - return; - case M_CTL: - if ((MBLKL(mp) > sizeof (int)) && - ((da_ipsec_t *)mp->b_rptr)->da_type == IPHADA_M_CTL) { - ASSERT(MBLKL(mp) >= sizeof (da_ipsec_t)); - mctl_present = B_TRUE; - break; - } - putnext(q, mp); - return; - case M_IOCNAK: - iocp = (struct iocblk *)mp->b_rptr; - switch (iocp->ioc_cmd) { - case DL_IOC_HDR_INFO: - ip_rput_other(NULL, q, mp, NULL); - return; - default: - break; - } - /* FALLTHRU */ - default: - putnext(q, mp); - return; - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, - (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp)); - /* - * if db_ref > 1 then copymsg and free original. Packet may be - * changed and do not want other entity who has a reference to this - * message to trip over the changes. This is a blind change because - * trying to catch all places that might change packet is too - * difficult (since it may be a module above this one). 
- */ - if (mp->b_datap->db_ref > 1) { - mblk_t *mp1; - - mp1 = copymsg(mp); - freemsg(mp); - if (mp1 == NULL) { - first_mp = NULL; - goto discard; - } - mp = mp1; - } - first_mp = mp; - if (mctl_present) { - hada_mp = first_mp; - mp = first_mp->b_cont; - } - - if ((check = ip_check_v6_mblk(mp, ill)) == IP6_MBLK_HDR_ERR) { - freemsg(mp); - return; - } - - ip6h = (ip6_t *)mp->b_rptr; - - /* - * ip:::receive must see ipv6 packets with a full header, - * and so is placed after the IP6_MBLK_HDR_ERR check. - */ - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 0); - - if (check != IP6_MBLK_OK) { - freemsg(mp); - return; - } - - DTRACE_PROBE4(ip6__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, first_mp); - - FW_HOOKS6(ipst->ips_ip6_physical_in_event, - ipst->ips_ipv6firewall_physical_in, - ill, NULL, ip6h, first_mp, mp, ll_multicast, ipst); - - DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - /* - * Attach any necessary label information to this packet. - */ - if (is_system_labeled() && !tsol_get_pkt_label(mp, IPV6_VERSION)) { - if (ip6opt_ls != 0) - ip0dbg(("tsol_get_pkt_label v6 failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto discard; - } - - /* IP observability hook. */ - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t dzone; - - dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst, - ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, - ill, ipst); - } - - if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) == - IPV6_DEFAULT_VERS_AND_FLOW) { - /* - * It may be a bit too expensive to do this mapped address - * check here, but in the interest of robustness, it seems - * like the correct place. - * TODO: Avoid this check for e.g. 
connected TCP sockets - */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src)) { - ip1dbg(("ip_rput_v6: pkt with mapped src addr\n")); - goto discard; - } - - if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src)) { - ip1dbg(("ip_rput_v6: pkt with loopback src")); - goto discard; - } else if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) { - ip1dbg(("ip_rput_v6: pkt with loopback dst")); - goto discard; - } - - flags |= (ll_multicast ? IP6_IN_LLMCAST : 0); - ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp, dl_mp); + ip_mdata_to_mhi(ill, mp, &mhi); + ip_input_v6(ill, NULL, mp, &mhi); } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); - goto discard; + ip_rput_notdata(ill, mp); } - freemsg(dl_mp); - return; - -discard: - if (dl_mp != NULL) - freeb(dl_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); } /* @@ -6703,1507 +3217,72 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) } /* - * Path for AH if options are present. If this is the first time we are - * sending a datagram to AH, allocate a IPSEC_IN message and prepend it. - * Otherwise, just fanout. Return value answers the boolean question: - * "Did I consume the mblk you sent me?" + * Path for AH if options are present. + * Returns NULL if the mblk was consumed. * * Sometimes AH needs to be done before other IPv6 headers for security * reasons. This function (and its ipsec_needs_processing_v6() above) * indicates if that is so, and fans out to the appropriate IPsec protocol * for the datagram passed in. 
*/ -static boolean_t -ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, - ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid) +mblk_t * +ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp; uint8_t nexthdr; - ipsec_in_t *ii = NULL; ah_t *ah; - ipsec_status_t ipsec_rc; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT((hada_mp == NULL) || (!mctl_present)); + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - switch (ipsec_needs_processing_v6( - (mctl_present ? first_mp->b_cont : first_mp), &nexthdr)) { + switch (ipsec_needs_processing_v6(mp, &nexthdr)) { case IPSEC_MEMORY_ERROR: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return (B_TRUE); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return (NULL); case IPSEC_HDR_DONT_PROCESS: - return (B_FALSE); + return (mp); } /* Default means send it to AH! */ ASSERT(nexthdr == IPPROTO_AH); - if (!mctl_present) { - mp = first_mp; - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (first_mp == NULL) { - ip1dbg(("ipsec_early_ah_v6: IPSEC_IN " - "allocation failure.\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(mp); - return (B_TRUE); - } - /* - * Store the ill_index so that when we come back - * from IPSEC we ride on the same queue. - */ - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - } - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - ASSERT(ii != NULL); - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_early_ah_v6: " - "caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - } if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, zoneid, ipst); - return (B_TRUE); - } - - ah = ipsec_inbound_ah_sa(first_mp, ns); - if (ah == NULL) - return (B_TRUE); - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(first_mp, ah); - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, inill, NULL); - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - break; - case IPSEC_STATUS_PENDING: - /* no action needed */ - break; - } - return (B_TRUE); -} - -static boolean_t -ip_iptun_input_v6(mblk_t *ipsec_mp, mblk_t *data_mp, - size_t hdr_len, uint8_t nexthdr, zoneid_t zoneid, ill_t *ill, - ip_stack_t *ipst) -{ - conn_t *connp; - - ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp); - - connp = ipcl_classify_v6(data_mp, nexthdr, hdr_len, zoneid, ipst); - if (connp != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp, - NULL); - CONN_DEC_REF(connp); - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Validate the IPv6 mblk for alignment. 
- */ -int -ip_check_v6_mblk(mblk_t *mp, ill_t *ill) -{ - int pkt_len, ip6_len; - ip6_t *ip6h = (ip6_t *)mp->b_rptr; - - /* check for alignment and full IPv6 header */ - if (!OK_32PTR((uchar_t *)ip6h) || - (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) { - if (!pullupmsg(mp, IPV6_HDR_LEN)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("ip_rput_v6: pullupmsg failed\n")); - return (IP6_MBLK_HDR_ERR); - } - ip6h = (ip6_t *)mp->b_rptr; - } - - ASSERT(OK_32PTR((uchar_t *)ip6h) && - (mp->b_wptr - (uchar_t *)ip6h) >= IPV6_HDR_LEN); - - if (mp->b_cont == NULL) - pkt_len = mp->b_wptr - mp->b_rptr; - else - pkt_len = msgdsize(mp); - ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; - - /* - * Check for bogus (too short packet) and packet which - * was padded by the link layer. - */ - if (ip6_len != pkt_len) { - ssize_t diff; - - if (ip6_len > pkt_len) { - ip1dbg(("ip_rput_data_v6: packet too short %d %d\n", - ip6_len, pkt_len)); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); - return (IP6_MBLK_LEN_ERR); - } - diff = (ssize_t)(pkt_len - ip6_len); - - if (!adjmsg(mp, -diff)) { - ip1dbg(("ip_rput_data_v6: adjmsg failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (IP6_MBLK_LEN_ERR); - } - - /* - * adjmsg may have freed an mblk from the chain, hence - * invalidate any hw checksum here. This will force IP to - * calculate the checksum in sw, but only for this packet. - */ - DB_CKSUMFLAGS(mp) = 0; - } - return (IP6_MBLK_OK); -} - -/* - * ip_rput_data_v6 -- received IPv6 packets in M_DATA messages show up here. - * ip_rput_v6 has already verified alignment, the min length, the version, - * and db_ref = 1. - * - * The ill passed in (the arg named inill) is the ill that the packet - * actually arrived on. We need to remember this when saving the - * input interface index into potential IPV6_PKTINFO data in - * ip_add_info_v6(). - * - * This routine doesn't free dl_mp; that's the caller's responsibility on - * return. 
(Note that the callers are complex enough that there's no tail - * recursion here anyway.) - */ -void -ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, - uint_t flags, mblk_t *hada_mp, mblk_t *dl_mp) -{ - ire_t *ire = NULL; - ill_t *ill = inill; - ill_t *outill; - uint8_t *whereptr; - uint8_t nexthdr; - uint16_t remlen; - uint_t prev_nexthdr_offset; - uint_t used; - size_t old_pkt_len; - size_t pkt_len; - uint16_t ip6_len; - uint_t hdr_len; - boolean_t mctl_present; - mblk_t *first_mp; - mblk_t *first_mp1; - boolean_t no_forward; - ip6_hbh_t *hbhhdr; - boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); - conn_t *connp; - uint32_t ports; - zoneid_t zoneid = GLOBAL_ZONEID; - uint16_t hck_flags, reass_hck_flags; - uint32_t reass_sum; - boolean_t cksum_err; - mblk_t *mp1; - ip_stack_t *ipst = inill->ill_ipst; - ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; - in6_addr_t lb_dst; - int lb_ret = ILB_PASSED; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (hada_mp != NULL) { - /* - * It's an IPsec accelerated packet. - * Keep a pointer to the data attributes around until - * we allocate the ipsecinfo structure. - */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_data_v6: inbound HW accelerated IPsec pkt\n")); - hada_mp->b_cont = NULL; - /* - * Since it is accelerated, it came directly from - * the ill. 
- */ - ASSERT(mctl_present == B_FALSE); - ASSERT(mp->b_datap->db_type != M_CTL); - } - - ip6h = (ip6_t *)mp->b_rptr; - ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; - old_pkt_len = pkt_len = ip6_len; - - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - else - hck_flags = 0; - - /* Clear checksum flags in case we need to forward */ - DB_CKSUMFLAGS(mp) = 0; - reass_sum = reass_hck_flags = 0; - - nexthdr = ip6h->ip6_nxt; - - prev_nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt - - (uchar_t *)ip6h); - whereptr = (uint8_t *)&ip6h[1]; - remlen = pkt_len - IPV6_HDR_LEN; /* Track how much is left */ - - /* Process hop by hop header options */ - if (nexthdr == IPPROTO_HOPOPTS) { - uint_t ehdrlen; - uint8_t *optptr; - - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + MIN_EHDR_LEN > mp->b_wptr) { - if (!pullupmsg(mp, IPV6_HDR_LEN + MIN_EHDR_LEN)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - } - hbhhdr = (ip6_hbh_t *)whereptr; - nexthdr = hbhhdr->ip6h_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - (uint8_t *)ip6h); - ehdrlen = 8 * (hbhhdr->ip6h_len + 1); - - if (remlen < ehdrlen) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + ehdrlen > mp->b_wptr) { - if (!pullupmsg(mp, IPV6_HDR_LEN + ehdrlen)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - hbhhdr = (ip6_hbh_t *)whereptr; - } - - optptr = whereptr + 2; - whereptr += ehdrlen; - remlen -= ehdrlen; - switch (ip_process_options_v6(q, first_mp, ip6h, optptr, - ehdrlen - 2, IPPROTO_HOPOPTS, ipst)) { - case -1: - /* - * Packet has been consumed and any - * needed ICMP messages sent. 
- */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - freemsg(hada_mp); - return; - case 0: - /* no action needed */ - break; - case 1: - /* Known router alert */ - goto ipv6forus; - } - } - - /* - * On incoming v6 multicast packets we will bypass the ire table, - * and assume that the read queue corresponds to the targetted - * interface. - * - * The effect of this is the same as the IPv4 original code, but is - * much cleaner I think. See ip_rput for how that was done. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, pkt_len); - - /* - * So that we don't end up with dups, only one ill in an IPMP - * group is nominated to receive multicast data traffic. - * However, link-locals on any underlying interfaces will have - * joined their solicited-node multicast addresses and we must - * accept those packets. (We don't attempt to precisely - * filter out duplicate solicited-node multicast packets since - * e.g. an IPMP interface and underlying interface may have - * the same solicited-node multicast address.) Note that we - * won't generally have duplicates because we only issue a - * DL_ENABMULTI_REQ on one interface in a group; the exception - * is when PHYI_MULTI_BCAST is set. - */ - if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast && - !IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { - goto drop_pkt; - } - - /* - * XXX TODO Give to mrouted to for multicast forwarding. 
- */ - if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, - ALL_ZONES) == NULL) { - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_rput_data_v6: got mcast packet" - " which is not for us: %s\n", AF_INET6, - &ip6h->ip6_dst); - } -drop_pkt: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_rput_data_v6: multicast for us: %s\n", - AF_INET6, &ip6h->ip6_dst); - } - zoneid = GLOBAL_ZONEID; - goto ipv6forus; - } - - /* - * Find an ire that matches destination. For link-local addresses - * we have to match the ill. - * TBD for site local addresses. - */ - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { - ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL, - IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - } else { - if (ilb_has_rules(ilbs) && ILB_SUPP_L4(nexthdr)) { - /* For convenience, we just pull up the mblk. */ - if (mp->b_cont != NULL) { - if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - hdr_len = pkt_len - remlen; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, nexthdr, - whereptr, &lb_dst); - if (lb_ret == ILB_DROPPED) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - } - - ire = ire_cache_lookup_v6((lb_ret == ILB_BALANCED) ? &lb_dst : - &ip6h->ip6_dst, ALL_ZONES, msg_getlabel(mp), ipst); - - if (ire != NULL && ire->ire_stq != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible from the - * global zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup_v6(&ip6h->ip6_dst, - GLOBAL_ZONEID, msg_getlabel(mp), ipst); - } - } - - if (ire == NULL) { - /* - * No matching IRE found. 
Mark this packet as having - * originated externally. - */ - if (!(ill->ill_flags & ILLF_ROUTER) || ll_multicast) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (!(ill->ill_flags & ILLF_ROUTER)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - } - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ip6h->ip6_hops <= 1) { - if (hada_mp != NULL) - goto hada_drop; - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded_v6(WR(q), first_mp, - ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE, - GLOBAL_ZONEID, ipst); - return; - } - /* - * Per RFC 3513 section 2.5.2, we must not forward packets with - * an unspecified source address. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - mp->b_prev = (mblk_t *)(uintptr_t) - ill->ill_phyint->phyint_ifindex; - ip_newroute_v6(q, mp, (lb_ret == ILB_BALANCED) ? &lb_dst : - &ip6h->ip6_dst, &ip6h->ip6_src, - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ? ill : NULL, - GLOBAL_ZONEID, ipst); - return; + ip_proto_not_sup(mp, ira); + return (NULL); } - /* we have a matching IRE */ - if (ire->ire_stq != NULL) { - /* - * To be quicker, we may wish not to chase pointers - * (ire->ire_ipif->ipif_ill...) and instead store the - * forwarding policy in the ire. An unfortunate side- - * effect of this would be requiring an ire flush whenever - * the ILLF_ROUTER flag changes. For now, chase pointers - * once and store in the boolean no_forward. - * - * This appears twice to keep it out of the non-forwarding, - * yes-it's-for-us-on-the-right-interface case. - */ - no_forward = ((ill->ill_flags & - ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0); - ASSERT(first_mp == mp); - /* - * This ire has a send-to queue - forward the packet. 
- */ - if (no_forward || ll_multicast || (hada_mp != NULL)) { - freemsg(hada_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (no_forward) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - } - freemsg(mp); - ire_refrele(ire); - return; - } - /* - * ipIfStatsHCInForwDatagrams should only be increment if there - * will be an attempt to forward the packet, which is why we - * increment after the above condition has been checked. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - if (ip6h->ip6_hops <= 1) { - ip1dbg(("ip_rput_data_v6: hop limit expired.\n")); - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded_v6(WR(q), mp, - ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE, - GLOBAL_ZONEID, ipst); - ire_refrele(ire); - return; - } - /* - * Per RFC 3513 section 2.5.2, we must not forward packets with - * an unspecified source address. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(mp); - ire_refrele(ire); - return; - } - - if (is_system_labeled()) { - mblk_t *mp1; - - if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(mp); - ire_refrele(ire); - return; - } - /* Size may have changed */ - mp = mp1; - ip6h = (ip6_t *)mp->b_rptr; - pkt_len = msgdsize(mp); - } - - if (pkt_len > ire->ire_max_frag) { - int max_frag = ire->ire_max_frag; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTooBigErrors); - /* - * Handle labeled packet resizing. 
- */ - if (is_system_labeled()) { - max_frag = tsol_pmtu_adjust(mp, max_frag, - pkt_len - old_pkt_len, AF_INET6); - } - - /* Sent by forwarding path, and router is global zone */ - icmp_pkt2big_v6(WR(q), mp, max_frag, - ll_multicast, B_TRUE, GLOBAL_ZONEID, ipst); - ire_refrele(ire); - return; - } + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return (NULL); + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira); + if (mp == NULL) { /* - * Check to see if we're forwarding the packet to a - * different link from which it came. If so, check the - * source and destination addresses since routers must not - * forward any packets with link-local source or - * destination addresses to other links. Otherwise (if - * we're forwarding onto the same link), conditionally send - * a redirect message. + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. */ - if (ire->ire_rfq != q && - !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - freemsg(mp); - ire_refrele(ire); - return; - } - /* TBD add site-local check at site boundary? */ - } else if (ipst->ips_ipv6_send_redirects) { - in6_addr_t *v6targ; - in6_addr_t gw_addr_v6; - ire_t *src_ire_v6 = NULL; - - /* - * Don't send a redirect when forwarding a source - * routed packet. - */ - if (ip_source_routed_v6(ip6h, mp, ipst)) - goto forward; - - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (!IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - v6targ = &gw_addr_v6; - /* - * We won't send redirects to a router - * that doesn't have a link local - * address, but will forward. 
- */ - if (!IN6_IS_ADDR_LINKLOCAL(v6targ)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - goto forward; - } - } else { - v6targ = &ip6h->ip6_dst; - } - - src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src, - NULL, NULL, IRE_INTERFACE, ire->ire_ipif, NULL, - GLOBAL_ZONEID, 0, NULL, - MATCH_IRE_IPIF | MATCH_IRE_TYPE, - ipst); - - if (src_ire_v6 != NULL) { - /* - * The source is directly connected. - */ - mp1 = copymsg(mp); - if (mp1 != NULL) { - icmp_send_redirect_v6(WR(q), - mp1, v6targ, &ip6h->ip6_dst, - ill, B_FALSE); - } - ire_refrele(src_ire_v6); - } - } - -forward: - /* Hoplimit verified above */ - ip6h->ip6_hops--; - - outill = ire->ire_ipif->ipif_ill; - - DTRACE_PROBE4(ip6__forwarding__start, - ill_t *, inill, ill_t *, outill, - ip6_t *, ip6h, mblk_t *, mp); - - FW_HOOKS6(ipst->ips_ip6_forwarding_event, - ipst->ips_ipv6firewall_forwarding, - inill, outill, ip6h, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp); - - if (mp != NULL) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - ip_xmit_v6(mp, ire, 0, NULL, B_FALSE, NULL); - } - IRE_REFRELE(ire); - return; - } - - /* - * Need to put on correct queue for reassembly to find it. - * No need to use put() since reassembly has its own locks. - * Note: multicast packets and packets destined to addresses - * assigned to loopback (ire_rfq is NULL) will be reassembled on - * the arriving ill. Unlike the IPv4 case, enabling strict - * destination multihoming will prevent accepting packets - * addressed to an IRE_LOCAL on lo0. 
- */ - if (ire->ire_rfq != q) { - if ((ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill)) - == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - ill = (ill_t *)q->q_ptr; - ASSERT(ill != NULL); - } - } - - zoneid = ire->ire_zoneid; - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - /* Don't use the ire after this point, we'll NULL it out to be sure. */ - ire_refrele(ire); - ire = NULL; -ipv6forus: - /* - * Looks like this packet is for us one way or another. - * This is where we'll process destination headers etc. - */ - for (; ; ) { - switch (nexthdr) { - case IPPROTO_TCP: { - uint16_t *up; - uint32_t sum; - int offset; - - hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("tcp hada drop\n")); - goto hada_drop; - } - - - /* TCP needs all of the TCP header */ - if (remlen < TCP_MIN_HEADER_LENGTH) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + TCP_MIN_HEADER_LENGTH > mp->b_wptr) { - if (!pullupmsg(mp, - hdr_len + TCP_MIN_HEADER_LENGTH)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - /* - * Extract the offset field from the TCP header. - */ - offset = ((uchar_t *)ip6h)[hdr_len + 12] >> 4; - if (offset != 5) { - if (offset < 5) { - ip1dbg(("ip_rput_data_v6: short " - "TCP data offset")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - /* - * There must be TCP options. - * Make sure we can grab them. 
- */ - offset <<= 2; - if (remlen < offset) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + offset > mp->b_wptr) { - if (!pullupmsg(mp, - hdr_len + offset)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - } - - up = (uint16_t *)&ip6h->ip6_src; - /* - * TCP checksum calculation. First sum up the - * pseudo-header fields: - * - Source IPv6 address - * - Destination IPv6 address - * - TCP payload length - * - TCP protocol ID - */ - sum = htons(IPPROTO_TCP + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - mp1 = mp->b_cont; - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP6_STAT(ipst, ip6_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) - ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - - if (hck_flags & HCK_FULLCKSUM) { - IP6_STAT(ipst, - ip6_tcp_in_full_hw_cksum_err); - } else if (hck_flags & HCK_PARTIALCKSUM) { - IP6_STAT(ipst, - ip6_tcp_in_part_hw_cksum_err); - } else { - IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err); - } - freemsg(first_mp); - return; - } -tcp_fanout: - ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill, - (flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE| - IP_FF_IPINFO), hdr_len, mctl_present, zoneid); - return; - } - case IPPROTO_SCTP: - { - sctp_hdr_t *sctph; - uint32_t calcsum, pktsum; - uint_t hdr_len = pkt_len - remlen; - sctp_stack_t *sctps; - - sctps = inill->ill_ipst->ips_netstack->netstack_sctp; - - /* SCTP needs all of the SCTP header */ - if (remlen < sizeof (*sctph)) { - goto pkt_too_short; - } - if (whereptr + sizeof (*sctph) > mp->b_wptr) { - ASSERT(mp->b_cont != NULL); - if 
(!pullupmsg(mp, hdr_len + sizeof (*sctph))) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - - sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len); - /* checksum */ - pktsum = sctph->sh_chksum; - sctph->sh_chksum = 0; - calcsum = sctp_cksum(mp, hdr_len); - if (calcsum != pktsum) { - BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); - freemsg(mp); - return; - } - sctph->sh_chksum = pktsum; - ports = *(uint32_t *)(mp->b_rptr + hdr_len); - if ((connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst, - ports, zoneid, mp, sctps)) == NULL) { - ip_fanout_sctp_raw(first_mp, ill, - (ipha_t *)ip6h, B_FALSE, ports, - mctl_present, - (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO), - B_TRUE, zoneid); - return; - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - sctp_input(connp, (ipha_t *)ip6h, mp, first_mp, ill, - B_FALSE, mctl_present); - return; - } - case IPPROTO_UDP: { - uint16_t *up; - uint32_t sum; - - hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("udp hada drop\n")); - goto hada_drop; - } - - /* Verify that at least the ports are present */ - if (remlen < UDPH_SIZE) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + UDPH_SIZE > mp->b_wptr) { - if (!pullupmsg(mp, hdr_len + UDPH_SIZE)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - - /* - * Before going through the regular checksum - * calculation, make sure the received checksum - * is non-zero. RFC 2460 says, a 0x0000 checksum - * in a UDP packet (within IPv6 packet) is invalid - * and should be replaced by 0xffff. This makes - * sense as regular checksum calculation will - * pass for both the cases i.e. 0x0000 and 0xffff. - * Removing one of the case makes error detection - * stronger. 
- */ - - if (((udpha_t *)whereptr)->uha_checksum == 0) { - /* 0x0000 checksum is invalid */ - ip1dbg(("ip_rput_data_v6: Invalid UDP " - "checksum value 0x0000\n")); - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsInCksumErrs); - freemsg(first_mp); - return; - } - - up = (uint16_t *)&ip6h->ip6_src; - - /* - * UDP checksum calculation. First sum up the - * pseudo-header fields: - * - Source IPv6 address - * - Destination IPv6 address - * - UDP payload length - * - UDP protocol ID - */ - - sum = htons(IPPROTO_UDP + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - if (reass_hck_flags != 0) { - hck_flags = reass_hck_flags; - - IP_CKSUM_RECV_REASS(hck_flags, - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - sum, reass_sum, cksum_err); - } else { - mp1 = mp->b_cont; - - IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) - ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - mp, mp1, cksum_err); - } - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP6_STAT(ipst, ip6_in_sw_cksum); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsInCksumErrs); - - if (hck_flags & HCK_FULLCKSUM) - IP6_STAT(ipst, - ip6_udp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP6_STAT(ipst, - ip6_udp_in_part_hw_cksum_err); - else - IP6_STAT(ipst, ip6_udp_in_sw_cksum_err); - - freemsg(first_mp); - return; - } - goto udp_fanout; - } - case IPPROTO_ICMPV6: { - uint16_t *up; - uint32_t sum; - uint_t hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("icmp hada drop\n")); - goto hada_drop; - } - - up = (uint16_t *)&ip6h->ip6_src; - sum = htons(IPPROTO_ICMPV6 + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - sum = 
IP_CSUM(mp, hdr_len, sum); - if (sum != 0) { - /* IPv6 ICMP checksum failed */ - ip1dbg(("ip_rput_data_v6: ICMPv6 checksum " - "failed %x\n", - sum)); - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - freemsg(first_mp); - return; - } - - icmp_fanout: - /* Check variable for testing applications */ - if (ipst->ips_ipv6_drop_inbound_icmpv6) { - freemsg(first_mp); - return; - } - /* - * Assume that there is always at least one conn for - * ICMPv6 (in.ndpd) i.e. don't optimize the case - * where there is no conn. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ilm_t *ilm; - ilm_walker_t ilw; - - ASSERT(!IS_LOOPBACK(ill)); - /* - * In the multicast case, applications may have - * joined the group from different zones, so we - * need to deliver the packet to each of them. - * Loop through the multicast memberships - * structures (ilm) on the receive ill and send - * a copy of the packet up each matching one. - */ - ilm = ilm_walker_start(&ilw, inill); - for (; ilm != NULL; - ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL( - &ilm->ilm_v6addr, &ip6h->ip6_dst)) - continue; - if (!ipif_lookup_zoneid( - ilw.ilw_walk_ill, ilm->ilm_zoneid, - IPIF_UP, NULL)) - continue; - - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 == NULL) - continue; - icmp_inbound_v6(q, first_mp1, - ilw.ilw_walk_ill, inill, - hdr_len, mctl_present, 0, - ilm->ilm_zoneid, dl_mp); - } - ilm_walker_finish(&ilw); - } else { - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, - inill, hdr_len, mctl_present, 0, - zoneid, dl_mp); - } - goto proto_fanout; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if (ip_iptun_input_v6(mctl_present ? 
first_mp : NULL, - mp, pkt_len - remlen, nexthdr, zoneid, ill, ipst)) { - return; - } - /* - * If there was no IP tunnel data-link bound to - * receive this packet, then we fall through to - * allow potential raw sockets bound to either of - * these protocols to pick it up. - */ - /* FALLTHRU */ -proto_fanout: - default: { - /* - * Handle protocols with which IPv6 is less intimate. - */ - uint_t proto_flags = IP_FF_RAWIP|IP_FF_IPINFO; - - if (hada_mp != NULL) { - ip0dbg(("default hada drop\n")); - goto hada_drop; - } - - /* - * Enable sending ICMP for "Unknown" nexthdr - * case. i.e. where we did not FALLTHRU from - * IPPROTO_ICMPV6 processing case above. - * If we did FALLTHRU, then the packet has already been - * processed for IPPF, don't process it again in - * ip_fanout_proto_v6; set IP6_NO_IPPOLICY in the - * flags - */ - if (nexthdr != IPPROTO_ICMPV6) - proto_flags |= IP_FF_SEND_ICMP; - else - proto_flags |= IP6_NO_IPPOLICY; - - ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill, - nexthdr, prev_nexthdr_offset, (flags|proto_flags), - mctl_present, zoneid); - return; - } - - case IPPROTO_DSTOPTS: { - uint_t ehdrlen; - uint8_t *optptr; - ip6_dest_t *desthdr; - - /* If packet is too short, look no further */ - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - - /* Check if AH is present. */ - if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - inill, hada_mp, zoneid)) { - return; - } - - /* - * Reinitialize pointers, as ipsec_early_ah_v6() does - * complete pullups. We don't have to do more pullups - * as a result. 
- */ - whereptr = (uint8_t *)((uintptr_t)mp->b_rptr + - (uintptr_t)(whereptr - ((uint8_t *)ip6h))); - ip6h = (ip6_t *)mp->b_rptr; - - desthdr = (ip6_dest_t *)whereptr; - nexthdr = desthdr->ip6d_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - - (uint8_t *)ip6h); - ehdrlen = 8 * (desthdr->ip6d_len + 1); - if (remlen < ehdrlen) - goto pkt_too_short; - optptr = whereptr + 2; - /* - * Note: XXX This code does not seem to make - * distinction between Destination Options Header - * being before/after Routing Header which can - * happen if we are at the end of source route. - * This may become significant in future. - * (No real significant Destination Options are - * defined/implemented yet ). - */ - switch (ip_process_options_v6(q, first_mp, ip6h, optptr, - ehdrlen - 2, IPPROTO_DSTOPTS, ipst)) { - case -1: - /* - * Packet has been consumed and any needed - * ICMP errors sent. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - freemsg(hada_mp); - return; - case 0: - /* No action needed continue */ - break; - case 1: - /* - * Unnexpected return value - * (Router alert is a Hop-by-Hop option) - */ -#ifdef DEBUG - panic("ip_rput_data_v6: router " - "alert hbh opt indication in dest opt"); - /*NOTREACHED*/ -#else - freemsg(hada_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; -#endif - } - used = ehdrlen; - break; - } - case IPPROTO_FRAGMENT: { - ip6_frag_t *fraghdr; - size_t no_frag_hdr_len; - - if (hada_mp != NULL) { - ip0dbg(("frag hada drop\n")); - goto hada_drop; - } - - ASSERT(first_mp == mp); - if (remlen < sizeof (ip6_frag_t)) - goto pkt_too_short; - - if (mp->b_cont != NULL && - whereptr + sizeof (ip6_frag_t) > mp->b_wptr) { - if (!pullupmsg(mp, - pkt_len - remlen + sizeof (ip6_frag_t))) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - } - - fraghdr = (ip6_frag_t *)whereptr; - used = 
(uint_t)sizeof (ip6_frag_t); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); - - /* - * Invoke the CGTP (multirouting) filtering module to - * process the incoming packet. Packets identified as - * duplicates must be discarded. Filtering is active - * only if the the ip_cgtp_filter ndd variable is - * non-zero. - */ - if (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL) { - int cgtp_flt_pkt; - netstackid_t stackid; - - stackid = ipst->ips_netstack->netstack_stackid; - - cgtp_flt_pkt = - ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6( - stackid, inill->ill_phyint->phyint_ifindex, - ip6h, fraghdr); - if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { - freemsg(mp); - return; - } - } - - /* Restore the flags */ - DB_CKSUMFLAGS(mp) = hck_flags; - - mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr, - remlen - used, &prev_nexthdr_offset, - &reass_sum, &reass_hck_flags); - if (mp == NULL) { - /* Reassembly is still pending */ - return; - } - /* The first mblk are the headers before the frag hdr */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); - - first_mp = mp; /* mp has most likely changed! */ - no_frag_hdr_len = mp->b_wptr - mp->b_rptr; - ip6h = (ip6_t *)mp->b_rptr; - nexthdr = ((char *)ip6h)[prev_nexthdr_offset]; - whereptr = mp->b_rptr + no_frag_hdr_len; - remlen = ntohs(ip6h->ip6_plen) + - (uint16_t)(IPV6_HDR_LEN - no_frag_hdr_len); - pkt_len = msgdsize(mp); - used = 0; - break; - } - case IPPROTO_HOPOPTS: { - if (hada_mp != NULL) { - ip0dbg(("hop hada drop\n")); - goto hada_drop; - } - /* - * Illegal header sequence. - * (Hop-by-hop headers are processed above - * and required to immediately follow IPv6 header) - */ - icmp_param_problem_v6(WR(q), first_mp, - ICMP6_PARAMPROB_NEXTHEADER, - prev_nexthdr_offset, - B_FALSE, B_FALSE, zoneid, ipst); - return; - } - case IPPROTO_ROUTING: { - uint_t ehdrlen; - ip6_rthdr_t *rthdr; - - /* If packet is too short, look no further */ - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - - /* Check if AH is present. 
*/ - if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - inill, hada_mp, zoneid)) { - return; - } - - /* - * Reinitialize pointers, as ipsec_early_ah_v6() does - * complete pullups. We don't have to do more pullups - * as a result. - */ - whereptr = (uint8_t *)((uintptr_t)mp->b_rptr + - (uintptr_t)(whereptr - ((uint8_t *)ip6h))); - ip6h = (ip6_t *)mp->b_rptr; - - rthdr = (ip6_rthdr_t *)whereptr; - nexthdr = rthdr->ip6r_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - - (uint8_t *)ip6h); - ehdrlen = 8 * (rthdr->ip6r_len + 1); - if (remlen < ehdrlen) - goto pkt_too_short; - if (rthdr->ip6r_segleft != 0) { - /* Not end of source route */ - if (ll_multicast) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(mp); - return; - } - ip_process_rthdr(q, mp, ip6h, rthdr, ill, - hada_mp); - return; - } - used = ehdrlen; - break; - } - case IPPROTO_AH: - case IPPROTO_ESP: { - /* - * Fast path for AH/ESP. If this is the first time - * we are sending a datagram to AH/ESP, allocate - * a IPSEC_IN message and prepend it. Otherwise, - * just fanout. - */ - - ipsec_in_t *ii; - int ipsec_rc; - ipsec_stack_t *ipss; - - ipss = ipst->ips_netstack->netstack_ipsec; - if (!mctl_present) { - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, - ipst->ips_netstack); - if (first_mp == NULL) { - ip1dbg(("ip_rput_data_v6: IPSEC_IN " - "allocation failure.\n")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - /* - * Store the ill_index so that when we come back - * from IPSEC we ride on the same queue. - */ - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_data_v6: " - "caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - hada_mp = NULL; - } - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - } - - if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, - zoneid, ipst); - return; - } - - /* select inbound SA and have IPsec process the pkt */ - if (nexthdr == IPPROTO_ESP) { - esph_t *esph = ipsec_inbound_esp_sa(first_mp, - ipst->ips_netstack); - if (esph == NULL) - return; - ASSERT(ii->ipsec_in_esp_sa != NULL); - ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != - NULL); - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( - first_mp, esph); - } else { - ah_t *ah = ipsec_inbound_ah_sa(first_mp, - ipst->ips_netstack); - if (ah == NULL) - return; - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != - NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( - first_mp, ah); - } - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; - } - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, inill, NULL); - return; - } - case IPPROTO_NONE: - /* All processing is done. Count as "delivered". 
*/ - freemsg(hada_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - return; - } - whereptr += used; - ASSERT(remlen >= used); - remlen -= used; - } - /* NOTREACHED */ - -pkt_too_short: - ip1dbg(("ip_rput_data_v6: packet too short %d %lu %d\n", - ip6_len, pkt_len, remlen)); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); - freemsg(hada_mp); - freemsg(first_mp); - return; -udp_fanout: - if (mctl_present || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - connp = NULL; - } else { - connp = ipcl_classify_v6(mp, IPPROTO_UDP, hdr_len, zoneid, - ipst); - if ((connp != NULL) && (connp->conn_upq == NULL)) { - CONN_DEC_REF(connp); - connp = NULL; - } - } - - if (connp == NULL) { - uint32_t ports; - - ports = *(uint32_t *)(mp->b_rptr + hdr_len + - UDP_PORTS_OFFSET); - IP6_STAT(ipst, ip6_udp_slow_path); - ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, inill, - (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO), mctl_present, - zoneid); - return; - } - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - CONN_DEC_REF(connp); - return; - } - - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - } - - if (connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { - mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } + return (NULL); } - IP6_STAT(ipst, ip6_udp_fast_path); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - - CONN_DEC_REF(connp); - freemsg(hada_mp); - return; - -hada_drop: - ip1dbg(("ip_rput_data_v6: malformed accelerated packet\n")); - /* 
IPsec kstats: bump counter here */ - freemsg(hada_mp); - freemsg(first_mp); + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return (NULL); } /* * Reassemble fragment. * When it returns a completed message the first mblk will only contain - * the headers prior to the fragment header. - * - * prev_nexthdr_offset is an offset indication of where the nexthdr field is - * of the preceding header. This is needed to patch the previous header's - * nexthdr field when reassembly completes. + * the headers prior to the fragment header, with the nexthdr value updated + * to be the header after the fragment header. */ -static mblk_t * -ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, - ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset, - uint32_t *cksum_val, uint16_t *cksum_flags) +mblk_t * +ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h, + ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira) { uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; @@ -8225,12 +3304,12 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, boolean_t pruned = B_FALSE; uint32_t sum_val; uint16_t sum_flags; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - - if (cksum_val != NULL) - *cksum_val = 0; - if (cksum_flags != NULL) - *cksum_flags = 0; + uint_t prev_nexthdr_offset; + uint8_t prev_nexthdr; + uint8_t *ptr; + uint32_t packet_size; /* * We utilize hardware computed checksum info only for UDP since @@ -8238,8 +3317,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(inill != NULL); - if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) && + ASSERT(ira->ira_rill != NULL); + if (nexthdr == IPPROTO_UDP && dohwcksum && + ILL_HCKSUM_CAPABLE(ira->ira_rill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -8253,8 +3333,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if ((sum_flags & HCK_PARTIALCKSUM) && (mp1 == NULL || mp1->b_cont == NULL) && - offset >= (uint16_t)DB_CKSUMSTART(mp) && - ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) { + offset >= DB_CKSUMSTART(mp) && + ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { uint32_t adj; /* * Partial checksum has been calculated by hardware @@ -8281,6 +3361,59 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, DB_CKSUMFLAGS(mp) = 0; /* + * Determine the offset (from the begining of the IP header) + * of the nexthdr value which has IPPROTO_FRAGMENT. We use + * this when removing the fragment header from the packet. + * This packet consists of the IPv6 header, a potential + * hop-by-hop options header, a potential pre-routing-header + * destination options header, and a potential routing header. 
+ */ + prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; + prev_nexthdr = ip6h->ip6_nxt; + ptr = (uint8_t *)&ip6h[1]; + + if (prev_nexthdr == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbh_hdr; + uint_t hdr_len; + + hbh_hdr = (ip6_hbh_t *)ptr; + hdr_len = 8 * (hbh_hdr->ip6h_len + 1); + prev_nexthdr = hbh_hdr->ip6h_nxt; + prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } + if (prev_nexthdr == IPPROTO_DSTOPTS) { + ip6_dest_t *dest_hdr; + uint_t hdr_len; + + dest_hdr = (ip6_dest_t *)ptr; + hdr_len = 8 * (dest_hdr->ip6d_len + 1); + prev_nexthdr = dest_hdr->ip6d_nxt; + prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } + if (prev_nexthdr == IPPROTO_ROUTING) { + ip6_rthdr_t *rthdr; + uint_t hdr_len; + + rthdr = (ip6_rthdr_t *)ptr; + prev_nexthdr = rthdr->ip6r_nxt; + prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt + - (uint8_t *)ip6h; + hdr_len = 8 * (rthdr->ip6r_len + 1); + ptr += hdr_len; + } + if (prev_nexthdr != IPPROTO_FRAGMENT) { + /* Can't handle other headers before the fragment header */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + return (NULL); + } + + /* * Note: Fragment offset in header is in 8-octet units. * Clearing least significant 3 bits not only extracts * it but also gets it in units of octets. @@ -8293,17 +3426,10 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * of eight? 
*/ if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) { - zoneid_t zoneid; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - return (NULL); - } - icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); + (char *)ip6h), B_FALSE, ira); return (NULL); } @@ -8319,17 +3445,11 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * greater than IP_MAXPACKET - the max payload size? */ if (end > IP_MAXPACKET) { - zoneid_t zoneid; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - return (NULL); - } - icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, + ip_drop_input("Reassembled packet too large", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); + (char *)ip6h), B_FALSE, ira); return (NULL); } @@ -8368,11 +3488,17 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * there is anything on the reassembly queue, the timer will * be running. 
*/ - msg_len = MBLKSIZE(mp); + /* Handle vnic loopback of fragments */ + if (mp->b_datap->db_ref > 2) + msg_len = 0; + else + msg_len = MBLKSIZE(mp); + tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; - msg_len += MBLKSIZE(tail_mp); + if (tail_mp->b_datap->db_ref <= 2) + msg_len += MBLKSIZE(tail_mp); } /* * If the reassembly list for this ILL will get too big @@ -8381,6 +3507,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= ipst->ips_ip_reass_queue_bytes) { + DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len, + uint_t, ill->ill_frag_count, + uint_t, ipst->ips_ip_reass_queue_bytes); ill_frag_prune(ill, (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : (ipst->ips_ip_reass_queue_bytes - msg_len)); @@ -8443,6 +3572,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, mp1 = allocb(sizeof (*ipf), BPRI_MED); if (!mp1) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); partial_reass_done: mutex_exit(&ipfb->ipfb_lock); @@ -8512,7 +3642,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, */ ipf->ipf_end = end; ipf->ipf_nf_hdr_len = hdr_length; - ipf->ipf_prev_nexthdr_offset = *prev_nexthdr_offset; + ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset; } else { /* Hard case, hole at the beginning. 
*/ ipf->ipf_tail_mp = NULL; @@ -8603,7 +3733,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if (ipf->ipf_prev_nexthdr_offset == 0) { ipf->ipf_nf_hdr_len = hdr_length; ipf->ipf_prev_nexthdr_offset = - *prev_nexthdr_offset; + prev_nexthdr_offset; } } /* Save current byte count */ @@ -8654,7 +3784,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * header */ nexthdr = ipf->ipf_protocol; - *prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset; + prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset; ipfp = ipf->ipf_ptphn; /* We need to supply these to caller */ @@ -8685,7 +3815,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, reass_done: if (hdr_length < sizeof (ip6_frag_t)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - ip1dbg(("ip_rput_frag_v6: bad packet\n")); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + ip1dbg(("ip_input_fragment_v6: bad packet\n")); freemsg(mp); return (NULL); } @@ -8708,8 +3839,9 @@ reass_done: mblk_t *nmp; if (!(nmp = dupb(mp))) { + ip1dbg(("ip_input_fragment_v6: dupb failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("ip_rput_frag_v6: dupb failed\n")); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); return (NULL); } @@ -8720,19 +3852,24 @@ reass_done: mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t); ip6h = (ip6_t *)mp->b_rptr; - ((char *)ip6h)[*prev_nexthdr_offset] = nexthdr; + ((char *)ip6h)[prev_nexthdr_offset] = nexthdr; /* Restore original IP length in header. */ - ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN)); + packet_size = msgdsize(mp); + ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN)); /* Record the ECN info. 
*/ ip6h->ip6_vcf &= htonl(0xFFCFFFFF); ip6h->ip6_vcf |= htonl(ecn_info << 20); - /* Reassembly is successful; return checksum information if needed */ - if (cksum_val != NULL) - *cksum_val = sum_val; - if (cksum_flags != NULL) - *cksum_flags = sum_flags; + /* Update the receive attributes */ + ira->ira_pktlen = packet_size; + ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t); + ira->ira_protocol = nexthdr; + + /* Reassembly is successful; set checksum information in packet */ + DB_CKSUM16(mp) = (uint16_t)sum_val; + DB_CKSUMFLAGS(mp) = sum_flags; + DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length; return (mp); } @@ -8742,7 +3879,7 @@ reass_done: * header. */ static in6_addr_t -pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) +pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) { ip6_rthdr0_t *rt0; int segleft, numaddr; @@ -8758,7 +3895,7 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) numaddr = rt0->ip6r0_len / 2; if ((rt0->ip6r0_len & 0x1) || - whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr || + (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) || (segleft > rt0->ip6r0_len / 2)) { /* * Corrupt packet. Either the routing header length is odd @@ -8784,11 +3921,13 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) * Walk through the options to see if there is a routing header. * If present get the destination which is the last address of * the option. + * mp needs to be provided in cases when the extension headers might span + * b_cont; mp is never modified by this function. 
*/ in6_addr_t -ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) +ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment) { - mblk_t *current_mp = mp; + const mblk_t *current_mp = mp; uint8_t nexthdr; uint8_t *whereptr; int ehdrlen; @@ -8798,7 +3937,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) ehdrlen = sizeof (ip6_t); /* We assume at least the IPv6 base header is within one mblk. */ - ASSERT(mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen); + ASSERT(mp == NULL || + (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen)); rv = ip6h->ip6_dst; nexthdr = ip6h->ip6_nxt; @@ -8819,7 +3959,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) * All IPv6 extension headers have the next-header in byte * 0, and the (length - 8) in 8-byte-words. */ - while (whereptr + ehdrlen >= current_mp->b_wptr) { + while (current_mp != NULL && + whereptr + ehdrlen >= current_mp->b_wptr) { ehdrlen -= (current_mp->b_wptr - whereptr); current_mp = current_mp->b_cont; if (current_mp == NULL) { @@ -8833,7 +3974,7 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) whereptr += ehdrlen; nexthdr = *whereptr; - ASSERT(whereptr + 1 < current_mp->b_wptr); + ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr); ehdrlen = (*(whereptr + 1) + 1) * 8; } @@ -8845,7 +3986,7 @@ done: /* * ip_source_routed_v6: - * This function is called by redirect code in ip_rput_data_v6 to + * This function is called by redirect code (called from ip_input_v6) to * know whether this packet is source routed through this node i.e * whether this node (router) is part of the journey. 
This * function is called under two cases : @@ -8922,22 +4063,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) */ if (rthdr->ip6r0_segleft > 0 || rthdr->ip6r0_segleft == 0) { - ire_t *ire = NULL; - numaddr = rthdr->ip6r0_len / 2; addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); addrptr += (numaddr - (rthdr->ip6r0_segleft + 1)); if (addrptr != NULL) { - ire = ire_ctable_lookup_v6(addrptr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, - ipst); - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v6(addrptr, ipst) == IRE_LOCAL) return (B_TRUE); - } - ip1dbg(("ip_source_routed_v6: No ire found\n")); + ip1dbg(("ip_source_routed_v6: Not local\n")); } } /* FALLTHRU */ @@ -8948,2387 +4081,19 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) } /* - * ip_wput_v6 -- Packets sent down from transport modules show up here. - * Assumes that the following set of headers appear in the first - * mblk: - * ip6i_t (if present) CAN also appear as a separate mblk. - * ip6_t - * Any extension headers - * TCP/UDP/SCTP header (if present) - * The routine can handle an ICMPv6 header that is not in the first mblk. - * - * The order to determine the outgoing interface is as follows: - * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. - * 2. If q is an ill queue and (link local or multicast destination) then - * use that ill. - * 3. If IPV6_BOUND_IF has been set use that ill. - * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise - * look for the best IRE match for the unspecified group to determine - * the ill. - * 5. For unicast: Just do an IRE lookup for the best match. - * - * arg2 is always a queue_t *. - * When that queue is an ill_t (i.e. q_next != NULL), then arg must be - * the zoneid. - * When that queue is not an ill_t, then arg must be a conn_t pointer. 
- */ -void -ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) -{ - conn_t *connp = NULL; - queue_t *q = (queue_t *)arg2; - ire_t *ire = NULL; - ire_t *sctp_ire = NULL; - ip6_t *ip6h; - in6_addr_t *v6dstp; - ill_t *ill = NULL; - ipif_t *ipif; - ip6i_t *ip6i; - int cksum_request; /* -1 => normal. */ - /* 1 => Skip TCP/UDP/SCTP checksum */ - /* Otherwise contains insert offset for checksum */ - int unspec_src; - boolean_t do_outrequests; /* Increment OutRequests? */ - mib2_ipIfStatsEntry_t *mibptr; - int match_flags = MATCH_IRE_ILL; - mblk_t *first_mp; - boolean_t mctl_present; - ipsec_out_t *io; - boolean_t multirt_need_resolve = B_FALSE; - mblk_t *copy_mp = NULL; - int err = 0; - int ip6i_flags = 0; - zoneid_t zoneid; - ill_t *saved_ill = NULL; - boolean_t conn_lock_held; - boolean_t need_decref = B_FALSE; - ip_stack_t *ipst; - - if (q->q_next != NULL) { - ill = (ill_t *)q->q_ptr; - ipst = ill->ill_ipst; - } else { - connp = (conn_t *)arg; - ASSERT(connp != NULL); - ipst = connp->conn_netstack->netstack_ip; - } - - /* - * Highest bit in version field is Reachability Confirmation bit - * used by NUD in ip_xmit_v6(). - */ -#ifdef _BIG_ENDIAN -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) -#else -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) -#endif - - /* - * M_CTL comes from 5 places - * - * 1) TCP sends down IPSEC_OUT(M_CTL) for detached connections - * both V4 and V6 datagrams. - * - * 2) AH/ESP sends down M_CTL after doing their job with both - * V4 and V6 datagrams. - * - * 3) NDP callbacks when nce is resolved and IPSEC_OUT has been - * attached. - * - * 4) Notifications from an external resolver (for XRESOLV ifs) - * - * 5) AH/ESP send down IPSEC_CTL(M_CTL) to be relayed to hardware for - * IPsec hardware acceleration support. - * - * We need to handle (1)'s IPv6 case and (3) here. For the - * IPv4 case in (1), and (2), IPSEC processing has already - * started. 
The code in ip_wput() already knows how to handle - * continuing IPSEC processing (for IPv4 and IPv6). All other - * M_CTLs (including case (4)) are passed on to ip_wput_nondata() - * for handling. - */ - first_mp = mp; - mctl_present = B_FALSE; - io = NULL; - - /* Multidata transmit? */ - if (DB_TYPE(mp) == M_MULTIDATA) { - /* - * We should never get here, since all Multidata messages - * originating from tcp should have been directed over to - * tcp_multisend() in the first place. - */ - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } else if (DB_TYPE(mp) == M_CTL) { - uint32_t mctltype = 0; - uint32_t mlen = MBLKL(first_mp); - - mp = mp->b_cont; - mctl_present = B_TRUE; - io = (ipsec_out_t *)first_mp->b_rptr; - - /* - * Validate this M_CTL message. The only three types of - * M_CTL messages we expect to see in this code path are - * ipsec_out_t or ipsec_in_t structures (allocated as - * ipsec_info_t unions), or ipsec_ctl_t structures. - * The ipsec_out_type and ipsec_in_type overlap in the two - * data structures, and they are either set to IPSEC_OUT - * or IPSEC_IN depending on which data structure it is. - * ipsec_ctl_t is an IPSEC_CTL. - * - * All other M_CTL messages are sent to ip_wput_nondata() - * for handling. - */ - if (mlen >= sizeof (io->ipsec_out_type)) - mctltype = io->ipsec_out_type; - - if ((mlen == sizeof (ipsec_ctl_t)) && - (mctltype == IPSEC_CTL)) { - ip_output(arg, first_mp, arg2, caller); - return; - } - - if ((mlen < sizeof (ipsec_info_t)) || - (mctltype != IPSEC_OUT && mctltype != IPSEC_IN) || - mp == NULL) { - ip_wput_nondata(NULL, q, first_mp, NULL); - return; - } - /* NDP callbacks have q_next non-NULL. That's case #3. */ - if (q->q_next == NULL) { - ip6h = (ip6_t *)mp->b_rptr; - /* - * For a freshly-generated TCP dgram that needs IPV6 - * processing, don't call ip_wput immediately. We can - * tell this by the ipsec_out_proc_begin. 
In-progress - * IPSEC_OUT messages have proc_begin set to TRUE, - * and we want to send all IPSEC_IN messages to - * ip_wput() for IPsec processing or finishing. - */ - if (mctltype == IPSEC_IN || - IPVER(ip6h) != IPV6_VERSION || - io->ipsec_out_proc_begin) { - mibptr = &ipst->ips_ip6_mib; - goto notv6; - } - } - } else if (DB_TYPE(mp) != M_DATA) { - ip_wput_nondata(NULL, q, mp, NULL); - return; - } - - ip6h = (ip6_t *)mp->b_rptr; - - if (IPVER(ip6h) != IPV6_VERSION) { - mibptr = &ipst->ips_ip6_mib; - goto notv6; - } - - if (is_system_labeled() && DB_TYPE(mp) == M_DATA && - (connp == NULL || !connp->conn_ulp_labeled)) { - cred_t *cr; - pid_t pid; - - if (connp != NULL) { - ASSERT(CONN_CRED(connp) != NULL); - cr = BEST_CRED(mp, connp, &pid); - err = tsol_check_label_v6(cr, &mp, - connp->conn_mac_mode, ipst, pid); - } else if ((cr = msg_getcred(mp, &pid)) != NULL) { - err = tsol_check_label_v6(cr, &mp, CONN_MAC_DEFAULT, - ipst, pid); - } - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (err != 0) { - DTRACE_PROBE3( - tsol_ip_log_drop_checklabel_ip6, char *, - "conn(1), failed to check/update mp(2)", - conn_t, connp, mblk_t, mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - if (q->q_next != NULL) { - /* - * We don't know if this ill will be used for IPv6 - * until the ILLF_IPV6 flag is set via SIOCSLIFNAME. - * ipif_set_values() sets the ill_isv6 flag to true if - * ILLF_IPV6 is set. If the ill_isv6 flag isn't true, - * just drop the packet. 
- */ - if (!ill->ill_isv6) { - ip1dbg(("ip_wput_v6: Received an IPv6 packet before " - "ILLF_IPV6 was set\n")); - freemsg(first_mp); - return; - } - /* For uniformity do a refhold */ - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - freemsg(first_mp); - return; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - mibptr = ill->ill_ip_mib; - - ASSERT(mibptr != NULL); - unspec_src = 0; - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - zoneid = (zoneid_t)(uintptr_t)arg; - } else { - ASSERT(connp != NULL); - zoneid = connp->conn_zoneid; - - /* is queue flow controlled? */ - if ((q->q_first || connp->conn_draining) && - (caller == IP_WPUT)) { - /* - * 1) TCP sends down M_CTL for detached connections. - * 2) AH/ESP sends down M_CTL. - * - * We don't flow control either of the above. Only - * UDP and others are flow controlled for which we - * can't have a M_CTL. - */ - ASSERT(first_mp == mp); - (void) putq(q, mp); - return; - } - mibptr = &ipst->ips_ip6_mib; - unspec_src = connp->conn_unspec_src; - do_outrequests = B_TRUE; - if (mp->b_flag & MSGHASREF) { - mp->b_flag &= ~MSGHASREF; - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - SCTP_EXTRACT_IPINFO(mp, sctp_ire); - need_decref = B_TRUE; - } - - /* - * If there is a policy, try to attach an ipsec_out in - * the front. At the end, first_mp either points to a - * M_DATA message or IPSEC_OUT message linked to a - * M_DATA message. We have to do it now as we might - * lose the "conn" if we go through ip_newroute. - */ - if (!mctl_present && - (connp->conn_out_enforce_policy || - connp->conn_latch != NULL)) { - ASSERT(first_mp == mp); - /* XXX Any better way to get the protocol fast ? 
*/ - if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, - connp->conn_ulp, ipst->ips_netstack)) == NULL)) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } else { - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - io = (ipsec_out_t *)first_mp->b_rptr; - } - } - } - - /* check for alignment and full IPv6 header */ - if (!OK_32PTR((uchar_t *)ip6h) || - (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) { - ip0dbg(("ip_wput_v6: bad alignment or length\n")); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - v6dstp = &ip6h->ip6_dst; - cksum_request = -1; - ip6i = NULL; - - /* - * Once neighbor discovery has completed, ndp_process() will provide - * locally generated packets for which processing can be reattempted. - * In these cases, connp is NULL and the original zone is part of a - * prepended ipsec_out_t. - */ - if (io != NULL) { - /* - * When coming from icmp_input_v6, the zoneid might not match - * for the loopback case, because inside icmp_input_v6 the - * queue_t is a conn queue from the sending side. - */ - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - } - - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This is an ip6i_t header followed by an ip6_hdr. - * Check which fields are set. - * - * When the packet comes from a transport we should have - * all needed headers in the first mblk. However, when - * going through ip_newroute*_v6 the ip6i might be in - * a separate mblk when we return here. In that case - * we pullup everything to ensure that extension and transport - * headers "stay" in the first mblk. 
- */ - ip6i = (ip6i_t *)ip6h; - ip6i_flags = ip6i->ip6i_flags; - - ASSERT((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t) || - ((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN)); - - if ((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t)) { - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg failed\n")); - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - v6dstp = &ip6h->ip6_dst; - ip6i = (ip6i_t *)ip6h; - } - ip6h = (ip6_t *)&ip6i[1]; - - /* - * Advance rptr past the ip6i_t to get ready for - * transmitting the packet. However, if the packet gets - * passed to ip_newroute*_v6 then rptr is moved back so - * that the ip6i_t header can be inspected when the - * packet comes back here after passing through - * ire_add_then_send. - */ - mp->b_rptr = (uchar_t *)ip6h; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - if (ill != NULL) - ill_refrele(ill); - ill = ill_lookup_on_ifindex(ip6i->ip6i_ifindex, 1, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_v6: bad ifindex %d\n", - ip6i->ip6i_ifindex)); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - mibptr = ill->ill_ip_mib; - /* - * Preserve the index so that when we return from - * IPSEC processing, we know where to send the packet. 
- */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_ill_index = ip6i->ip6i_ifindex; - } - } - if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) { - cred_t *cr = msg_getcred(mp, NULL); - - /* rpcmod doesn't send down db_credp for UDP packets */ - if (cr == NULL) { - if (connp != NULL) - cr = connp->conn_cred; - else - cr = ill->ill_credp; - } - - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)); - if (secpolicy_net_rawaccess(cr) != 0) { - /* - * Use IPCL_ZONEID to honor SO_ALLZONES. - */ - ire = ire_route_lookup_v6(&ip6h->ip6_src, - 0, 0, (IRE_LOCAL|IRE_LOOPBACK), NULL, - NULL, connp != NULL ? - IPCL_ZONEID(connp) : zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); - if (ire == NULL) { - if (do_outrequests) - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_v6: bad source " - "addr\n")); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ire_refrele(ire); - } - /* No need to verify again when using ip_newroute */ - ip6i->ip6i_flags &= ~IP6I_VERIFY_SRC; - } - if (!(ip6i->ip6i_flags & IP6I_NEXTHOP)) { - /* - * Make sure they match since ip_newroute*_v6 etc might - * (unknown to them) inspect ip6i_nexthop when - * they think they access ip6_dst. - */ - ip6i->ip6i_nexthop = ip6h->ip6_dst; - } - if (ip6i->ip6i_flags & IP6I_NO_ULP_CKSUM) - cksum_request = 1; - if (ip6i->ip6i_flags & IP6I_RAW_CHECKSUM) - cksum_request = ip6i->ip6i_checksum_off; - if (ip6i->ip6i_flags & IP6I_UNSPEC_SRC) - unspec_src = 1; - - if (do_outrequests && ill != NULL) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - /* - * Store ip6i_t info that we need after we come back - * from IPSEC processing. - */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (connp != NULL && connp->conn_dontroute) - ip6h->ip6_hops = 1; - - if (IN6_IS_ADDR_MULTICAST(v6dstp)) - goto ipv6multicast; - - /* 1. 
If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - ASSERT(ill != NULL); - goto send_from_ill; - } - - /* - * 2. If q is an ill queue and there's a link-local destination - * then use that ill. - */ - if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) - goto send_from_ill; - - /* 3. If IPV6_BOUND_IF has been set use that ill. */ - if (connp != NULL && connp->conn_outgoing_ill != NULL) { - ill_t *conn_outgoing_ill; - - conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (ill != NULL) - ill_refrele(ill); - ill = conn_outgoing_ill; - mibptr = ill->ill_ip_mib; - goto send_from_ill; - } - - /* - * 4. For unicast: Just do an IRE lookup for the best match. - * If we get here for a link-local address it is rather random - * what interface we pick on a multihomed host. - * *If* there is an IRE_CACHE (and the link-local address - * isn't duplicated on multi links) this will find the IRE_CACHE. - * Otherwise it will use one of the matching IRE_INTERFACE routes - * for the link-local prefix. Hence, applications - * *should* be encouraged to specify an outgoing interface when sending - * to a link local address. - */ - if (connp == NULL || (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && - !connp->conn_fully_bound)) { - /* - * We cache IRE_CACHEs to avoid lookups. We don't do - * this for the tcp global queue and listen end point - * as it does not really have a real destination to - * talk to. - */ - ire = ire_cache_lookup_v6(v6dstp, zoneid, msg_getlabel(mp), - ipst); - } else { - /* - * IRE_MARK_CONDEMNED is marked in ire_delete. We don't - * grab a lock here to check for CONDEMNED as it is okay - * to send a packet or two with the IRE_CACHE that is going - * away. - */ - mutex_enter(&connp->conn_lock); - ire = sctp_ire != NULL ? 
sctp_ire : connp->conn_ire_cache; - if (ire != NULL && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - - } else { - boolean_t cached = B_FALSE; - - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL && sctp_ire == NULL) - IRE_REFRELE_NOTR(ire); - - ire = ire_cache_lookup_v6(v6dstp, zoneid, - msg_getlabel(mp), ipst); - if (ire != NULL) { - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && - (connp->conn_ire_cache == NULL)) { - rw_enter(&ire->ire_bucket->irb_lock, - RW_READER); - if (!(ire->ire_marks & - IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it - * was not cached, we should drop the extra - * reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); - } - } - } - - if (ire != NULL) { - if (do_outrequests) { - /* Handle IRE_LOCAL's that might appear here */ - if (ire->ire_type == IRE_CACHE) { - mibptr = ((ill_t *)ire->ire_stq->q_ptr)-> - ill_ip_mib; - } else { - mibptr = ire->ire_ipif->ipif_ill->ill_ip_mib; - } - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - } - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force hop limit of multirouted packets if required. - * The hop limit of such packets is bounded by the - * ip_multirt_ttl ndd variable. - * NDP packets must have a hop limit of 255; don't - * change the hop limit in that case. 
- */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) && - (ip6h->ip6_hops != IPV6_MAX_HOPS)) { - if (ip_debug > 3) { - ip2dbg(("ip_wput_v6: forcing multirt " - "hop limit to %d (was %d) ", - ipst->ips_ip_multirt_ttl, - ip6h->ip6_hops)); - pr_addr_dbg("v6dst %s\n", AF_INET6, - &ire->ire_addr_v6); - } - ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; - } - - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_need_resolve_v6() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we do a copy - * of the current message. It will be used - * to initiate additional route resolutions. - */ - multirt_need_resolve = - ire_multirt_need_resolve_v6(&ire->ire_addr_v6, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput_v6: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, ip6i_flags, zoneid); - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - IRE_REFRELE(ire); - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve_v6() deemed it necessary. - * copy_mp will be consumed (sent or freed) by - * ip_newroute_v6(). - */ - if (copy_mp != NULL) { - if (mctl_present) { - ip6h = (ip6_t *)copy_mp->b_cont->b_rptr; - } else { - ip6h = (ip6_t *)copy_mp->b_rptr; - } - ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, - &ip6h->ip6_src, NULL, zoneid, ipst); - } - if (ill != NULL) - ill_refrele(ill); - return; - } - - /* - * No full IRE for this destination. Send it to - * ip_newroute_v6 to see if anything else matches. - * Mark this packet as having originated on this - * machine. 
- * Update rptr if there was an ip6i_t header. - */ - mp->b_prev = NULL; - mp->b_next = NULL; - if (ip6i != NULL) - mp->b_rptr -= sizeof (ip6i_t); - - if (unspec_src) { - if (ip6i == NULL) { - /* - * Add ip6i_t header to carry unspec_src - * until the packet comes back in ip_wput_v6. - */ - mp = ip_add_info_v6(mp, NULL, v6dstp); - if (mp == NULL) { - if (do_outrequests) - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (mctl_present) - freeb(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - - if ((mp->b_wptr - (uchar_t *)ip6i) == - sizeof (ip6i_t)) { - /* - * ndp_resolver called from ip_newroute_v6 - * expects pulled up message. - */ - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg" - " failed\n")); - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - } - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - } - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - if (need_decref) - CONN_DEC_REF(connp); - ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, NULL, zoneid, ipst); - if (ill != NULL) - ill_refrele(ill); - return; - - - /* - * Handle multicast packets with or without an conn. - * Assumes that the transports set ip6_hops taking - * IPV6_MULTICAST_HOPS (and the other ways to set the hoplimit) - * into account. 
- */ -ipv6multicast: - ip2dbg(("ip_wput_v6: multicast\n")); - - /* - * Hold the conn_lock till we refhold the ill of interest that is - * pointed to from the conn. Since we cannot do an ill/ipif_refrele - * while holding any locks, postpone the refrele until after the - * conn_lock is dropped. - */ - if (connp != NULL) { - mutex_enter(&connp->conn_lock); - conn_lock_held = B_TRUE; - } else { - conn_lock_held = B_FALSE; - } - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - ASSERT(ill != NULL); - } else if (ill != NULL) { - /* - * 2. If q is an ill queue and (link local or multicast - * destination) then use that ill. - * We don't need the ipif initialization here. - * This useless assert below is just to prevent lint from - * reporting a null body if statement. - */ - ASSERT(ill != NULL); - } else if (connp != NULL) { - /* - * 3. If IPV6_BOUND_IF has been set use that ill. - * - * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. - * Otherwise look for the best IRE match for the unspecified - * group to determine the ill. - * - * conn_multicast_ill is used for only IPv6 packets. - * conn_multicast_ipif is used for only IPv4 packets. - * Thus a PF_INET6 socket send both IPv4 and IPv6 - * multicast packets using different IP*_MULTICAST_IF - * interfaces. 
- */ - if (connp->conn_outgoing_ill != NULL) { - err = ill_check_and_refhold(connp->conn_outgoing_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_outgoing_ill no ipif\n")); -multicast_discard: - ASSERT(saved_ill == NULL); - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - if (ill != NULL) - ill_refrele(ill); - freemsg(first_mp); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ill = connp->conn_outgoing_ill; - } else if (connp->conn_multicast_ill != NULL) { - err = ill_check_and_refhold(connp->conn_multicast_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_multicast_ill no ipif\n")); - goto multicast_discard; - } - ill = connp->conn_multicast_ill; - } else { - mutex_exit(&connp->conn_lock); - conn_lock_held = B_FALSE; - ipif = ipif_lookup_group_v6(v6dstp, zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_output_v6: multicast no ipif\n")); - goto multicast_discard; - } - /* - * We have a ref to this ipif, so we can safely - * access ipif_ill. - */ - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - ill = NULL; - ip1dbg(("ip_output_v6: multicast no ipif\n")); - goto multicast_discard; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - /* - * Save binding until IPV6_MULTICAST_IF - * changes it - */ - mutex_enter(&connp->conn_lock); - connp->conn_multicast_ill = ill; - mutex_exit(&connp->conn_lock); - } - } - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - - if (saved_ill != NULL) - ill_refrele(saved_ill); - - ASSERT(ill != NULL); - /* - * For multicast loopback interfaces replace the multicast address - * with a unicast address for the ire lookup. 
- */ - if (IS_LOOPBACK(ill)) - v6dstp = &ill->ill_ipif->ipif_v6lcl_addr; - - mibptr = ill->ill_ip_mib; - if (do_outrequests) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - - /* - * As we may lose the conn by the time we reach ip_wput_ire_v6 - * we copy conn_multicast_loop and conn_dontroute on to an - * ipsec_out. In case if this datagram goes out secure, - * we need the ill_index also. Copy that also into the - * ipsec_out. - */ - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - if ((first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack)) == - NULL) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - io = (ipsec_out_t *)first_mp->b_rptr; - /* This is not a secure packet */ - io->ipsec_out_secure = B_FALSE; - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_zoneid = - (zoneid != ALL_ZONES ? zoneid : GLOBAL_ZONEID); - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_unspec_src = unspec_src; - if (connp != NULL) - io->ipsec_out_dontroute = connp->conn_dontroute; - -send_from_ill: - ASSERT(ill != NULL); - ASSERT(mibptr == ill->ill_ip_mib); - - if (do_outrequests) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to - * an underlying interface, IS_UNDER_IPMP() may be true even when - * building IREs that will be used for data traffic. As such, use the - * packet's source address to determine whether the traffic is test - * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so. 
- * - * Separately, we also need to mark probe packets so that ND can - * process them specially; see the comments in nce_queue_mp_common(). - */ - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && - ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) { - if (ip6i == NULL) { - if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) { - if (mctl_present) - freeb(first_mp); - goto discard; - } - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - - /* ndp_resolver() expects a pulled-up message */ - if (MBLKL(mp) == sizeof (ip6i_t) && - pullupmsg(mp, -1) == 0) { - ip1dbg(("ip_output_v6: pullupmsg failed\n")); -discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - mp->b_rptr = (uchar_t *)ip6h; /* rewound below */ - } - ip6i->ip6i_flags |= IP6I_IPMP_PROBE; - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - } - - if (io != NULL) - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - - /* - * When a specific ill is specified (using IPV6_PKTINFO, - * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match - * on routing entries (ftable and ctable) that have a matching - * ire->ire_ipif->ipif_ill. Thus this can only be used - * for destinations that are on-link for the specific ill - * and that can appear on multiple links. Thus it is useful - * for multicast destinations, link-local destinations, and - * at some point perhaps for site-local destinations (if the - * node sits at a site boundary). - * We create the cache entries in the regular ctable since - * it can not "confuse" things for other destinations. - * table. - * - * NOTE : conn_ire_cache is not used for caching ire_ctable_lookups. - * It is used only when ire_cache_lookup is used above. 
- */ - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ill->ill_ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - if (ire != NULL) { - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force hop limit of multirouted packets if required. - * The hop limit of such packets is bounded by the - * ip_multirt_ttl ndd variable. - * NDP packets must have a hop limit of 255; don't - * change the hop limit in that case. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) && - (ip6h->ip6_hops != IPV6_MAX_HOPS)) { - if (ip_debug > 3) { - ip2dbg(("ip_wput_v6: forcing multirt " - "hop limit to %d (was %d) ", - ipst->ips_ip_multirt_ttl, - ip6h->ip6_hops)); - pr_addr_dbg("v6dst %s\n", AF_INET6, - &ire->ire_addr_v6); - } - ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; - } - - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_need_resolve_v6() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = - ire_multirt_need_resolve_v6(&ire->ire_addr_v6, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput_v6[send_from_ill]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip1dbg(("ip_wput_v6: send on %s, ire = %p, ill index = %d\n", - ill->ill_name, (void *)ire, - ill->ill_phyint->phyint_ifindex)); - ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, ip6i_flags, zoneid); - ire_refrele(ire); - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve_v6() deemed it necessary. - * copy_mp will be consumed (sent or freed) by - * ip_newroute_[ipif_]v6(). - */ - if (copy_mp != NULL) { - if (mctl_present) { - ip6h = (ip6_t *)copy_mp->b_cont->b_rptr; - } else { - ip6h = (ip6_t *)copy_mp->b_rptr; - } - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ipif = ipif_lookup_group_v6(&ip6h->ip6_dst, - zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_wput_v6: No ipif for " - "multicast\n")); - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - return; - } - ip_newroute_ipif_v6(q, copy_mp, ipif, - &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src, - zoneid); - ipif_refrele(ipif); - } else { - ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, - &ip6h->ip6_src, ill, zoneid, ipst); - } - } - ill_refrele(ill); - return; - } - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - - /* Update rptr if there was an ip6i_t header. */ - if (ip6i != NULL) - mp->b_rptr -= sizeof (ip6i_t); - if (unspec_src) { - if (ip6i == NULL) { - /* - * Add ip6i_t header to carry unspec_src - * until the packet comes back in ip_wput_v6. 
- */ - if (mctl_present) { - first_mp->b_cont = - ip_add_info_v6(mp, NULL, v6dstp); - mp = first_mp->b_cont; - if (mp == NULL) - freeb(first_mp); - } else { - first_mp = mp = ip_add_info_v6(mp, NULL, - v6dstp); - } - if (mp == NULL) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ill_refrele(ill); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - if ((mp->b_wptr - (uchar_t *)ip6i) == - sizeof (ip6i_t)) { - /* - * ndp_resolver called from ip_newroute_v6 - * expects a pulled up message. - */ - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - } - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - } - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp, - &ip6h->ip6_src, unspec_src, zoneid); - } else { - ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, - zoneid, ipst); - } - ill_refrele(ill); - return; - -notv6: - /* FIXME?: assume the caller calls the right version of ip_output? */ - if (q->q_next == NULL) { - connp = Q_TO_CONN(q); - - /* - * We can change conn_send for all types of conn, even - * though only TCP uses it right now. - * FIXME: sctp could use conn_send but doesn't currently. - */ - ip_setpktversion(connp, B_FALSE, B_TRUE, ipst); - } - BUMP_MIB(mibptr, ipIfStatsOutWrongIPVersion); - (void) ip_output(arg, first_mp, arg2, caller); - if (ill != NULL) - ill_refrele(ill); -} - -/* - * If this is a conn_t queue, then we pass in the conn. This includes the - * zoneid. - * Otherwise, this is a message for an ill_t queue, - * in which case we use the global zoneid since those are all part of - * the global zone. 
- */ -void -ip_wput_v6(queue_t *q, mblk_t *mp) -{ - if (CONN_Q(q)) - ip_output_v6(Q_TO_CONN(q), mp, q, IP_WPUT); - else - ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT); -} - -/* - * NULL send-to queue - packet is to be delivered locally. - */ -void -ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, - ire_t *ire, int fanout_flags, zoneid_t zoneid) -{ - uint32_t ports; - mblk_t *mp = first_mp, *first_mp1; - boolean_t mctl_present; - uint8_t nexthdr; - uint16_t hdr_length; - ipsec_out_t *io; - mib2_ipIfStatsEntry_t *mibptr; - ilm_t *ilm; - uint_t nexthdr_offset; - ip_stack_t *ipst = ill->ill_ipst; - - if (DB_TYPE(mp) == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - if (!io->ipsec_out_secure) { - mp = mp->b_cont; - freeb(first_mp); - first_mp = mp; - mctl_present = B_FALSE; - } else { - mctl_present = B_TRUE; - mp = first_mp->b_cont; - ipsec_out_to_in(first_mp); - } - } else { - mctl_present = B_FALSE; - } - - /* - * Remove reachability confirmation bit from version field - * before passing the packet on to any firewall hooks or - * looping back the packet. - */ - if (ip6h->ip6_vcf & IP_FORWARD_PROG) - ip6h->ip6_vcf &= ~IP_FORWARD_PROG; - - DTRACE_PROBE4(ip6__loopback__in__start, - ill_t *, ill, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, first_mp); - - FW_HOOKS6(ipst->ips_ip6_loopback_in_event, - ipst->ips_ipv6firewall_loopback_in, - ill, NULL, ip6h, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES; - zoneid_t stackzoneid = netstackid_to_zoneid( - ipst->ips_netstack->netstack_stackid); - - szone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid; - /* - * ::1 is special, as we cannot lookup its zoneid by - * address. For this case, restrict the lookup to the - * source zone. 
- */ - if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) - lookup_zoneid = zoneid; - dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst, - lookup_zoneid); - ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); - } - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 1); - - nexthdr = ip6h->ip6_nxt; - mibptr = ill->ill_ip_mib; - - /* Fastpath */ - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ICMPV6: - case IPPROTO_SCTP: - hdr_length = IPV6_HDR_LEN; - nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt - - (uchar_t *)ip6h); - break; - default: { - uint8_t *nexthdrp; - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_length, &nexthdrp)) { - /* Malformed packet */ - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - nexthdr = *nexthdrp; - nexthdr_offset = nexthdrp - (uint8_t *)ip6h; - break; - } - } - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - switch (nexthdr) { - case IPPROTO_TCP: - if (DB_TYPE(mp) == M_DATA) { - /* - * M_DATA mblk, so init mblk (chain) for - * no struio(). 
- */ - mblk_t *mp1 = mp; - - do { - mp1->b_datap->db_struioflag = 0; - } while ((mp1 = mp1->b_cont) != NULL); - } - ports = *(uint32_t *)(mp->b_rptr + hdr_length + - TCP_PORTS_OFFSET); - ip_fanout_tcp_v6(q, first_mp, ip6h, ill, ill, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE| - IP_FF_IPINFO|IP6_NO_IPPOLICY|IP_FF_LOOPBACK, - hdr_length, mctl_present, ire->ire_zoneid); - return; - - case IPPROTO_UDP: - ports = *(uint32_t *)(mp->b_rptr + hdr_length + - UDP_PORTS_OFFSET); - ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, ill, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO| - IP6_NO_IPPOLICY, mctl_present, ire->ire_zoneid); - return; - - case IPPROTO_SCTP: - { - ports = *(uint32_t *)(mp->b_rptr + hdr_length); - ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO, - mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid); - return; - } - case IPPROTO_ICMPV6: { - icmp6_t *icmp6; - - /* check for full IPv6+ICMPv6 header */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length); - - /* Update output mib stats */ - icmp_update_out_mib_v6(ill, icmp6); - - /* Check variable for testing applications */ - if (ipst->ips_ipv6_drop_inbound_icmpv6) { - freemsg(first_mp); - return; - } - /* - * Assume that there is always at least one conn for - * ICMPv6 (in.ndpd) i.e. don't optimize the case - * where there is no conn. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && - !IS_LOOPBACK(ill)) { - ilm_walker_t ilw; - - /* - * In the multicast case, applications may have - * joined the group from different zones, so we - * need to deliver the packet to each of them. 
- * Loop through the multicast memberships - * structures (ilm) on the receive ill and send - * a copy of the packet up each matching one. - * However, we don't do this for multicasts sent - * on the loopback interface (PHYI_LOOPBACK flag - * set) as they must stay in the sender's zone. - */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; - ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL( - &ilm->ilm_v6addr, &ip6h->ip6_dst)) - continue; - if ((fanout_flags & - IP_FF_NO_MCAST_LOOP) && - ilm->ilm_zoneid == ire->ire_zoneid) - continue; - if (!ipif_lookup_zoneid( - ilw.ilw_walk_ill, ilm->ilm_zoneid, - IPIF_UP, NULL)) - continue; - - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 == NULL) - continue; - icmp_inbound_v6(q, first_mp1, - ilw.ilw_walk_ill, ill, hdr_length, - mctl_present, IP6_NO_IPPOLICY, - ilm->ilm_zoneid, NULL); - } - ilm_walker_finish(&ilw); - } else { - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, ill, - hdr_length, mctl_present, - IP6_NO_IPPOLICY, ire->ire_zoneid, - NULL); - } - } - /* FALLTHRU */ - default: { - /* - * Handle protocols with which IPv6 is less intimate. - */ - fanout_flags |= IP_FF_RAWIP|IP_FF_IPINFO; - - /* - * Enable sending ICMP for "Unknown" nexthdr - * case. i.e. where we did not FALLTHRU from - * IPPROTO_ICMPV6 processing case above. - */ - if (nexthdr != IPPROTO_ICMPV6) - fanout_flags |= IP_FF_SEND_ICMP; - /* - * Note: There can be more than one stream bound - * to a particular protocol. When this is the case, - * each one gets a copy of any incoming packets. - */ - ip_fanout_proto_v6(q, first_mp, ip6h, ill, ill, nexthdr, - nexthdr_offset, fanout_flags|IP6_NO_IPPOLICY, - mctl_present, ire->ire_zoneid); - return; - } - } -} - -/* - * Send packet using IRE. - * Checksumming is controlled by cksum_request: - * -1 => normal i.e. TCP/UDP/SCTP/ICMPv6 are checksummed and nothing else. 
- * 1 => Skip TCP/UDP/SCTP checksum - * Otherwise => checksum_request contains insert offset for checksum - * - * Assumes that the following set of headers appear in the first - * mblk: - * ip6_t - * Any extension headers - * TCP/UDP/SCTP header (if present) - * The routine can handle an ICMPv6 header that is not in the first mblk. - * - * NOTE : This function does not ire_refrele the ire passed in as the - * argument unlike ip_wput_ire where the REFRELE is done. - * Refer to ip_wput_ire for more on this. - */ -static void -ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, - int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid) -{ - ip6_t *ip6h; - uint8_t nexthdr; - uint16_t hdr_length; - uint_t reachable = 0x0; - ill_t *ill; - mib2_ipIfStatsEntry_t *mibptr; - mblk_t *first_mp; - boolean_t mctl_present; - ipsec_out_t *io; - boolean_t conn_dontroute; /* conn value for multicast */ - boolean_t conn_multicast_loop; /* conn value for multicast */ - boolean_t multicast_forward; /* Should we forward ? */ - int max_frag; - ip_stack_t *ipst = ire->ire_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - ill = ire_to_ill(ire); - first_mp = mp; - multicast_forward = B_FALSE; - - if (mp->b_datap->db_type != M_CTL) { - ip6h = (ip6_t *)first_mp->b_rptr; - } else { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - /* - * Grab the zone id now because the M_CTL can be discarded by - * ip_wput_ire_parse_ipsec_out() below. - */ - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - ip6h = (ip6_t *)first_mp->b_cont->b_rptr; - /* - * For the multicast case, ipsec_out carries conn_dontroute and - * conn_multicast_loop as conn may not be available here. We - * need this for multicast loopback and forwarding which is done - * later in the code. 
- */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - conn_dontroute = io->ipsec_out_dontroute; - conn_multicast_loop = io->ipsec_out_multicast_loop; - /* - * If conn_dontroute is not set or conn_multicast_loop - * is set, we need to do forwarding/loopback. For - * datagrams from ip_wput_multicast, conn_dontroute is - * set to B_TRUE and conn_multicast_loop is set to - * B_FALSE so that we neither do forwarding nor - * loopback. - */ - if (!conn_dontroute || conn_multicast_loop) - multicast_forward = B_TRUE; - } - } - - /* - * If the sender didn't supply the hop limit and there is a default - * unicast hop limit associated with the output interface, we use - * that if the packet is unicast. Interface specific unicast hop - * limits as set via the SIOCSLIFLNKINFO ioctl. - */ - if (ill->ill_max_hops != 0 && !(flags & IP6I_HOPLIMIT) && - !(IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) { - ip6h->ip6_hops = ill->ill_max_hops; - } - - if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) { - /* - * When a zone sends a packet to another zone, we try to deliver - * the packet under the same conditions as if the destination - * was a real node on the network. To do so, we look for a - * matching route in the forwarding table. - * RTF_REJECT and RTF_BLACKHOLE are handled just like - * ip_newroute_v6() does. - * Note that IRE_LOCAL are special, since they are used - * when the zoneid doesn't match in some cases. This means that - * we need to handle ipha_src differently since ire_src_addr - * belongs to the receiving zone instead of the sending zone. - * When ip_restrict_interzone_loopback is set, then - * ire_cache_lookup_v6() ensures that IRE_LOCAL are only used - * for loopback between zones when the logical "Ethernet" would - * have looped them back. 
- */ - ire_t *src_ire; - - src_ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, - NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); - if (src_ire != NULL && - !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_lan(ire, src_ire))) { - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && - !unspec_src) { - ip6h->ip6_src = src_ire->ire_src_addr_v6; - } - ire_refrele(src_ire); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes); - if (src_ire != NULL) { - if (src_ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(src_ire); - freemsg(first_mp); - return; - } - ire_refrele(src_ire); - } - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - /* Failed */ - freemsg(first_mp); - return; - } - icmp_unreachable_v6(q, first_mp, - ICMP6_DST_UNREACH_NOROUTE, B_FALSE, B_FALSE, - zoneid, ipst); - return; - } - } - - if (mp->b_datap->db_type == M_CTL || - ipss->ipsec_outbound_v6_policy_present) { - mp = ip_wput_ire_parse_ipsec_out(first_mp, NULL, ip6h, ire, - connp, unspec_src, zoneid); - if (mp == NULL) { - return; - } - } - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - mctl_present = B_FALSE; - } - - ip6h = (ip6_t *)mp->b_rptr; - nexthdr = ip6h->ip6_nxt; - mibptr = ill->ill_ip_mib; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) { - ipif_t *ipif; - - /* - * Select the source address using ipif_select_source_v6. 
- */ - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_wput_ire_v6: no src for " - "dst %s\n", AF_INET6, &ip6h->ip6_dst); - printf("through interface %s\n", ill->ill_name); - } - freemsg(first_mp); - return; - } - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - if ((connp != NULL && connp->conn_multicast_loop) || - !IS_LOOPBACK(ill)) { - if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, - ALL_ZONES) != NULL) { - mblk_t *nmp; - int fanout_flags = 0; - - if (connp != NULL && - !connp->conn_multicast_loop) { - fanout_flags |= IP_FF_NO_MCAST_LOOP; - } - ip1dbg(("ip_wput_ire_v6: " - "Loopback multicast\n")); - nmp = ip_copymsg(first_mp); - if (nmp != NULL) { - ip6_t *nip6h; - mblk_t *mp_ip6h; - - if (mctl_present) { - nip6h = (ip6_t *) - nmp->b_cont->b_rptr; - mp_ip6h = nmp->b_cont; - } else { - nip6h = (ip6_t *)nmp->b_rptr; - mp_ip6h = nmp; - } - - DTRACE_PROBE4( - ip6__loopback__out__start, - ill_t *, NULL, - ill_t *, ill, - ip6_t *, nip6h, - mblk_t *, nmp); - - FW_HOOKS6( - ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, ill, nip6h, nmp, mp_ip6h, - 0, ipst); - - DTRACE_PROBE1( - ip6__loopback__out__end, - mblk_t *, nmp); - - /* - * DTrace this as ip:::send. A blocked - * packet will fire the send probe, but - * not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, nmp, - conn_t *, NULL, void_ip_t *, nip6h, - __dtrace_ipsr_ill_t *, ill, - ipha_t *, NULL, ip6_t *, nip6h, - int, 1); - - if (nmp != NULL) { - /* - * Deliver locally and to - * every local zone, except - * the sending zone when - * IPV6_MULTICAST_LOOP is - * disabled. 
- */ - ip_wput_local_v6(RD(q), ill, - nip6h, nmp, ire, - fanout_flags, zoneid); - } - } else { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_ire_v6: " - "copymsg failed\n")); - } - } - } - if (ip6h->ip6_hops == 0 || - IN6_IS_ADDR_MC_NODELOCAL(&ip6h->ip6_dst) || - IS_LOOPBACK(ill)) { - /* - * Local multicast or just loopback on loopback - * interface. - */ - BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - ip1dbg(("ip_wput_ire_v6: local multicast only\n")); - freemsg(first_mp); - return; - } - } - - if (ire->ire_stq != NULL) { - uint32_t sum; - uint_t ill_index = ((ill_t *)ire->ire_stq->q_ptr)-> - ill_phyint->phyint_ifindex; - queue_t *dev_q = ire->ire_stq->q_next; - - /* - * non-NULL send-to queue - packet is to be sent - * out an interface. - */ - - /* Driver is flow-controlling? */ - if (!IP_FLOW_CONTROLLED_ULP(nexthdr) && - DEV_Q_FLOW_BLOCKED(dev_q)) { - /* - * Queue packet if we have an conn to give back - * pressure. We can't queue packets intended for - * hardware acceleration since we've tossed that - * state already. If the packet is being fed back - * from ire_send_v6, we don't know the position in - * the queue to enqueue the packet and we discard - * the packet. - */ - if (ipst->ips_ip_output_queue && connp != NULL && - !mctl_present && caller != IRE_SEND) { - if (caller == IP_WSRV) { - idl_tx_list_t *idl_txl; - - idl_txl = &ipst->ips_idl_tx_list[0]; - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp, idl_txl); - /* - * caller == IP_WSRV implies we are - * the service thread, and the - * queue is already noenabled. - * The check for canput and - * the putbq is not atomic. - * So we need to check again. 
- */ - if (canput(dev_q)) - connp->conn_did_putbq = 0; - } else { - (void) putq(connp->conn_wq, mp); - } - return; - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - - /* - * Look for reachability confirmations from the transport. - */ - if (ip6h->ip6_vcf & IP_FORWARD_PROG) { - reachable |= IPV6_REACHABILITY_CONFIRMATION; - ip6h->ip6_vcf &= ~IP_FORWARD_PROG; - if (mctl_present) - io->ipsec_out_reachable = B_TRUE; - } - /* Fastpath */ - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ICMPV6: - case IPPROTO_SCTP: - hdr_length = IPV6_HDR_LEN; - break; - default: { - uint8_t *nexthdrp; - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_length, &nexthdrp)) { - /* Malformed packet */ - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - nexthdr = *nexthdrp; - break; - } - } - - if (cksum_request != -1 && nexthdr != IPPROTO_ICMPV6) { - uint16_t *up; - uint16_t *insp; - - /* - * The packet header is processed once for all, even - * in the multirouting case. We disable hardware - * checksum if the packet is multirouted, as it will be - * replicated via several interfaces, and not all of - * them may have this capability. - */ - if (cksum_request == 1 && - !(ire->ire_flags & RTF_MULTIRT)) { - /* Skip the transport checksum */ - goto cksum_done; - } - /* - * Do user-configured raw checksum. 
- * Compute checksum and insert at offset "cksum_request" - */ - - /* check for enough headers for checksum */ - cksum_request += hdr_length; /* offset from rptr */ - if ((mp->b_wptr - mp->b_rptr) < - (cksum_request + sizeof (int16_t))) { - if (!pullupmsg(mp, - cksum_request + sizeof (int16_t))) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - insp = (uint16_t *)((uchar_t *)ip6h + cksum_request); - ASSERT(((uintptr_t)insp & 0x1) == 0); - up = (uint16_t *)&ip6h->ip6_src; - /* - * icmp has placed length and routing - * header adjustment in *insp. - */ - sum = htons(nexthdr) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - *insp = IP_CSUM(mp, hdr_length, sum); - } else if (nexthdr == IPPROTO_TCP) { - uint16_t *up; - - /* - * Check for full IPv6 header + enough TCP header - * to get at the checksum field. - */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + TCP_CHECKSUM_OFFSET + - TCP_CHECKSUM_SIZE)) { - if (!pullupmsg(mp, hdr_length + - TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) { - ip1dbg(("ip_wput_v6: TCP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - - up = (uint16_t *)&ip6h->ip6_src; - /* - * Note: The TCP module has stored the length value - * into the tcp checksum field, so we don't - * need to explicitly sum it in here. 
- */ - sum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + TCP_CHECKSUM_OFFSET); - - IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP, - hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - ire->ire_max_frag, mctl_present, sum); - - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP6_STAT(ipst, ip6_out_sw_cksum); - IP6_STAT_UPDATE(ipst, - ip6_tcp_out_sw_cksum_bytes, - (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - - hdr_length); - } - } else if (nexthdr == IPPROTO_UDP) { - uint16_t *up; - - /* - * check for full IPv6 header + enough UDP header - * to get at the UDP checksum field - */ - if ((mp->b_wptr - mp->b_rptr) < (hdr_length + - UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { - if (!pullupmsg(mp, hdr_length + - UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { - ip1dbg(("ip_wput_v6: UDP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - up = (uint16_t *)&ip6h->ip6_src; - /* - * Note: The UDP module has stored the length value - * into the udp checksum field, so we don't - * need to explicitly sum it in here. - */ - sum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + UDP_CHECKSUM_OFFSET); - - IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP, - hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - ire->ire_max_frag, mctl_present, sum); - - /* Software checksum? 
*/ - if (DB_CKSUMFLAGS(mp) == 0) { - IP6_STAT(ipst, ip6_out_sw_cksum); - IP6_STAT_UPDATE(ipst, - ip6_udp_out_sw_cksum_bytes, - (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - - hdr_length); - } - } else if (nexthdr == IPPROTO_ICMPV6) { - uint16_t *up; - icmp6_t *icmp6; - - /* check for full IPv6+ICMPv6 header */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length); - up = (uint16_t *)&ip6h->ip6_src; - /* - * icmp has placed length and routing - * header adjustment in icmp6_cksum. - */ - sum = htons(IPPROTO_ICMPV6) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum); - - /* Update output mib stats */ - icmp_update_out_mib_v6(ill, icmp6); - } else if (nexthdr == IPPROTO_SCTP) { - sctp_hdr_t *sctph; - - if (MBLKL(mp) < (hdr_length + sizeof (*sctph))) { - if (!pullupmsg(mp, hdr_length + - sizeof (*sctph))) { - ip1dbg(("ip_wput_v6: SCTP hdr pullupmsg" - " failed\n")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_length); - sctph->sh_chksum = 0; - sctph->sh_chksum = sctp_cksum(mp, hdr_length); - } - - cksum_done: - /* - * We force the insertion of a fragment header using the - * IPH_FRAG_HDR flag in two cases: - * - after reception of an ICMPv6 "packet too big" message - * with a MTU < 1280 (cf. RFC 2460 section 5) - * - for multirouted IPv6 packets, so that the receiver can - * discard duplicates according to their fragment identifier - * - * Two flags modifed from the API can modify this behavior. 
- * The first is IPV6_USE_MIN_MTU. With this API the user - * can specify how to manage PMTUD for unicast and multicast. - * - * IPV6_DONTFRAG disallows fragmentation. - */ - max_frag = ire->ire_max_frag; - switch (IP6I_USE_MIN_MTU_API(flags)) { - case IPV6_USE_MIN_MTU_DEFAULT: - case IPV6_USE_MIN_MTU_UNICAST: - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - max_frag = IPV6_MIN_MTU; - } - break; - - case IPV6_USE_MIN_MTU_NEVER: - max_frag = IPV6_MIN_MTU; - break; - } - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > max_frag || - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - if (connp != NULL && (flags & IP6I_DONTFRAG)) { - icmp_pkt2big_v6(ire->ire_stq, first_mp, - max_frag, B_FALSE, B_TRUE, zoneid, ipst); - return; - } - - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != - (mp->b_cont ? msgdsize(mp) : - mp->b_wptr - (uchar_t *)ip6h)) { - ip0dbg(("Packet length mismatch: %d, %ld\n", - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - msgdsize(mp))); - freemsg(first_mp); - return; - } - /* Do IPSEC processing first */ - if (mctl_present) { - ipsec_out_process(q, first_mp, ire, ill_index); - return; - } - ASSERT(mp->b_prev == NULL); - ip2dbg(("Fragmenting Size = %d, mtu = %d\n", - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN, max_frag)); - ASSERT(mp == first_mp); - /* Initiate IPPF processing */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, ill_index); - if (mp == NULL) { - return; - } - } - ip_wput_frag_v6(mp, ire, reachable, connp, - caller, max_frag); - return; - } - /* Do IPSEC processing first */ - if (mctl_present) { - int extra_len = ipsec_out_extra_length(first_mp); - - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN + extra_len > - max_frag && connp != NULL && - (flags & IP6I_DONTFRAG)) { - /* - * IPsec headers will push the packet over the - * MTU limit. Issue an ICMPv6 Packet Too Big - * message for this packet if the upper-layer - * that issued this packet will be able to - * react to the icmp_pkt2big_v6() that we'll - * generate. 
- */ - icmp_pkt2big_v6(ire->ire_stq, first_mp, - max_frag, B_FALSE, B_TRUE, zoneid, ipst); - return; - } - ipsec_out_process(q, first_mp, ire, ill_index); - return; - } - /* - * XXX multicast: add ip_mforward_v6() here. - * Check conn_dontroute - */ -#ifdef lint - /* - * XXX The only purpose of this statement is to avoid lint - * errors. See the above "XXX multicast". When that gets - * fixed, remove this whole #ifdef lint section. - */ - ip3dbg(("multicast forward is %s.\n", - (multicast_forward ? "TRUE" : "FALSE"))); -#endif - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ASSERT(mp == first_mp); - ip_xmit_v6(mp, ire, reachable, connp, caller, NULL); - } else { - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, - NULL, ip6_t *, ip6h, int, 1); - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, ill, - ip6_t *, ip6h, mblk_t *, first_mp); - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, ill, ip6h, first_mp, mp, 0, ipst); - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, first_mp); - if (first_mp != NULL) { - ip_wput_local_v6(RD(q), ill, ip6h, first_mp, ire, 0, - zoneid); - } - } -} - -/* - * Outbound IPv6 fragmentation routine using MDT. 
- */ -static void -ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk, - size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset) -{ - ip6_t *ip6h = (ip6_t *)mp->b_rptr; - uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; - mblk_t *hdr_mp, *md_mp = NULL; - int i1; - multidata_t *mmd; - unsigned char *hdr_ptr, *pld_ptr; - ip_pdescinfo_t pdi; - uint32_t ident; - size_t len; - uint16_t offset; - queue_t *stq = ire->ire_stq; - ill_t *ill = (ill_t *)stq->q_ptr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(MBLKL(mp) > unfragmentable_len); - - /* - * Move read ptr past unfragmentable portion, we don't want this part - * of the data in our fragments. - */ - mp->b_rptr += unfragmentable_len; - - /* Calculate how many packets we will send out */ - i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); - pkts = (i1 + max_chunk - 1) / max_chunk; - ASSERT(pkts > 1); - - /* Allocate a message block which will hold all the IP Headers. */ - wroff = ipst->ips_ip_wroff_extra; - hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t); - - i1 = pkts * hdr_chunk_len; - /* - * Create the header buffer, Multidata and destination address - * and SAP attribute that should be associated with it. - */ - if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || - ((hdr_mp->b_wptr += i1), - (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || - !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { - freemsg(mp); - if (md_mp == NULL) { - freemsg(hdr_mp); - } else { -free_mmd: IP6_STAT(ipst, ip6_frag_mdt_discarded); - freemsg(md_mp); - } - IP6_STAT(ipst, ip6_frag_mdt_allocfail); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; - } - IP6_STAT(ipst, ip6_frag_mdt_allocd); - - /* - * Add a payload buffer to the Multidata; this operation must not - * fail, or otherwise our logic in this routine is broken. 
There - * is no memory allocation done by the routine, so any returned - * failure simply tells us that we've done something wrong. - * - * A failure tells us that either we're adding the same payload - * buffer more than once, or we're trying to add more buffers than - * allowed. None of the above cases should happen, and we panic - * because either there's horrible heap corruption, and/or - * programming mistake. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) { - goto pbuf_panic; - } - - hdr_ptr = hdr_mp->b_rptr; - pld_ptr = mp->b_rptr; - - pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; - - ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1)); - - /* - * len is the total length of the fragmentable data in this - * datagram. For each fragment sent, we will decrement len - * by the amount of fragmentable data sent in that fragment - * until len reaches zero. - */ - len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); - - offset = 0; - prev_nexthdr_offset += wroff; - - while (len != 0) { - size_t mlen; - ip6_t *fip6h; - ip6_frag_t *fraghdr; - int error; - - ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); - mlen = MIN(len, max_chunk); - len -= mlen; - - fip6h = (ip6_t *)(hdr_ptr + wroff); - ASSERT(OK_32PTR(fip6h)); - bcopy(ip6h, fip6h, unfragmentable_len); - hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - - fip6h->ip6_plen = htons((uint16_t)(mlen + - unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t))); - - fraghdr = (ip6_frag_t *)((unsigned char *)fip6h + - unfragmentable_len); - fraghdr->ip6f_nxt = nexthdr; - fraghdr->ip6f_reserved = 0; - fraghdr->ip6f_offlg = htons(offset) | - ((len != 0) ? IP6F_MORE_FRAG : 0); - fraghdr->ip6f_ident = ident; - - /* - * Record offset and size of header and data of the next packet - * in the multidata message. 
- */ - PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, - unfragmentable_len + sizeof (ip6_frag_t), 0); - PDESC_PLD_INIT(&pdi); - i1 = MIN(mp->b_wptr - pld_ptr, mlen); - ASSERT(i1 > 0); - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); - if (i1 == mlen) { - pld_ptr += mlen; - } else { - i1 = mlen - i1; - mp = mp->b_cont; - ASSERT(mp != NULL); - ASSERT(MBLKL(mp) >= i1); - /* - * Attach the next payload message block to the - * multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); - pld_ptr = mp->b_rptr + i1; - } - - if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates that we - * have passed in invalid pdesc info or parameters - * to mmd_addpdesc, which must not happen. - * - * EINVAL is a result of failure on boundary checks - * against the pdesc info contents. It should not - * happen, and we panic because either there's - * horrible heap corruption, and/or programming - * mistake. - */ - if (error != ENOMEM) { - cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: " - "pdesc logic error detected for " - "mmd %p pinfo %p (%d)\n", - (void *)mmd, (void *)&pdi, error); - /* NOTREACHED */ - } - IP6_STAT(ipst, ip6_frag_mdt_addpdescfail); - /* Free unattached payload message blocks as well */ - md_mp->b_cont = mp->b_cont; - goto free_mmd; - } - - /* Advance fragment offset. */ - offset += mlen; - - /* Advance to location for next header in the buffer. */ - hdr_ptr += hdr_chunk_len; - - /* Did we reach the next payload message block? */ - if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { - mp = mp->b_cont; - /* - * Attach the next message block with payload - * data to the multidata message. 
- */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - pld_ptr = mp->b_rptr; - } - } - - ASSERT(hdr_mp->b_wptr == hdr_ptr); - ASSERT(mp->b_wptr == pld_ptr); - - /* Update IP statistics */ - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); - /* - * The ipv6 header len is accounted for in unfragmentable_len so - * when calculating the fragmentation overhead just add the frag - * header len. - */ - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - (ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN)) + - pkts * (unfragmentable_len + sizeof (ip6_frag_t))); - IP6_STAT_UPDATE(ipst, ip6_frag_mdt_pkt_out, pkts); - - ire->ire_ob_pkt_count += pkts; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); - - ire->ire_last_used_time = lbolt; - /* Send it down */ - putnext(stq, md_mp); - return; - -pbuf_panic: - cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic " - "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, - pbuf_idx); - /* NOTREACHED */ -} - -/* * IPv6 fragmentation. Essentially the same as IPv4 fragmentation. * We have not optimized this in terms of number of mblks * allocated. For instance, for each fragment sent we always allocate a * mblk to hold the IPv6 header and fragment header. * - * Assumes that all the extension headers are contained in the first mblk. - * - * The fragment header is inserted after an hop-by-hop options header - * and after [an optional destinations header followed by] a routing header. - * - * NOTE : This function does not ire_refrele the ire passed in as - * the argument. + * Assumes that all the extension headers are contained in the first mblk + * and that the fragment header has has already been added by calling + * ip_fraghdr_add_v6. 
*/ -void -ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, - int caller, int max_frag) +int +ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, + uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; ip6_t *fip6h; @@ -11337,27 +4102,31 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, mblk_t *dmp; ip6_frag_t *fraghdr; size_t unfragmentable_len; - size_t len; size_t mlen; size_t max_chunk; - uint32_t ident; uint16_t off_flags; uint16_t offset = 0; - ill_t *ill; + ill_t *ill = nce->nce_ill; uint8_t nexthdr; - uint_t prev_nexthdr_offset; uint8_t *ptr; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_type == IRE_CACHE); - ill = (ill_t *)ire->ire_stq->q_ptr; + ip_stack_t *ipst = ill->ill_ipst; + uint_t priority = mp->b_band; + int error = 0; - if (max_frag <= 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); + if (max_frag == 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: zero max_frag", mp, ill); freemsg(mp); - return; + return (EINVAL); } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); + + /* + * Caller should have added fraghdr_t to pkt_len, and also + * updated ip6_plen. + */ + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len); + ASSERT(msgdsize(mp) == pkt_len); /* * Determine the length of the unfragmentable portion of this @@ -11366,7 +4135,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, * destination options header, and a potential routing header. 
*/ nexthdr = ip6h->ip6_nxt; - prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; ptr = (uint8_t *)&ip6h[1]; if (nexthdr == IPPROTO_HOPOPTS) { @@ -11376,8 +4144,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, hbh_hdr = (ip6_hbh_t *)ptr; hdr_len = 8 * (hbh_hdr->ip6h_len + 1); nexthdr = hbh_hdr->ip6h_nxt; - prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt - - (uint8_t *)ip6h; ptr += hdr_len; } if (nexthdr == IPPROTO_DSTOPTS) { @@ -11388,8 +4154,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) { hdr_len = 8 * (dest_hdr->ip6d_len + 1); nexthdr = dest_hdr->ip6d_nxt; - prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt - - (uint8_t *)ip6h; ptr += hdr_len; } } @@ -11399,82 +4163,73 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, rthdr = (ip6_rthdr_t *)ptr; nexthdr = rthdr->ip6r_nxt; - prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt - - (uint8_t *)ip6h; hdr_len = 8 * (rthdr->ip6r_len + 1); ptr += hdr_len; } + if (nexthdr != IPPROTO_FRAGMENT) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: bad nexthdr", mp, ill); + freemsg(mp); + return (EINVAL); + } unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); + unfragmentable_len += sizeof (ip6_frag_t); - max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len - - sizeof (ip6_frag_t)) & ~7; - - /* Check if we can use MDT to send out the frags. 
*/ - ASSERT(!IRE_IS_LOCAL(ire)); - if (ipst->ips_ip_multidata_outbound && reachable == 0 && - !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) && - IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) { - ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len, - nexthdr, prev_nexthdr_offset); - return; - } + max_chunk = (max_frag - unfragmentable_len) & ~7; /* * Allocate an mblk with enough room for the link-layer - * header, the unfragmentable part of the datagram, and the - * fragment header. This (or a copy) will be used as the + * header and the unfragmentable part of the datagram, which includes + * the fragment header. This (or a copy) will be used as the * first mblk for each fragment we send. */ - hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) + - ipst->ips_ip_wroff_extra, mp); + hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp); if (hmp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: no hmp", mp, ill); freemsg(mp); - return; + return (ENOBUFS); } hmp->b_rptr += ipst->ips_ip_wroff_extra; - hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t); + hmp->b_wptr = hmp->b_rptr + unfragmentable_len; fip6h = (ip6_t *)hmp->b_rptr; - fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len); - bcopy(ip6h, fip6h, unfragmentable_len); - hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - - ident = atomic_add_32_nv(&ire->ire_ident, 1); - - fraghdr->ip6f_nxt = nexthdr; - fraghdr->ip6f_reserved = 0; - fraghdr->ip6f_offlg = 0; - fraghdr->ip6f_ident = htonl(ident); /* - * len is the total length of the fragmentable data in this - * datagram. For each fragment sent, we will decrement len + * pkt_len is set to the total length of the fragmentable data in this + * datagram. For each fragment sent, we will decrement pkt_len * by the amount of fragmentable data sent in that fragment * until len reaches zero. 
*/ - len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); + pkt_len -= unfragmentable_len; /* * Move read ptr past unfragmentable portion, we don't want this part * of the data in our fragments. */ mp->b_rptr += unfragmentable_len; + if (mp->b_rptr == mp->b_wptr) { + mblk_t *mp1 = mp->b_cont; + freeb(mp); + mp = mp1; + } - while (len != 0) { - mlen = MIN(len, max_chunk); - len -= mlen; - if (len != 0) { + while (pkt_len != 0) { + mlen = MIN(pkt_len, max_chunk); + pkt_len -= mlen; + if (pkt_len != 0) { /* Not last */ hmp0 = copyb(hmp); if (hmp0 == NULL) { - freeb(hmp); - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - ip1dbg(("ip_wput_frag_v6: copyb failed\n")); - return; + ip_drop_output("FragFails: copyb failed", + mp, ill); + freeb(hmp); + freemsg(mp); + ip1dbg(("ip_fragment_v6: copyb failed\n")); + return (ENOBUFS); } off_flags = IP6F_MORE_FRAG; } else { @@ -11484,10 +4239,11 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, off_flags = 0; } fip6h = (ip6_t *)(hmp0->b_rptr); - fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len); + fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len - + sizeof (ip6_frag_t)); fip6h->ip6_plen = htons((uint16_t)(mlen + - unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t))); + unfragmentable_len - IPV6_HDR_LEN)); /* * Note: Optimization alert. 
* In IPv6 (and IPv4) protocol header, Fragment Offset @@ -11504,654 +4260,197 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, if (!(dmp = ip_carve_mp(&mp, mlen))) { /* mp has already been freed by ip_carve_mp() */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: could not carve mp", + hmp0, ill); if (hmp != NULL) freeb(hmp); freeb(hmp0); ip1dbg(("ip_carve_mp: failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; + return (ENOBUFS); } hmp0->b_cont = dmp; /* Get the priority marking, if any */ - hmp0->b_band = dmp->b_band; - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ip_xmit_v6(hmp0, ire, reachable | IP6_NO_IPPOLICY, connp, - caller, NULL); - reachable = 0; /* No need to redo state machine in loop */ + hmp0->b_band = priority; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); + + error = postfragfn(hmp0, nce, ixaflags, + mlen + unfragmentable_len, xmit_hint, szone, nolzid, + ixa_cookie); + if (error != 0 && error != EWOULDBLOCK && hmp != NULL) { + /* No point in sending the other fragments */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: postfragfn failed", + hmp, ill); + freeb(hmp); + freemsg(mp); + return (error); + } + /* No need to redo state machine in loop */ + ixaflags &= ~IXAF_REACH_CONF; + offset += mlen; } BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); + return (error); } /* - * Determine if the ill and multicast aspects of that packets - * "matches" the conn. + * Add a fragment header to an IPv6 packet. + * Assumes that all the extension headers are contained in the first mblk. + * + * The fragment header is inserted after an hop-by-hop options header + * and after [an optional destinations header followed by] a routing header. 
*/ -boolean_t -conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, - zoneid_t zoneid) +mblk_t * +ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa) { - ill_t *bound_ill; - boolean_t wantpacket; - in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; - in6_addr_t *v6src_ptr = &ip6h->ip6_src; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ip6_t *fip6h; + mblk_t *hmp; + ip6_frag_t *fraghdr; + size_t unfragmentable_len; + uint8_t nexthdr; + uint_t prev_nexthdr_offset; + uint8_t *ptr; + uint_t priority = mp->b_band; + ip_stack_t *ipst = ixa->ixa_ipst; /* - * conn_incoming_ill is set by IPV6_BOUND_IF which limits - * unicast and multicast reception to conn_incoming_ill. - * conn_wantpacket_v6 is called both for unicast and - * multicast. + * Determine the length of the unfragmentable portion of this + * datagram. This consists of the IPv6 header, a potential + * hop-by-hop options header, a potential pre-routing-header + * destination options header, and a potential routing header. */ - bound_ill = connp->conn_incoming_ill; - if (bound_ill != NULL) { - if (IS_IPMP(bound_ill)) { - if (bound_ill->ill_grp != ill->ill_grp) - return (B_FALSE); - } else { - if (bound_ill != ill) - return (B_FALSE); - } - } + nexthdr = ip6h->ip6_nxt; + prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; + ptr = (uint8_t *)&ip6h[1]; - if (connp->conn_multi_router) - return (B_TRUE); + if (nexthdr == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbh_hdr; + uint_t hdr_len; - if (!IN6_IS_ADDR_MULTICAST(v6dst_ptr) && - !IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst_ptr)) { - /* - * Unicast case: we match the conn only if it's in the specified - * zone. 
- */ - return (IPCL_ZONE_MATCH(connp, zoneid)); + hbh_hdr = (ip6_hbh_t *)ptr; + hdr_len = 8 * (hbh_hdr->ip6h_len + 1); + nexthdr = hbh_hdr->ip6h_nxt; + prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; } + if (nexthdr == IPPROTO_DSTOPTS) { + ip6_dest_t *dest_hdr; + uint_t hdr_len; - if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && - (connp->conn_zoneid == zoneid || zoneid == ALL_ZONES)) { - /* - * Loopback case: the sending endpoint has IP_MULTICAST_LOOP - * disabled, therefore we don't dispatch the multicast packet to - * the sending zone. - */ - return (B_FALSE); + dest_hdr = (ip6_dest_t *)ptr; + if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) { + hdr_len = 8 * (dest_hdr->ip6d_len + 1); + nexthdr = dest_hdr->ip6d_nxt; + prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } } + if (nexthdr == IPPROTO_ROUTING) { + ip6_rthdr_t *rthdr; + uint_t hdr_len; - if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid && - zoneid != ALL_ZONES) { - /* - * Multicast packet on the loopback interface: we only match - * conns who joined the group in the specified zone. - */ - return (B_FALSE); + rthdr = (ip6_rthdr_t *)ptr; + nexthdr = rthdr->ip6r_nxt; + prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt + - (uint8_t *)ip6h; + hdr_len = 8 * (rthdr->ip6r_len + 1); + ptr += hdr_len; } + unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); - mutex_enter(&connp->conn_lock); - wantpacket = - ilg_lookup_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, ill) != NULL; - mutex_exit(&connp->conn_lock); - - return (wantpacket); -} - - -/* - * Transmit a packet and update any NUD state based on the flags - * XXX need to "recover" any ip6i_t when doing putq! - * - * NOTE : This function does not ire_refrele the ire passed in as the - * argument. 
- */ -void -ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, - int caller, ipsec_out_t *io) -{ - mblk_t *mp1; - nce_t *nce = ire->ire_nce; - ill_t *ill; - ill_t *out_ill; - uint64_t delta; - ip6_t *ip6h; - queue_t *stq = ire->ire_stq; - ire_t *ire1 = NULL; - ire_t *save_ire = ire; - boolean_t multirt_send = B_FALSE; - mblk_t *next_mp = NULL; - ip_stack_t *ipst = ire->ire_ipst; - boolean_t fp_prepend = B_FALSE; - uint32_t hlen; + /* + * Allocate an mblk with enough room for the link-layer + * header, the unfragmentable part of the datagram, and the + * fragment header. + */ + hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) + + ipst->ips_ip_wroff_extra, mp); + if (hmp == NULL) { + ill_t *ill = ixa->ixa_nce->nce_ill; - ip6h = (ip6_t *)mp->b_rptr; - ASSERT(!IN6_IS_ADDR_V4MAPPED(&ire->ire_addr_v6)); - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(nce != NULL); - ASSERT(mp->b_datap->db_type == M_DATA); - ASSERT(stq != NULL); - - ill = ire_to_ill(ire); - if (!ill) { - ip0dbg(("ip_xmit_v6: ire_to_ill failed\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill); freemsg(mp); - return; + return (NULL); } + hmp->b_rptr += ipst->ips_ip_wroff_extra; + hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t); - /* Flow-control check has been done in ip_wput_ire_v6 */ - if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT || - caller == IP_WSRV || canput(stq->q_next)) { - uint32_t ill_index; - - /* - * In most cases, the emission loop below is entered only - * once. Only in the case where the ire holds the - * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT - * flagged ires in the bucket, and send the packet - * through all crossed RTF_MULTIRT routes. - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Multirouting case. The bucket where ire is stored - * probably holds other RTF_MULTIRT flagged ires - * to the destination. 
In this call to ip_xmit_v6, - * we attempt to send the packet through all - * those ires. Thus, we first ensure that ire is the - * first RTF_MULTIRT ire in the bucket, - * before walking the ire list. - */ - ire_t *first_ire; - irb_t *irb = ire->ire_bucket; - ASSERT(irb != NULL); - multirt_send = B_TRUE; - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6, - &ire->ire_addr_v6)) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - IRE_REFHOLD(first_ire); - /* ire will be released by the caller */ - ire = first_ire; - nce = ire->ire_nce; - stq = ire->ire_stq; - ill = ire_to_ill(ire); - } - IRB_REFRELE(irb); - } else if (connp != NULL && IPCL_IS_TCP(connp) && - connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt && - ILL_MDT_USABLE(ill)) { - /* - * This tcp connection was marked as MDT-capable, but - * it has been turned off due changes in the interface. - * Now that the interface support is back, turn it on - * by notifying tcp. We don't directly modify tcp_mdt, - * since we leave all the details to the tcp code that - * knows better. - */ - mblk_t *mdimp = ip_mdinfo_alloc(ill->ill_mdt_capab); - - if (mdimp == NULL) { - ip0dbg(("ip_xmit_v6: can't re-enable MDT for " - "connp %p (ENOMEM)\n", (void *)connp)); - } else { - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp, - tcp_input, connp, SQ_FILL, - SQTAG_TCP_INPUT_MCTL); - } - } - - do { - mblk_t *mp_ip6h; - - if (multirt_send) { - irb_t *irb; - /* - * We are in a multiple send case, need to get - * the next ire and make a duplicate of the - * packet. ire1 holds here the next ire to - * process in the bucket. 
If multirouting is - * expected, any non-RTF_MULTIRT ire that has - * the right destination address is ignored. - */ - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &ire1->ire_addr_v6, - &ire->ire_addr_v6)) - continue; - if (ire1->ire_marks & - IRE_MARK_CONDEMNED) - continue; - - /* Got one */ - if (ire1 != save_ire) { - IRE_REFHOLD(ire1); - } - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - ill_index = - ((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex; - - /* Initiate IPPF processing */ - if (IP6_OUT_IPP(flags, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, ill_index); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - mp_ip6h = mp; - - /* - * Check for fastpath, we need to hold nce_lock to - * prevent fastpath update from chaining nce_fp_mp. 
- */ - - ASSERT(nce->nce_ipversion != IPV4_VERSION); - mutex_enter(&nce->nce_lock); - if ((mp1 = nce->nce_fp_mp) != NULL) { - uchar_t *rptr; - - hlen = MBLKL(mp1); - rptr = mp->b_rptr - hlen; - /* - * make sure there is room for the fastpath - * datalink header - */ - if (rptr < mp->b_datap->db_base) { - mp1 = copyb(mp1); - mutex_exit(&nce->nce_lock); - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - mp1->b_cont = mp; - - /* Get the priority marking, if any */ - mp1->b_band = mp->b_band; - mp = mp1; - } else { - mp->b_rptr = rptr; - /* - * fastpath - pre-pend datalink - * header - */ - bcopy(mp1->b_rptr, rptr, hlen); - mutex_exit(&nce->nce_lock); - fp_prepend = B_TRUE; - } - } else { - /* - * Get the DL_UNITDATA_REQ. - */ - mp1 = nce->nce_res_mp; - if (mp1 == NULL) { - mutex_exit(&nce->nce_lock); - ip1dbg(("ip_xmit_v6: No resolution " - "block ire = %p\n", (void *)ire)); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - /* - * Prepend the DL_UNITDATA_REQ. 
- */ - mp1 = copyb(mp1); - mutex_exit(&nce->nce_lock); - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - mp1->b_cont = mp; - - /* Get the priority marking, if any */ - mp1->b_band = mp->b_band; - mp = mp1; - } - - out_ill = (ill_t *)stq->q_ptr; - - DTRACE_PROBE4(ip6__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ip6_t *, ip6h, mblk_t *, mp); + fip6h = (ip6_t *)hmp->b_rptr; + fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len); - FW_HOOKS6(ipst->ips_ip6_physical_out_event, - ipst->ips_ipv6firewall_physical_out, - NULL, out_ill, ip6h, mp, mp_ip6h, 0, ipst); + bcopy(ip6h, fip6h, unfragmentable_len); + fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t)); + hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - DTRACE_PROBE1(ip6__physical__out__end, mblk_t *, mp); + fraghdr->ip6f_nxt = nexthdr; + fraghdr->ip6f_reserved = 0; + fraghdr->ip6f_offlg = 0; + fraghdr->ip6f_ident = htonl(ident); - if (mp == NULL) { - if (multirt_send) { - ASSERT(ire1 != NULL); - if (ire != save_ire) { - ire_refrele(ire); - } - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } else { - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - break; - } - } + /* Get the priority marking, if any */ + hmp->b_band = priority; - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t szone; + /* + * Move read ptr past unfragmentable portion, we don't want this part + * of the data in our fragments. + */ + mp->b_rptr += unfragmentable_len; + hmp->b_cont = mp; + return (hmp); +} - /* - * Both of these functions expect b_rptr to - * be where the IPv6 header starts, so advance - * past the link layer header. 
- */ - if (fp_prepend) - mp_ip6h->b_rptr += hlen; - szone = ip_get_zoneid_v6(&ip6h->ip6_src, - mp_ip6h, out_ill, ipst, ALL_ZONES); - ipobs_hook(mp_ip6h, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, out_ill, ipst); - if (fp_prepend) - mp_ip6h->b_rptr -= hlen; - } +/* + * Determine if the ill and multicast aspects of that packets + * "matches" the conn. + */ +boolean_t +conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h) +{ + ill_t *ill = ira->ira_rill; + zoneid_t zoneid = ira->ira_zoneid; + uint_t in_ifindex; + in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; + in6_addr_t *v6src_ptr = &ip6h->ip6_src; - /* - * Update ire and MIB counters; for save_ire, this has - * been done by the caller. - */ - if (ire != save_ire) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; + /* + * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local + * scopeid. This is used to limit + * unicast and multicast reception to conn_incoming_ifindex. + * conn_wantpacket_v6 is called both for unicast and + * multicast packets. + */ + in_ifindex = connp->conn_incoming_ifindex; - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN); - } - } + /* mpathd can bind to the under IPMP interface, which we allow */ + if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) { + if (!IS_UNDER_IPMP(ill)) + return (B_FALSE); - /* - * Send it down. XXX Do we want to flow control AH/ESP - * packets that carry TCP payloads? We don't flow - * control TCP packets, but we should also not - * flow-control TCP packets that have been protected. - * We don't have an easy way to find out if an AH/ESP - * packet was originally TCP or not currently. 
- */ - if (io == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, - out_ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 0); - - putnext(stq, mp); - } else { - /* - * Safety Pup says: make sure this is - * going to the right interface! - */ - if (io->ipsec_out_capab_ill_index != - ill_index) { - /* IPsec kstats: bump lose counter */ - freemsg(mp1); - } else { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutOctets, - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN); - DTRACE_IP7(send, mblk_t *, mp, - conn_t *, NULL, void_ip_t *, ip6h, - __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, NULL, ip6_t *, ip6h, int, - 0); - ipsec_hw_putnext(stq, mp); - } - } + if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill)) + return (B_FALSE); + } - if (nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT)) { - if (ire != save_ire) { - ire_refrele(ire); - } - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - return; - } + if (!IPCL_ZONE_MATCH(connp, zoneid)) + return (B_FALSE); - ASSERT(nce->nce_state != ND_INCOMPLETE); + if (!(ira->ira_flags & IRAF_MULTICAST)) + return (B_TRUE); - /* - * Check for upper layer advice - */ - if (flags & IPV6_REACHABILITY_CONFIRMATION) { - /* - * It should be o.k. to check the state without - * a lock here, at most we lose an advice. 
- */ - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state != ND_REACHABLE) { - - mutex_enter(&nce->nce_lock); - nce->nce_state = ND_REACHABLE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - (void) untimeout(nce->nce_timeout_id); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_xmit_v6: state" - " for %s changed to" - " REACHABLE\n", AF_INET6, - &ire->ire_addr_v6); - } - } - if (ire != save_ire) { - ire_refrele(ire); - } - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - return; - } + if (connp->conn_multi_router) + return (B_TRUE); - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - ip1dbg(("ip_xmit_v6: delta = %" PRId64 - " ill_reachable_time = %d \n", delta, - ill->ill_reachable_time)); - if (delta > (uint64_t)ill->ill_reachable_time) { - nce = ire->ire_nce; - mutex_enter(&nce->nce_lock); - switch (nce->nce_state) { - case ND_REACHABLE: - case ND_STALE: - /* - * ND_REACHABLE is identical to - * ND_STALE in this specific case. If - * reachable time has expired for this - * neighbor (delta is greater than - * reachable time), conceptually, the - * neighbor cache is no longer in - * REACHABLE state, but already in - * STALE state. So the correct - * transition here is to ND_DELAY. 
- */ - nce->nce_state = ND_DELAY; - mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, - ipst->ips_delay_first_probe_time); - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_xmit_v6: state" - " for %s changed to" - " DELAY\n", AF_INET6, - &ire->ire_addr_v6); - } - break; - case ND_DELAY: - case ND_PROBE: - mutex_exit(&nce->nce_lock); - /* Timers have already started */ - break; - case ND_UNREACHABLE: - /* - * ndp timer has detected that this nce - * is unreachable and initiated deleting - * this nce and all its associated IREs. - * This is a race where we found the - * ire before it was deleted and have - * just sent out a packet using this - * unreachable nce. - */ - mutex_exit(&nce->nce_lock); - break; - default: - ASSERT(0); - } - } + if (ira->ira_protocol == IPPROTO_RSVP) + return (B_TRUE); - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT ire, - * Also set up the send-to queue accordingly. - */ - if (ire != save_ire) { - ire_refrele(ire); - } - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - /* - * In the multirouting case, release the last ire used for - * emission. save_ire will be released by the caller. - */ - if (ire != save_ire) { - ire_refrele(ire); - } - } else { - /* - * Can't apply backpressure, just discard the packet. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } + return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, + ira->ira_ill)); } /* @@ -12189,37 +4488,52 @@ pr_addr_dbg(char *fmt1, int af, const void *addr) /* - * Return the length in bytes of the IPv6 headers (base header, ip6i_t - * if needed and extension headers) that will be needed based on the - * ip6_pkt_t structure passed by the caller. 
+ * Return the length in bytes of the IPv6 headers (base header + * extension headers) that will be needed based on the + * ip_pkt_t structure passed by the caller. * * The returned length does not include the length of the upper level * protocol (ULP) header. */ int -ip_total_hdrs_len_v6(ip6_pkt_t *ipp) +ip_total_hdrs_len_v6(const ip_pkt_t *ipp) { int len; len = IPV6_HDR_LEN; - if (ipp->ipp_fields & IPPF_HAS_IP6I) - len += sizeof (ip6i_t); - if (ipp->ipp_fields & IPPF_HOPOPTS) { + + /* + * If there's a security label here, then we ignore any hop-by-hop + * options the user may try to set. + */ + if (ipp->ipp_fields & IPPF_LABEL_V6) { + uint_t hopoptslen; + /* + * Note that ipp_label_len_v6 is just the option - not + * the hopopts extension header. It also needs to be padded + * to a multiple of 8 bytes. + */ + ASSERT(ipp->ipp_label_len_v6 != 0); + hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t); + hopoptslen = (hopoptslen + 7)/8 * 8; + len += hopoptslen; + } else if (ipp->ipp_fields & IPPF_HOPOPTS) { ASSERT(ipp->ipp_hopoptslen != 0); len += ipp->ipp_hopoptslen; } - if (ipp->ipp_fields & IPPF_RTHDR) { - ASSERT(ipp->ipp_rthdrlen != 0); - len += ipp->ipp_rthdrlen; - } + /* * En-route destination options * Only do them if there's a routing header as well */ - if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) == - (IPPF_RTDSTOPTS|IPPF_RTHDR)) { - ASSERT(ipp->ipp_rtdstoptslen != 0); - len += ipp->ipp_rtdstoptslen; + if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) == + (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) { + ASSERT(ipp->ipp_rthdrdstoptslen != 0); + len += ipp->ipp_rthdrdstoptslen; + } + if (ipp->ipp_fields & IPPF_RTHDR) { + ASSERT(ipp->ipp_rthdrlen != 0); + len += ipp->ipp_rthdrlen; } if (ipp->ipp_fields & IPPF_DSTOPTS) { ASSERT(ipp->ipp_dstoptslen != 0); @@ -12230,80 +4544,40 @@ ip_total_hdrs_len_v6(ip6_pkt_t *ipp) /* * All-purpose routine to build a header chain of an IPv6 header - * followed by any required extension headers and a proto header, - * 
preceeded (where necessary) by an ip6i_t private header. + * followed by any required extension headers and a proto header. * - * The fields of the IPv6 header that are derived from the ip6_pkt_t - * will be filled in appropriately. - * Thus the caller must fill in the rest of the IPv6 header, such as - * traffic class/flowid, source address (if not set here), hoplimit (if not - * set here) and destination address. + * The caller has to set the source and destination address as well as + * ip6_plen. The caller has to massage any routing header and compensate + * for the ULP pseudo-header checksum due to the source route. * - * The extension headers and ip6i_t header will all be fully filled in. + * The extension headers will all be fully filled in. */ void -ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, - ip6_pkt_t *ipp, uint8_t protocol) +ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp, + uint8_t protocol, uint32_t flowinfo) { uint8_t *nxthdr_ptr; uint8_t *cp; - ip6i_t *ip6i; - ip6_t *ip6h = (ip6_t *)ext_hdrs; + ip6_t *ip6h = (ip6_t *)buf; - /* - * If sending private ip6i_t header down (checksum info, nexthop, - * or ifindex), adjust ip header pointer and set ip6i_t header pointer, - * then fill it in. (The checksum info will be filled in by icmp). - */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)ip6h; - ip6h = (ip6_t *)&ip6i[1]; - - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - if (ipp->ipp_fields & IPPF_IFINDEX || - ipp->ipp_fields & IPPF_SCOPE_ID) { - ASSERT(ipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = ipp->ipp_ifindex; - } - if (ipp->ipp_fields & IPPF_ADDR) { - /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. 
- */ - ASSERT(!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)); - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } - if (ipp->ipp_fields & IPPF_UNICAST_HOPS) { - ip6h->ip6_hops = ipp->ipp_unicast_hops; - /* - * We need to set this flag so that IP doesn't - * rewrite the IPv6 header's hoplimit with the - * current default value. - */ - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } - if (ipp->ipp_fields & IPPF_NEXTHOP) { - ASSERT(!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = ipp->ipp_nexthop; - } - /* - * tell IP this is an ip6i_t private header - */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { - ip6h->ip6_vcf = (ip6h->ip6_vcf & ~IPV6_FLOWINFO_TCLASS) | - (ipp->ipp_tclass << 20); + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); } - if (ipp->ipp_fields & IPPF_ADDR) + + if (ipp->ipp_fields & IPPF_HOPLIMIT) + ip6h->ip6_hops = ipp->ipp_hoplimit; + else + ip6h->ip6_hops = ipp->ipp_unicast_hops; + + if ((ipp->ipp_fields & IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) ip6h->ip6_src = ipp->ipp_addr; nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; @@ -12313,7 +4587,47 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * any extension headers in the right order: * Hop-by-hop, destination, routing, and final destination opts. */ - if (ipp->ipp_fields & IPPF_HOPOPTS) { + /* + * If there's a security label here, then we ignore any hop-by-hop + * options the user may try to set. + */ + if (ipp->ipp_fields & IPPF_LABEL_V6) { + /* + * Hop-by-hop options with the label. + * Note that ipp_label_v6 is just the option - not + * the hopopts extension header. It also needs to be padded + * to a multiple of 8 bytes. 
+ */ + ip6_hbh_t *hbh = (ip6_hbh_t *)cp; + uint_t hopoptslen; + uint_t padlen; + + padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t); + hopoptslen = (padlen + 7)/8 * 8; + padlen = hopoptslen - padlen; + + *nxthdr_ptr = IPPROTO_HOPOPTS; + nxthdr_ptr = &hbh->ip6h_nxt; + hbh->ip6h_len = hopoptslen/8 - 1; + cp += sizeof (ip6_hbh_t); + bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6); + cp += ipp->ipp_label_len_v6; + + ASSERT(padlen <= 7); + switch (padlen) { + case 0: + break; + case 1: + cp[0] = IP6OPT_PAD1; + break; + default: + cp[0] = IP6OPT_PADN; + cp[1] = padlen - 2; + bzero(&cp[2], padlen - 2); + break; + } + cp += padlen; + } else if (ipp->ipp_fields & IPPF_HOPOPTS) { /* Hop-by-hop options */ ip6_hbh_t *hbh = (ip6_hbh_t *)cp; @@ -12327,15 +4641,15 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * En-route destination options * Only do them if there's a routing header as well */ - if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) == - (IPPF_RTDSTOPTS|IPPF_RTHDR)) { + if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) == + (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) { ip6_dest_t *dst = (ip6_dest_t *)cp; *nxthdr_ptr = IPPROTO_DSTOPTS; nxthdr_ptr = &dst->ip6d_nxt; - bcopy(ipp->ipp_rtdstopts, cp, ipp->ipp_rtdstoptslen); - cp += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen); + cp += ipp->ipp_rthdrdstoptslen; } /* * Routing header next @@ -12365,7 +4679,7 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * Now set the last header pointer to the proto passed in */ *nxthdr_ptr = protocol; - ASSERT((int)(cp - ext_hdrs) == ext_hdrs_len); + ASSERT((int)(cp - buf) == buf_len); } /* @@ -12509,108 +4823,28 @@ ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns) return (cksm); } -/* - * Propagate a multicast group membership operation (join/leave) (*fn) on - * all interfaces crossed by the related multirt routes. 
- * The call is considered successful if the operation succeeds - * on at least one interface. - * The function is called if the destination address in the packet to send - * is multirouted. - */ -int -ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *), - ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6grp, - mcast_record_t fmode, const in6_addr_t *v6src, mblk_t *first_mp) -{ - ire_t *ire_gw; - irb_t *irb; - int index, error = 0; - opt_restart_t *or; - ip_stack_t *ipst = ire->ire_ipst; - - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - ASSERT(DB_TYPE(first_mp) == M_CTL); - or = (opt_restart_t *)first_mp->b_rptr; - - IRB_REFHOLD(irb); - for (; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_flags & RTF_MULTIRT) == 0) - continue; - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6grp)) - continue; - - ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); - /* No resolver exists for the gateway; skip this ire. */ - if (ire_gw == NULL) - continue; - index = ire_gw->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; - /* - * A resolver exists: we can get the interface on which we have - * to apply the operation. - */ - error = fn(connp, checkonly, v6grp, index, fmode, v6src, - first_mp); - if (error == 0) - or->or_private = CGTP_MCAST_SUCCESS; - - if (ip_debug > 0) { - ulong_t off; - char *ksym; - - ksym = kobj_getsymname((uintptr_t)fn, &off); - ip2dbg(("ip_multirt_apply_membership_v6: " - "called %s, multirt group 0x%08x via itf 0x%08x, " - "error %d [success %u]\n", - ksym ? 
ksym : "?", - ntohl(V4_PART_OF_V6((*v6grp))), - ntohl(V4_PART_OF_V6(ire_gw->ire_src_addr_v6)), - error, or->or_private)); - } - - ire_refrele(ire_gw); - if (error == EINPROGRESS) { - IRB_REFRELE(irb); - return (error); - } - } - IRB_REFRELE(irb); - /* - * Consider the call as successful if we succeeded on at least - * one interface. Otherwise, return the last encountered error. - */ - return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); -} - void *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp) { kstat_t *ksp; ip6_stat_t template = { - { "ip6_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip6_udp_slow_path", KSTAT_DATA_UINT64 }, { "ip6_udp_fannorm", KSTAT_DATA_UINT64 }, { "ip6_udp_fanmb", KSTAT_DATA_UINT64 }, + { "ip6_recv_pullup", KSTAT_DATA_UINT64 }, + { "ip6_db_ref", KSTAT_DATA_UINT64 }, + { "ip6_notaligned", KSTAT_DATA_UINT64 }, + { "ip6_multimblk", KSTAT_DATA_UINT64 }, + { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, { "ip6_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip6_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip6_in_sw_cksum", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip6_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip6_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_discarded", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_allocd", KSTAT_DATA_UINT64 }, }; ksp = kstat_create_netstack("ip", 0, "ip6stat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), @@ -12641,7 +4875,7 @@ ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp) * IPV6_SRC_PREFERENCES socket 
option. */ int -ip6_set_src_preferences(conn_t *connp, uint32_t prefs) +ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs) { /* * We only support preferences that are covered by @@ -12675,47 +4909,15 @@ ip6_set_src_preferences(conn_t *connp, uint32_t prefs) return (EINVAL); } - connp->conn_src_preferences = prefs; + ixa->ixa_src_preferences = prefs; return (0); } size_t -ip6_get_src_preferences(conn_t *connp, uint32_t *val) +ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val) { - *val = connp->conn_src_preferences; - return (sizeof (connp->conn_src_preferences)); -} - -int -ip6_set_pktinfo(cred_t *cr, conn_t *connp, struct in6_pktinfo *pkti) -{ - ire_t *ire; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - /* - * Verify the source address and ifindex. Privileged users can use - * any source address. For ancillary data the source address is - * checked in ip_wput_v6. - */ - if (pkti->ipi6_ifindex != 0) { - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (!phyint_exists(pkti->ipi6_ifindex, ipst)) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENXIO); - } - rw_exit(&ipst->ips_ill_g_lock); - } - if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) && - secpolicy_net_rawaccess(cr) != 0) { - ire = ire_route_lookup_v6(&pkti->ipi6_addr, 0, 0, - (IRE_LOCAL|IRE_LOOPBACK), NULL, NULL, - connp->conn_zoneid, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) - ire_refrele(ire); - else - return (ENXIO); - } - return (0); + *val = ixa->ixa_src_preferences; + return (sizeof (ixa->ixa_src_preferences)); } /* @@ -12743,7 +4945,7 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) whereptr = (uint8_t *)&ip6h[1]; for (;;) { /* Assume IP has already stripped it */ - ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW); + ASSERT(nexthdr != IPPROTO_FRAGMENT); switch (nexthdr) { case IPPROTO_HOPOPTS: hbhhdr = (ip6_hbh_t *)whereptr; @@ -12815,11 +5017,12 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) * inside the IPSQ (ill_g_lock is not 
held), `ill' may be removed from the * group during or after this lookup. */ -static boolean_t +boolean_t ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp) { ipif_t *ipif; + ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst); if (ipif != NULL) { if (ipifp != NULL) diff --git a/usr/src/uts/common/inet/ip/ip6_asp.c b/usr/src/uts/common/inet/ip/ip6_asp.c index d54e821359..5c499e6526 100644 --- a/usr/src/uts/common/inet/ip/ip6_asp.c +++ b/usr/src/uts/common/inet/ip/ip6_asp.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/socket.h> #include <sys/ksynch.h> @@ -41,6 +39,7 @@ #include <inet/ip6.h> #include <inet/ip6_asp.h> #include <inet/ip_ire.h> +#include <inet/ip_if.h> #include <inet/ipclassifier.h> #define IN6ADDR_MASK128_INIT \ @@ -415,18 +414,13 @@ ip6_asp_replace(mblk_t *mp, ip6_asp_t *new_table, size_t new_size, ipst->ips_ip6_asp_table = tmp_table; ipst->ips_ip6_asp_table_count = count; - /* - * The user has changed the address selection policy table. IPv6 - * source address selection for existing IRE_CACHE and - * RTF_DYNAMIC entries used the old table, so we need to - * clear the cache. 
- */ - ire_walk_v6(ire_delete_cache_v6, NULL, ALL_ZONES, ipst); - unlock_end: ipst->ips_ip6_asp_uip = B_FALSE; mutex_exit(&ipst->ips_ip6_asp_lock); + /* Let conn_ixa caching know that source address selection changed */ + ip_update_source_selection(ipst); + replace_end: /* Reply to the ioctl */ q = (queue_t *)mp->b_prev; diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index a986a755ac..364a44b9d4 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -76,12 +76,13 @@ static in6_addr_t ipv6_ll_template = static ipif_t * ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst); + ip_stack_t *ipst); + +static int ipif_add_ires_v6(ipif_t *, boolean_t); /* - * These two functions, ipif_lookup_group_v6() and ill_lookup_group_v6(), - * are called when an application does not specify an interface to be - * used for multicast traffic. It calls ire_lookup_multi_v6() to look + * This function is called when an application does not specify an interface + * to be used for multicast traffic. It calls ire_lookup_multi_v6() to look * for an interface route for the specified multicast group. Doing * this allows the administrator to add prefix routes for multicast to * indicate which interface to be used for multicast traffic in the above @@ -89,47 +90,21 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, * multicast group (a /128 route) or anything in between. If there is no * such multicast route, we just find any multicast capable interface and * return it. + * + * We support MULTIRT and RTF_SETSRC on the multicast routes added to the + * unicast table. This is used by CGTP. 
*/ -ipif_t * -ipif_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - ipif_t *ipif; - - ire = ire_lookup_multi_v6(group, zoneid, ipst); - if (ire != NULL) { - ipif = ire->ire_ipif; - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } - - return (ipif_lookup_multicast(ipst, zoneid, B_TRUE)); -} - ill_t * -ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) +ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, in6_addr_t *setsrcp) { - ire_t *ire; ill_t *ill; - ipif_t *ipif; - ire = ire_lookup_multi_v6(group, zoneid, ipst); - if (ire != NULL) { - ill = ire->ire_ipif->ipif_ill; - ill_refhold(ill); - ire_refrele(ire); + ill = ire_lookup_multi_ill_v6(group, zoneid, ipst, multirtp, setsrcp); + if (ill != NULL) return (ill); - } - - ipif = ipif_lookup_multicast(ipst, zoneid, B_TRUE); - if (ipif == NULL) - return (NULL); - ill = ipif->ipif_ill; - ill_refhold(ill); - ipif_refrele(ipif); - return (ill); + return (ill_lookup_multicast(ipst, zoneid, B_TRUE)); } /* @@ -138,16 +113,12 @@ ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) */ static ipif_t * ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; - ipsq_t *ipsq; ill_walk_context_t ctx; - if (error != NULL) - *error = 0; - /* * First match all the point-to-point interfaces * before looking at non-point-to-point interfaces. 
@@ -157,7 +128,6 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -167,36 +137,19 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, if_addr)) && (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, dst))) { - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } rw_exit(&ipst->ips_ill_g_lock); /* lookup the ipif based on interface address */ - ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, q, mp, func, - error, ipst); + ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, ipst); ASSERT(ipif == NULL || ipif->ipif_isv6); return (ipif); } @@ -206,17 +159,14 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, */ static ipif_t * ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill, - boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) + uint32_t match_flags, zoneid_t zoneid, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; boolean_t ptp = B_FALSE; - ipsq_t *ipsq; ill_walk_context_t ctx; - - if (error != NULL) - *error = 0; + boolean_t match_illgrp = (match_flags & 
IPIF_MATCH_ILLGRP); + boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* @@ -230,7 +180,6 @@ repeat: (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -238,6 +187,12 @@ repeat: ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + + if (no_duplicate && + !(ipif->ipif_flags & IPIF_UP)) { + continue; + } + /* Allow the ipif to be down */ if ((!ptp && (IN6_ARE_ADDR_EQUAL( &ipif->ipif_v6lcl_addr, addr) && @@ -245,82 +200,26 @@ repeat: (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, addr))) { - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } /* If we already did the ptp case, then we are done */ if (ptp) { rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENXIO; return (NULL); } ptp = B_TRUE; goto repeat; } -boolean_t -ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, - ip_stack_t *ipst) -{ - ipif_t *ipif; - ill_t *ill; - ill_walk_context_t ctx; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - - ill = ILL_START_WALK_V6(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = 
ipif->ipif_next) { - if (zoneid != ALL_ZONES && - ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* Allow the ipif to be down */ - if (((IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - addr) && - (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || - ((ipif->ipif_flags & IPIF_POINTOPOINT) && - IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, - addr))) { - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (B_TRUE); - } - } - mutex_exit(&ill->ill_lock); - } - - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); -} - /* * Lookup an ipif with the specified address. For point-to-point links we * look for matches on either the destination address or the local address, @@ -330,10 +229,24 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, */ ipif_t * ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + ip_stack_t *ipst) { - return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q, - mp, func, error, ipst)); + return (ipif_lookup_addr_common_v6(addr, match_ill, IPIF_MATCH_ILLGRP, + zoneid, ipst)); +} + +/* + * Lookup an ipif with the specified address. 
Similar to ipif_lookup_addr, + * except that we will only return an address if it is not marked as + * IPIF_DUPLICATE + */ +ipif_t * +ipif_lookup_addr_nondup_v6(const in6_addr_t *addr, ill_t *match_ill, + zoneid_t zoneid, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common_v6(addr, match_ill, + (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), zoneid, + ipst)); } /* @@ -346,8 +259,8 @@ ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill, ip_stack_t *ipst) { ASSERT(match_ill != NULL); - return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst)); + return (ipif_lookup_addr_common_v6(addr, match_ill, 0, ALL_ZONES, + ipst)); } /* @@ -473,23 +386,22 @@ ip_remote_addr_ok_v6(const in6_addr_t *addr, const in6_addr_t *subnet_mask) /* * ip_rt_add_v6 is called to add an IPv6 route to the forwarding table. - * ipif_arg is passed in to associate it with the correct interface + * ill is passed in to associate it with the correct interface * (for link-local destinations and gateways). + * If ire_arg is set, then we return the held IRE in that location. 
*/ /* ARGSUSED1 */ int ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, const in6_addr_t *gw_addr, const in6_addr_t *src_addr, int flags, - ipif_t *ipif_arg, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func, - struct rtsa_s *sp, ip_stack_t *ipst) + ill_t *ill, ire_t **ire_arg, struct rtsa_s *sp, ip_stack_t *ipst, + zoneid_t zoneid) { - ire_t *ire; + ire_t *ire, *nire; ire_t *gw_ire = NULL; ipif_t *ipif; - boolean_t ipif_refheld = B_FALSE; uint_t type; int match_flags = MATCH_IRE_TYPE; - int error; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; @@ -514,14 +426,19 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* * Get the ipif, if any, corresponding to the gw_addr + * If -ifp was specified we restrict ourselves to the ill, otherwise + * we match on the gatway and destination to handle unnumbered pt-pt + * interfaces. */ - ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, - &error, ipst); - if (ipif != NULL) - ipif_refheld = B_TRUE; - else if (error == EINPROGRESS) { - ip1dbg(("ip_rt_add_v6: null and EINPROGRESS")); - return (error); + if (ill != NULL) + ipif = ipif_lookup_addr_v6(gw_addr, ill, ALL_ZONES, ipst); + else + ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst); + if (ipif != NULL) { + if (IS_VNI(ipif->ipif_ill)) { + ipif_refrele(ipif); + return (EINVAL); + } } /* @@ -535,57 +452,74 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (IN6_ARE_ADDR_EQUAL(gw_addr, &ipv6_loopback) && IN6_ARE_ADDR_EQUAL(dst_addr, &ipv6_loopback) && IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) { - ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK, + NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - ip1dbg(("ipif_up_done: 0x%p 
creating IRE 0x%x" + ip1dbg(("ip_rt_add_v6: 0x%p creating IRE 0x%x" "for 0x%x\n", (void *)ipif, ipif->ipif_ire_type, ntohl(ipif->ipif_lcl_addr))); ire = ire_create_v6( dst_addr, mask, - &ipif->ipif_v6src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - NULL, - ipif->ipif_net_type, - ipif, - NULL, - 0, - 0, - flags, - &ire_uinfo_null, NULL, + ipif->ipif_ire_type, /* LOOPBACK */ + ipif->ipif_ill, + zoneid, + (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, NULL, ipst); + if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); + return (ENOMEM); + } + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && + !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); return (ENOMEM); } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); + return (EEXIST); + } + ire = nire; + goto save_ire; } } /* + * The routes for multicast with CGTP are quite special in that + * the gateway is the local interface address, yet RTF_GATEWAY + * is set. We turn off RTF_GATEWAY to provide compatibility with + * this undocumented and unusual use of multicast routes. 
+ */ + if ((flags & RTF_MULTIRT) && ipif != NULL) + flags &= ~RTF_GATEWAY; + + /* * Traditionally, interface routes are ones where RTF_GATEWAY isn't set * and the gateway address provided is one of the system's interface * addresses. By using the routing socket interface and supplying an @@ -619,8 +553,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * logical interfaces * * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 - * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 - * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 + * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 + * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 * * the ipif's corresponding to each of these interface routes can be * uniquely identified by the "gateway" (actually interface address). @@ -635,90 +569,68 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* RTF_GATEWAY not set */ if (!(flags & RTF_GATEWAY)) { - queue_t *stq; - if (sp != NULL) { ip2dbg(("ip_rt_add_v6: gateway security attributes " "cannot be set with interface route\n")); - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (EINVAL); } /* - * As the interface index specified with the RTA_IFP sockaddr is - * the same for all ipif's off of an ill, the matching logic - * below uses MATCH_IRE_ILL if such an index was specified. - * This means that routes sharing the same prefix when added - * using a RTA_IFP sockaddr must have distinct interface - * indices (namely, they must be on distinct ill's). - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix - * can be created if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * Whether or not ill (RTA_IFP) is set, we require that + * the gateway is one of our local addresses. 
*/ - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - /* - * Check the ipif corresponding to the gw_addr - */ - if (ipif == NULL) - return (ENETUNREACH); - match_flags |= MATCH_IRE_IPIF; + if (ipif == NULL) + return (ENETUNREACH); + + /* + * We use MATCH_IRE_ILL here. If the caller specified an + * interface (from the RTA_IFP sockaddr) we use it, otherwise + * we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. + * We don't allow RTA_IFP to specify a different ill than the + * one matching the ipif to make sure we can delete the route. + */ + match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; + if (ill == NULL) { + ill = ipif->ipif_ill; + } else if (ill != ipif->ipif_ill) { + ipif_refrele(ipif); + return (EINVAL); } - ASSERT(ipif != NULL); /* * We check for an existing entry at this point. */ match_flags |= MATCH_IRE_MASK; - ire = ire_ftable_lookup_v6(dst_addr, mask, 0, IRE_INTERFACE, - ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - /* * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or - * IRE_IF_RESOLVER with the modified address and netmask. + * IRE_IF_RESOLVER with the modified address, netmask, and + * gateway. 
*/ ire = ire_create_v6( dst_addr, mask, - &ipif->ipif_v6src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - stq, - ipif->ipif_net_type, - ipif, - NULL, - 0, - 0, + gw_addr, + ill->ill_net_type, + ill, + zoneid, flags, - &ire_uinfo_null, - NULL, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } @@ -731,32 +643,44 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * RTF_BLACKHOLE flag as these interface routes, by * definition, can only be that. * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. * * Needless to say, the real IRE_LOOPBACK is NOT created by this * routine, but rather using ire_create_v6() directly. */ - if (ipif->ipif_net_type == IRE_LOOPBACK) { + if (ill->ill_net_type == IRE_LOOPBACK) { ire->ire_type = IRE_IF_NORESOLVER; ire->ire_flags |= RTF_BLACKHOLE; } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. 
This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); - } - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; + return (EEXIST); + } + ire = nire; + goto save_ire; } /* @@ -764,14 +688,23 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the * gateway, it is currently unreachable and we fail the request * accordingly. + * If RTA_IFP was specified we look on that particular ill. */ - ipif = ipif_arg; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; - gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + + /* Check whether the gateway is reachable. */ + type = IRE_INTERFACE; + if (flags & RTF_INDIRECT) + type |= IRE_OFFLINK; + + gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); + if (gw_ire == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -789,10 +722,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, type = IRE_PREFIX; /* check for a duplicate entry */ - ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, - match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, + match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 0, ipst, NULL); if (ire != NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_refrele(ire); return (EEXIST); @@ -809,6 +744,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* we hold reference to it upon success */ gcgrp = gcgrp_lookup(&ga, 
B_TRUE); if (gcgrp == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -824,6 +761,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (gc == NULL) { /* release reference held by gcgrp_lookup */ GCGRP_REFRELE(gcgrp); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -833,23 +772,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, ire = ire_create_v6( dst_addr, /* dest address */ mask, /* mask */ - /* src address assigned by the caller? */ - (((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) ? - src_addr : NULL), gw_addr, /* gateway address */ - &gw_ire->ire_max_frag, - NULL, /* no src nce */ - NULL, /* no recv-from queue */ - NULL, /* no send-to queue */ (ushort_t)type, /* IRE type */ - ipif_arg, - NULL, - 0, - 0, + ill, + zoneid, flags, - &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ gc, /* security attribute */ - NULL, ipst); /* @@ -862,26 +790,48 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (ire == NULL) { if (gc != NULL) GC_REFRELE(gc); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow ::0 ? */ /* Add the new IRE. */ - error = ire_add(&ire, q, mp, func, B_FALSE); + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + if (ipif != NULL) + ipif_refrele(ipif); + ire_refrele(gw_ire); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. 
+ * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (error != 0) { + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); - return (error); + return (EEXIST); } + ire = nire; if (flags & RTF_MULTIRT) { /* @@ -896,70 +846,51 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (ipst->ips_ip_cgtp_filter_ops != NULL && !IN6_IS_ADDR_MULTICAST(&(ire->ire_addr_v6))) { int res; - - res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v6( - ipst->ips_netstack->netstack_stackid, - &ire->ire_addr_v6, - &ire->ire_gateway_addr_v6, - &ire->ire_src_addr_v6, - &gw_ire->ire_src_addr_v6); + ipif_t *src_ipif; + + /* Find the source address corresponding to gw_ire */ + src_ipif = ipif_lookup_addr_v6( + &gw_ire->ire_gateway_addr_v6, NULL, zoneid, ipst); + if (src_ipif != NULL) { + res = ipst->ips_ip_cgtp_filter_ops-> + cfo_add_dest_v6( + ipst->ips_netstack->netstack_stackid, + &ire->ire_addr_v6, + &ire->ire_gateway_addr_v6, + &ire->ire_setsrc_addr_v6, + &src_ipif->ipif_v6lcl_addr); + ipif_refrele(src_ipif); + } else { + res = EADDRNOTAVAIL; + } if (res != 0) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_delete(ire); + ire_refrele(ire); /* Held in ire_add */ return (res); } } } - /* - * Now that the prefix IRE entry has been created, delete any - * existing gateway IRE cache entries as well as any IRE caches - * using the gateway, and force them to be created through - * ip_newroute_v6. - */ - if (gc != NULL) { - ASSERT(gcgrp != NULL); - ire_clookup_delete_cache_gw_v6(gw_addr, ALL_ZONES, ipst); - } - save_ire: if (gw_ire != NULL) { ire_refrele(gw_ire); + gw_ire = NULL; } - if (ipif != NULL) { - mblk_t *save_mp; - + if (ire->ire_ill != NULL) { /* * Save enough information so that we can recreate the IRE if - * the interface goes down and then up. 
The metrics associated + * the ILL goes down and then up. The metrics associated * with the route will be saved as well when rts_setmetrics() is * called after the IRE has been created. In the case where * memory cannot be allocated, none of this information will be * saved. */ - save_mp = allocb(sizeof (ifrt_t), BPRI_MED); - if (save_mp != NULL) { - ifrt_t *ifrt; - - save_mp->b_wptr += sizeof (ifrt_t); - ifrt = (ifrt_t *)save_mp->b_rptr; - bzero(ifrt, sizeof (ifrt_t)); - ifrt->ifrt_type = ire->ire_type; - ifrt->ifrt_v6addr = ire->ire_addr_v6; - mutex_enter(&ire->ire_lock); - ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; - ifrt->ifrt_v6src_addr = ire->ire_src_addr_v6; - mutex_exit(&ire->ire_lock); - ifrt->ifrt_v6mask = ire->ire_mask_v6; - ifrt->ifrt_flags = ire->ire_flags; - ifrt->ifrt_max_frag = ire->ire_max_frag; - mutex_enter(&ipif->ipif_saved_ire_lock); - save_mp->b_cont = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = save_mp; - ipif->ipif_saved_ire_cnt++; - mutex_exit(&ipif->ipif_saved_ire_lock); - } + ill_save_ire(ire->ire_ill, ire); } + if (ire_arg != NULL) { /* * Store the ire that was successfully added into where ire_arg @@ -971,28 +902,27 @@ save_ire: } else { ire_refrele(ire); /* Held in ire_add */ } - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (0); } /* * ip_rt_delete_v6 is called to delete an IPv6 route. - * ipif_arg is passed in to associate it with the correct interface + * ill is passed in to associate it with the correct interface. * (for link-local destinations and gateways). 
*/ /* ARGSUSED4 */ int ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, - const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ipif_t *ipif_arg, - queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) + const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ill_t *ill, + ip_stack_t *ipst, zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *ipif; uint_t type; uint_t match_flags = MATCH_IRE_TYPE; int err = 0; - boolean_t ipif_refheld = B_FALSE; /* * If this is the case of RTF_HOST being set, then we set the netmask @@ -1012,49 +942,49 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * * This makes it possible to delete an original * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. + * However, we have RTF_KERNEL set on the ones created by ipif_up + * and those can not be deleted here. * - * As the interface index specified with the RTA_IFP sockaddr is the - * same for all ipif's off of an ill, the matching logic below uses - * MATCH_IRE_ILL if such an index was specified. This means a route - * sharing the same prefix and interface index as the the route - * intended to be deleted might be deleted instead if a RTA_IFP sockaddr - * is specified in the request. - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix can be - * uniquely identified if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * We use MATCH_IRE_ILL if we know the interface. If the caller + * specified an interface (from the RTA_IFP sockaddr) we use it, + * otherwise we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. 
* * For more detail on specifying routes by gateway address and by * interface index, see the comments in ip_rt_add_v6(). */ - ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, &err, - ipst); + ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst); if (ipif != NULL) { - ipif_refheld = B_TRUE; - if (ipif_arg != NULL) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - match_flags |= MATCH_IRE_IPIF; + ill_t *ill_match; + + if (ill != NULL) + ill_match = ill; + else + ill_match = ipif->ipif_ill; + + match_flags |= MATCH_IRE_ILL; + if (ipif->ipif_ire_type == IRE_LOOPBACK) { + ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK, + ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); + } + if (ire == NULL) { + match_flags |= MATCH_IRE_GW; + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill_match, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } + /* Avoid deleting routes created by kernel from an ipif */ + if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { + ire_refrele(ire); + ire = NULL; } - if (ipif->ipif_ire_type == IRE_LOOPBACK) - ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, match_flags, ipst); - if (ire == NULL) - ire = ire_ftable_lookup_v6(dst_addr, mask, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - match_flags, ipst); - } else if (err == EINPROGRESS) { - return (err); - } else { - err = 0; + /* Restore in case we didn't find a match */ + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); } + if (ire == NULL) { /* * At this point, the gateway address is not one of our own @@ -1062,15 +992,11 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * set the IRE type to lookup based on whether * this is a host route, a default route or just a prefix. 
* - * If an ipif_arg was passed in, then the lookup is based on an + * If an ill was passed in, then the lookup is based on an * interface index so MATCH_IRE_ILL is added to match_flags. - * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is - * set as the route being looked up is not a traditional - * interface route. */ - match_flags &= ~MATCH_IRE_IPIF; match_flags |= MATCH_IRE_GW; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; if (IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) type = IRE_HOST; @@ -1079,12 +1005,12 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, else type = IRE_PREFIX; ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, - ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ill, ALL_ZONES, NULL, match_flags, 0, ipst, NULL); } - if (ipif_refheld) { + if (ipif != NULL) { ipif_refrele(ipif); - ipif_refheld = B_FALSE; + ipif = NULL; } if (ire == NULL) return (ESRCH); @@ -1103,42 +1029,9 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, } } - ipif = ire->ire_ipif; - if (ipif != NULL) { - mblk_t **mpp; - mblk_t *mp; - ifrt_t *ifrt; - in6_addr_t gw_addr_v6; - - /* Remove from ipif_saved_ire_mp list if it is there */ - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; - mpp = &(*mpp)->b_cont) { - /* - * On a given ipif, the triple of address, gateway and - * mask is unique for each saved IRE (in the case of - * ordinary interface routes, the gateway address is - * all-zeroes). 
- */ - mp = *mpp; - ifrt = (ifrt_t *)mp->b_rptr; - if (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, - &ire->ire_addr_v6) && - IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, - &gw_addr_v6) && - IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, - &ire->ire_mask_v6)) { - *mpp = mp->b_cont; - ipif->ipif_saved_ire_cnt--; - freeb(mp); - break; - } - } - mutex_exit(&ipif->ipif_saved_ire_lock); - } + ill = ire->ire_ill; + if (ill != NULL) + ill_remove_saved_ire(ill, ire); ire_delete(ire); ire_refrele(ire); return (err); @@ -1197,7 +1090,6 @@ ipif_set6to4addr(ipif_t *ipif) (void) ip_plen_to_mask_v6(16, &ipif->ipif_v6net_mask); bcopy(ill->ill_phys_addr, &v4phys, sizeof (struct in_addr)); IN6_V4ADDR_TO_6TO4(&v4phys, &ipif->ipif_v6lcl_addr); - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); } @@ -1260,11 +1152,6 @@ ipif_setlinklocal(ipif_t *ipif) ipif->ipif_v6subnet); } - if (ipif->ipif_flags & IPIF_NOLOCAL) { - ipif->ipif_v6src_addr = ipv6_all_zeros; - } else { - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; - } } /* @@ -1280,123 +1167,15 @@ ipif_setdestlinklocal(ipif_t *ipif) ASSERT(IAM_WRITER_ILL(ill)); if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_dest_token)) return; + /* Skip if we've already set the pp_dst_addr */ + if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) + return; + ipif_get_linklocal(&ipif->ipif_v6pp_dst_addr, &ill->ill_dest_token); ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; } /* - * This function sets up the multicast mappings in NDP. - * Unlike ARP, there are no mapping_mps here. We delete the - * mapping nces and add a new one. - * - * Returns non-zero on error and 0 on success. 
- */ -int -ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) -{ - ill_t *ill = ipif->ipif_ill; - in6_addr_t v6_mcast_addr = {(uint32_t)V6_MCAST, 0, 0, 0}; - in6_addr_t v6_mcast_mask = {(uint32_t)V6_MCAST, 0, 0, 0}; - in6_addr_t v6_extract_mask; - uchar_t *phys_addr, *bphys_addr, *alloc_phys; - nce_t *mnce = NULL; - int err = 0; - phyint_t *phyi = ill->ill_phyint; - uint32_t hw_extract_start; - dl_unitdata_req_t *dlur; - ip_stack_t *ipst = ill->ill_ipst; - - if (ret_nce != NULL) - *ret_nce = NULL; - - if (ipif->ipif_flags & IPIF_POINTOPOINT) - return (0); - - /* - * IPMP meta-interfaces don't have any inherent multicast mappings, - * and instead use the ones on the underlying interfaces. - */ - if (IS_IPMP(ill)) - return (0); - - /* - * Delete the mapping nce. Normally these should not exist - * as a previous ipif_down -> ipif_ndp_down should have deleted - * all the nces. But they can exist if ip_rput_dlpi_writer - * calls this when PHYI_MULTI_BCAST is set. Mappings are always - * tied to the underlying ill, so don't match across the illgrp. - */ - mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - mnce = NULL; - } - - /* - * Get media specific v6 mapping information. Note that - * nd_lla_len can be 0 for tunnels. - */ - alloc_phys = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); - if ((alloc_phys == NULL) && (ill->ill_nd_lla_len != 0)) - return (ENOMEM); - /* - * Determine the broadcast address. - */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + ill->ill_sap_length; - - /* - * Check PHYI_MULTI_BCAST and possible length of physical - * address to determine if we use the mapping or the - * broadcast address. 
- */ - if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || - (!MEDIA_V6MINFO(ill->ill_media, ill->ill_nd_lla_len, - bphys_addr, alloc_phys, &hw_extract_start, - &v6_extract_mask))) { - if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) { - kmem_free(alloc_phys, ill->ill_nd_lla_len); - return (E2BIG); - } - /* Use the link-layer broadcast address for MULTI_BCAST */ - phys_addr = bphys_addr; - bzero(&v6_extract_mask, sizeof (v6_extract_mask)); - hw_extract_start = ill->ill_nd_lla_len; - } else { - phys_addr = alloc_phys; - } - if ((ipif->ipif_flags & IPIF_BROADCAST) || - (ill->ill_flags & ILLF_MULTICAST) || - (phyi->phyint_flags & PHYI_MULTI_BCAST)) { - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - err = ndp_add_v6(ill, - phys_addr, - &v6_mcast_addr, /* v6 address */ - &v6_mcast_mask, /* v6 mask */ - &v6_extract_mask, - hw_extract_start, - NCE_F_MAPPING | NCE_F_PERMANENT | NCE_F_NONUD, - ND_REACHABLE, - &mnce); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - if (err == 0) { - if (ret_nce != NULL) { - *ret_nce = mnce; - } else { - NCE_REFRELE(mnce); - } - } - } - kmem_free(alloc_phys, ill->ill_nd_lla_len); - return (err); -} - -/* * Get the resolver set up for a new ipif. (Always called as writer.) */ int @@ -1405,50 +1184,28 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) ill_t *ill = ipif->ipif_ill; int err = 0; nce_t *nce = NULL; - nce_t *mnce = NULL; boolean_t added_ipif = B_FALSE; - ASSERT(IAM_WRITER_ILL(ill)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_ndp_up", + ill_t *, ill, ipif_t *, ipif); ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); - /* - * ND not supported on XRESOLV interfaces. If ND support (multicast) - * added later, take out this check. 
- */ - if ((ill->ill_flags & ILLF_XRESOLV) || - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) || + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) || (!(ill->ill_net_type & IRE_INTERFACE))) { ipif->ipif_addr_ready = 1; return (0); } - /* - * Need to setup multicast mapping only when the first - * interface is coming UP. - */ - if (ill->ill_ipif_up_count == 0 && - (ill->ill_flags & ILLF_MULTICAST)) { - /* - * We set the multicast before setting up the mapping for - * local address because ipif_ndp_setup_multicast does - * ndp_walk to delete nces which will delete the mapping - * for local address also if we added the mapping for - * local address first. - */ - err = ipif_ndp_setup_multicast(ipif, &mnce); - if (err != 0) - return (err); - } - if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) { uint16_t flags; uint16_t state; - uchar_t *hw_addr = NULL; + uchar_t *hw_addr; ill_t *bound_ill; ipmp_illgrp_t *illg = ill->ill_grp; + uint_t hw_addr_len; - /* Permanent entries don't need NUD */ - flags = NCE_F_PERMANENT | NCE_F_NONUD; + flags = NCE_F_MYADDR | NCE_F_NONUD | NCE_F_PUBLISH | + NCE_F_AUTHORITY; if (ill->ill_flags & ILLF_ROUTER) flags |= NCE_F_ISROUTER; @@ -1483,10 +1240,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) added_ipif = B_TRUE; } hw_addr = bound_ill->ill_nd_lla; + hw_addr_len = bound_ill->ill_phys_addr_length; } else { bound_ill = ill; - if (ill->ill_net_type == IRE_IF_RESOLVER) + if (ill->ill_net_type == IRE_IF_RESOLVER) { hw_addr = ill->ill_nd_lla; + hw_addr_len = ill->ill_phys_addr_length; + } else { + hw_addr = NULL; + hw_addr_len = 0; + } } /* @@ -1496,28 +1259,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) * unsolicited advertisements to inform others. */ if (initial || !ipif->ipif_addr_ready) { + /* Causes Duplicate Address Detection to run */ state = ND_PROBE; } else { state = ND_REACHABLE; flags |= NCE_F_UNSOL_ADV; } + retry: - /* - * Create an nce for the local address. 
We pass a match_illgrp - * of B_TRUE because the local address must be unique across - * the illgrp, and the existence of an nce with nce_ill set - * to any ill in the group is indicative of a duplicate address - */ - err = ndp_lookup_then_add_v6(bound_ill, - B_TRUE, - hw_addr, - &ipif->ipif_v6lcl_addr, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - flags, - state, - &nce); + err = nce_lookup_then_add_v6(ill, hw_addr, hw_addr_len, + &ipif->ipif_v6lcl_addr, flags, state, &nce); switch (err) { case 0: ip1dbg(("ipif_ndp_up: NCE created for %s\n", @@ -1535,14 +1286,21 @@ retry: case EEXIST: ip1dbg(("ipif_ndp_up: NCE already exists for %s\n", ill->ill_name)); - if (!(nce->nce_flags & NCE_F_PERMANENT)) { - ndp_delete(nce); - NCE_REFRELE(nce); + if (!NCE_MYADDR(nce->nce_common)) { + /* + * A leftover nce from before this address + * existed + */ + ncec_delete(nce->nce_common); + nce_refrele(nce); nce = NULL; goto retry; } if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { - NCE_REFRELE(nce); + nce_refrele(nce); + nce = NULL; + ip1dbg(("ipif_ndp_up: NCE already exists " + "for %s\n", ill->ill_name)); goto fail; } /* @@ -1557,6 +1315,7 @@ retry: ipif->ipif_addr_ready = 1; ipif->ipif_added_nce = 1; nce->nce_ipif_cnt++; + err = 0; break; default: ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n", @@ -1568,15 +1327,9 @@ retry: ipif->ipif_addr_ready = 1; } if (nce != NULL) - NCE_REFRELE(nce); - if (mnce != NULL) - NCE_REFRELE(mnce); + nce_refrele(nce); return (0); fail: - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } if (added_ipif) ipmp_illgrp_del_ipif(ill->ill_grp, ipif); @@ -1587,181 +1340,7 @@ fail: void ipif_ndp_down(ipif_t *ipif) { - nce_t *nce; - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ipif->ipif_isv6) { - if (ipif->ipif_added_nce) { - /* - * For IPMP, `ill' can be the IPMP ill but the NCE will - * always be tied to an underlying IP interface, so we - * match across the illgrp. 
This is safe since we - * ensure uniqueness across the group in ipif_ndp_up(). - */ - nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce != NULL) { - if (--nce->nce_ipif_cnt == 0) - ndp_delete(nce); /* last ipif for nce */ - NCE_REFRELE(nce); - } - ipif->ipif_added_nce = 0; - } - - /* - * Make IPMP aware of the deleted data address. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); - } - - /* - * Remove mapping and all other nces dependent on this ill - * when the last ipif is going away. - */ - if (ill->ill_ipif_up_count == 0) - ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst); -} - -/* - * Used when an interface comes up to recreate any extra routes on this - * interface. - */ -static ire_t ** -ipif_recover_ire_v6(ipif_t *ipif) -{ - mblk_t *mp; - ire_t **ipif_saved_irep; - ire_t **irep; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ip1dbg(("ipif_recover_ire_v6(%s:%u)", ipif->ipif_ill->ill_name, - ipif->ipif_id)); - - ASSERT(ipif->ipif_isv6); - - mutex_enter(&ipif->ipif_saved_ire_lock); - ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * - ipif->ipif_saved_ire_cnt, KM_NOSLEEP); - if (ipif_saved_irep == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - return (NULL); - } - - irep = ipif_saved_irep; - - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { - ire_t *ire; - queue_t *rfq; - queue_t *stq; - ifrt_t *ifrt; - in6_addr_t *src_addr; - in6_addr_t *gateway_addr; - char buf[INET6_ADDRSTRLEN]; - ushort_t type; - - /* - * When the ire was initially created and then added in - * ip_rt_add_v6(), it was created either using - * ipif->ipif_net_type in the case of a traditional interface - * route, or as one of the IRE_OFFSUBNET types (with the - * exception of IRE_HOST type redirect ire which is created by - * icmp_redirect_v6() and which we don't need to save or - * recover). 
In the case where ipif->ipif_net_type was - * IRE_LOOPBACK, ip_rt_add_v6() will update the ire_type to - * IRE_IF_NORESOLVER before calling ire_add_v6() to satisfy - * software like GateD and Sun Cluster which creates routes - * using the the loopback interface's address as a gateway. - * - * As ifrt->ifrt_type reflects the already updated ire_type, - * ire_create_v6() will be called in the same way here as in - * ip_rt_add_v6(), namely using ipif->ipif_net_type when the - * route looks like a traditional interface route (where - * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise - * using the saved ifrt->ifrt_type. This means that in - * the case where ipif->ipif_net_type is IRE_LOOPBACK, - * the ire created by ire_create_v6() will be an IRE_LOOPBACK, - * it will then be turned into an IRE_IF_NORESOLVER and then - * added by ire_add_v6(). - */ - ifrt = (ifrt_t *)mp->b_rptr; - if (ifrt->ifrt_type & IRE_INTERFACE) { - rfq = NULL; - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? &ifrt->ifrt_v6src_addr - : &ipif->ipif_v6src_addr; - gateway_addr = NULL; - type = ipif->ipif_net_type; - } else { - rfq = NULL; - stq = NULL; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? &ifrt->ifrt_v6src_addr : NULL; - gateway_addr = &ifrt->ifrt_v6gateway_addr; - type = ifrt->ifrt_type; - } - - /* - * Create a copy of the IRE with the saved address and netmask. 
- */ - ip1dbg(("ipif_recover_ire_v6: creating IRE %s (%d) for %s/%d\n", - ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, - inet_ntop(AF_INET6, &ifrt->ifrt_v6addr, buf, sizeof (buf)), - ip_mask_to_plen_v6(&ifrt->ifrt_v6mask))); - ire = ire_create_v6( - &ifrt->ifrt_v6addr, - &ifrt->ifrt_v6mask, - src_addr, - gateway_addr, - &ifrt->ifrt_max_frag, - NULL, - rfq, - stq, - type, - ipif, - NULL, - 0, - 0, - ifrt->ifrt_flags, - &ifrt->ifrt_iulp_info, - NULL, - NULL, - ipst); - if (ire == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - kmem_free(ipif_saved_irep, - ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); - return (NULL); - } - - /* - * Some software (for example, GateD and Sun Cluster) attempts - * to create (what amount to) IRE_PREFIX routes with the - * loopback address as the gateway. This is primarily done to - * set up prefixes with the RTF_REJECT flag set (for example, - * when generating aggregate routes.) - * - * If the IRE type (as defined by ipif->ipif_net_type) is - * IRE_LOOPBACK, then we map the request into a - * IRE_IF_NORESOLVER. 
- */ - if (ipif->ipif_net_type == IRE_LOOPBACK) - ire->ire_type = IRE_IF_NORESOLVER; - /* - * ire held by ire_add, will be refreled' in ipif_up_done - * towards the end - */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - *irep = ire; - irep++; - ip1dbg(("ipif_recover_ire_v6: added ire %p\n", (void *)ire)); - } - mutex_exit(&ipif->ipif_saved_ire_lock); - return (ipif_saved_irep); + ipif_nce_down(ipif); } /* @@ -1826,8 +1405,7 @@ ip_common_prefix_v6(const in6_addr_t *a1, const in6_addr_t *a2) #define IPIF_VALID_IPV6_SOURCE(ipif) \ (((ipif)->ipif_flags & IPIF_UP) && \ - !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) && \ - (ipif)->ipif_addr_ready) + !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))) /* source address candidate */ typedef struct candidate { @@ -2195,13 +1773,6 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, static rule_res_t rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst) { - /* - * For IPMP, we always want to choose a random source address from - * among any equally usable addresses, so always report a tie. - */ - if (IS_IPMP(dstinfo->dst_ill)) - return (CAND_TIE); - if (!bc->cand_common_pref_set) { bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr, dstinfo->dst_addr); @@ -2252,14 +1823,15 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, * * src_prefs is the caller's set of source address preferences. If source * address selection is being called to determine the source address of a - * connected socket (from ip_bind_connected_v6()), then the preferences are - * taken from conn_src_preferences. These preferences can be set on a + * connected socket (from ip_set_destination_v6()), then the preferences are + * taken from conn_ixa->ixa_src_preferences. These preferences can be set on a * per-socket basis using the IPV6_SRC_PREFERENCES socket option. The only * preference currently implemented is for rfc3041 temporary addresses. 
*/ ipif_t * ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, - boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) + boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid, + boolean_t allow_usesrc, boolean_t *notreadyp) { dstinfo_t dstinfo; char dstr[INET6_ADDRSTRLEN]; @@ -2306,10 +1878,10 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, * usesrc ifindex. This has higher precedence since it is * finer grained (i.e per interface) v/s being system wide. */ - if (dstill->ill_usesrc_ifindex != 0) { + if (dstill->ill_usesrc_ifindex != 0 && allow_usesrc) { if ((usesrc_ill = ill_lookup_on_ifindex(dstill->ill_usesrc_ifindex, B_TRUE, - NULL, NULL, NULL, NULL, ipst)) != NULL) { + ipst)) != NULL) { dstinfo.dst_ill = usesrc_ill; } else { return (NULL); @@ -2412,6 +1984,12 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, if (!IPIF_VALID_IPV6_SOURCE(ipif)) continue; + if (!ipif->ipif_addr_ready) { + if (notreadyp != NULL) + *notreadyp = B_TRUE; + continue; + } + if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) @@ -2505,7 +2083,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, if (IS_IPMP(ill) && ipif != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); next_ipif = ipif->ipif_next; - if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) ill->ill_src_ipif = next_ipif; else ill->ill_src_ipif = NULL; @@ -2541,7 +2119,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } mutex_enter(&ipif->ipif_ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ipif->ipif_ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); @@ -2556,187 +2134,72 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } /* - * If old_ipif is not NULL, see if ipif was derived from old - * ipif and if so, recreate the interface route by re-doing - * source address selection. 
This happens when ipif_down -> - * ipif_update_other_ipifs calls us. + * Pick a source address based on the destination ill and an optional setsrc + * address. + * The result is stored in srcp. If generation is set, then put the source + * generation number there before we look for the source address (to avoid + * missing changes in the set of source addresses. + * If flagsp is set, then us it to pass back ipif_flags. + * + * If the caller wants to cache the returned source address and detect when + * that might be stale, the caller should pass in a generation argument, + * which the caller can later compare against ips_src_generation + * + * The precedence order for selecting an IPv6 source address is: + * - RTF_SETSRC on the first ire in the recursive lookup always wins. + * - If usrsrc is set, swap the ill to be the usesrc one. + * - If IPMP is used on the ill, select a random address from the most + * preferred ones below: + * That is followed by the long list of IPv6 source address selection rules + * starting with rule_isdst(), rule_scope(), etc. * - * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when ipif_up_done_v6 calls us. + * We have lower preference for ALL_ZONES IP addresses, + * as they pose problems with unlabeled destinations. + * + * Note that when multiple IP addresses match e.g., with rule_scope() we pick + * the first one if IPMP is not in use. With IPMP we randomize. 
*/ -void -ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) +int +ip_select_source_v6(ill_t *ill, const in6_addr_t *setsrc, const in6_addr_t *dst, + zoneid_t zoneid, ip_stack_t *ipst, uint_t restrict_ill, uint32_t src_prefs, + in6_addr_t *srcp, uint32_t *generation, uint64_t *flagsp) { - ire_t *ire; - ire_t *ipif_ire; - queue_t *stq; - ill_t *ill; - ipif_t *nipif = NULL; - boolean_t nipif_refheld = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ill = ipif->ipif_ill; - - if (!(ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - /* - * Can't possibly have borrowed the source - * from old_ipif. - */ - return; - } + ipif_t *ipif; + boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ - /* - * Is there any work to be done? No work if the address - * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( - * ipif_select_source_v6() does not borrow addresses from - * NOLOCAL and ANYCAST interfaces). - */ - if ((old_ipif != NULL) && - ((IN6_IS_ADDR_UNSPECIFIED(&old_ipif->ipif_v6lcl_addr)) || - (old_ipif->ipif_ill->ill_wq == NULL) || - (old_ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST)))) { - return; - } + if (flagsp != NULL) + *flagsp = 0; /* - * Perform the same checks as when creating the - * IRE_INTERFACE in ipif_up_done_v6. + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. */ - if (!(ipif->ipif_flags & IPIF_UP)) - return; - - if ((ipif->ipif_flags & IPIF_NOXMIT)) - return; - - if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) - return; - - /* - * We know that ipif uses some other source for its - * IRE_INTERFACE. 
Is it using the source of this - * old_ipif? - */ - ipif_ire = ipif_to_ire_v6(ipif); - if (ipif_ire == NULL) - return; - - if (old_ipif != NULL && - !IN6_ARE_ADDR_EQUAL(&old_ipif->ipif_v6lcl_addr, - &ipif_ire->ire_src_addr_v6)) { - ire_refrele(ipif_ire); - return; - } - - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ipif_recreate_interface_routes_v6: deleting IRE" - " for src %s\n", AF_INET6, &ipif_ire->ire_src_addr_v6); - } - - stq = ipif_ire->ire_stq; - - /* - * Can't use our source address. Select a different source address - * for the IRE_INTERFACE. We restrict interface route source - * address selection to ipif's assigned to the same link as the - * interface. - */ - if (ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet, - B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); - } - if (nipif == NULL) { - /* Last resort - all ipif's have IPIF_NOLOCAL */ - nipif = ipif; - } else { - nipif_refheld = B_TRUE; + if (generation != NULL) { + *generation = ipst->ips_src_generation; } - ire = ire_create_v6( - &ipif->ipif_v6subnet, /* dest pref */ - &ipif->ipif_v6net_mask, /* mask */ - &nipif->ipif_v6src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ - ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - NULL, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire != NULL) { - ire_t *ret_ire; - int error; - - /* - * We don't need ipif_ire anymore. We need to delete - * before we add so that ire_add does not detect - * duplicates. - */ - ire_delete(ipif_ire); - ret_ire = ire; - error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(ret_ire == ire); - if (ret_ire != NULL) { - /* Held in ire_add */ - ire_refrele(ret_ire); - } + /* Was RTF_SETSRC set on the first IRE in the recursive lookup? 
*/ + if (setsrc != NULL && !IN6_IS_ADDR_UNSPECIFIED(setsrc)) { + *srcp = *setsrc; + return (0); } - /* - * Either we are falling through from above or could not - * allocate a replacement. - */ - ire_refrele(ipif_ire); - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (nipif_refheld) - ipif_refrele(nipif); -} - -/* - * This old_ipif is going away. - * - * Determine if any other ipif's are using our address as - * ipif_v6lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or - * IPIF_DEPRECATED). - * Find the IRE_INTERFACE for such ipif's and recreate them - * to use an different source address following the rules in - * ipif_up_done_v6. - */ -void -ipif_update_other_ipifs_v6(ipif_t *old_ipif) -{ - ipif_t *ipif; - ill_t *ill; - char buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_IPIF(old_ipif)); - - ill = old_ipif->ipif_ill; - - ip1dbg(("ipif_update_other_ipifs_v6(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr, - buf, sizeof (buf)))); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif != old_ipif) - ipif_recreate_interface_routes_v6(old_ipif, ipif); + ipif = ipif_select_source_v6(ill, dst, restrict_ill, src_prefs, zoneid, + B_TRUE, ¬ready); + if (ipif == NULL) { + if (notready) + return (ENETDOWN); + else + return (EADDRNOTAVAIL); } + *srcp = ipif->ipif_v6lcl_addr; + if (flagsp != NULL) + *flagsp = ipif->ipif_flags; + ipif_refrele(ipif); + return (0); } /* @@ -2744,11 +2207,10 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif) * the physical device. * q and mp represents an ioctl which will be queued waiting for * completion of the DLPI message exchange. - * MUST be called on an ill queue. Can not set conn_pending_ill for that - * reason thus the DL_PHYS_ADDR_ACK code does not assume ill_pending_q. + * MUST be called on an ill queue. * - * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. 
+ * Returns EINPROGRESS when mp has been consumed by queueing it. + * The ioctl will complete in ip_rput. */ int ill_dl_phys(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) @@ -2888,6 +2350,7 @@ bad: return (ENOMEM); } +/* Add room for tcp+ip headers */ uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20; /* @@ -2899,28 +2362,14 @@ uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20; int ipif_up_done_v6(ipif_t *ipif) { - ire_t *ire_array[20]; - ire_t **irep = ire_array; - ire_t **irep1; ill_t *ill = ipif->ipif_ill; - queue_t *stq; - in6_addr_t v6addr; - in6_addr_t route_mask; - ipif_t *src_ipif = NULL; - ipif_t *tmp_ipif; - boolean_t flush_ire_cache = B_TRUE; int err; - char buf[INET6_ADDRSTRLEN]; - ire_t **ipif_saved_irep = NULL; - int ipif_saved_ire_cnt; - int cnt; - boolean_t src_ipif_held = B_FALSE; boolean_t loopback = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_up_done_v6(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done_v6", + ill_t *, ill, ipif_t *, ipif); /* Check if this is a loopback interface */ if (ipif->ipif_ill->ill_wq == NULL) @@ -2929,46 +2378,10 @@ ipif_up_done_v6(ipif_t *ipif) ASSERT(ipif->ipif_isv6); ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* - * If all other interfaces for this ill are down or DEPRECATED, - * or otherwise unsuitable for source address selection, remove - * any IRE_CACHE entries for this ill to make sure source - * address selection gets to take this new ipif into account. 
- * No need to hold ill_lock while traversing the ipif list since - * we are writer - */ - for (tmp_ipif = ill->ill_ipif; tmp_ipif; - tmp_ipif = tmp_ipif->ipif_next) { - if (((tmp_ipif->ipif_flags & - (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || - !(tmp_ipif->ipif_flags & IPIF_UP)) || - (tmp_ipif == ipif)) - continue; - /* first useable pre-existing interface */ - flush_ire_cache = B_FALSE; - break; - } - if (flush_ire_cache) - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, ill, ill); + if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { + nce_t *loop_nce = NULL; + uint16_t flags = (NCE_F_MYADDR | NCE_F_NONUD | NCE_F_AUTHORITY); - /* - * Figure out which way the send-to queue should go. Only - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER should show up here. - */ - switch (ill->ill_net_type) { - case IRE_IF_RESOLVER: - stq = ill->ill_rq; - break; - case IRE_IF_NORESOLVER: - case IRE_LOOPBACK: - stq = ill->ill_wq; - break; - default: - return (EINVAL); - } - - if (IS_LOOPBACK(ill)) { /* * lo0:1 and subsequent ipifs were marked IRE_LOCAL in * ipif_lookup_on_name(), but in the case of zones we can have @@ -2979,29 +2392,99 @@ ipif_up_done_v6(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOOPBACK; else ipif->ipif_ire_type = IRE_LOCAL; + if (ill->ill_net_type != IRE_LOOPBACK) + flags |= NCE_F_PUBLISH; + err = nce_lookup_then_add_v6(ill, NULL, + ill->ill_phys_addr_length, + &ipif->ipif_v6lcl_addr, flags, ND_REACHABLE, &loop_nce); + + /* A shared-IP zone sees EEXIST for lo0:N */ + if (err == 0 || err == EEXIST) { + ipif->ipif_added_nce = 1; + loop_nce->nce_ipif_cnt++; + nce_refrele(loop_nce); + err = 0; + } else { + ASSERT(loop_nce == NULL); + return (err); + } } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || - ((ipif->ipif_flags & IPIF_DEPRECATED) && - !(ipif->ipif_flags & IPIF_NOFAILOVER))) { + err = ipif_add_ires_v6(ipif, loopback); + if (err != 0) { /* - * Can't use our source address. 
Select a different - * source address for the IRE_INTERFACE and IRE_LOCAL + * See comments about return value from + * ipif_addr_availability_check() in ipif_add_ires_v6(). */ - if (ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(ipif->ipif_ill, - &ipif->ipif_v6subnet, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); + if (err != EADDRINUSE) { + ipif_ndp_down(ipif); + } else { + /* + * Make IPMP aware of the deleted ipif so that + * the needed ipmp cleanup (e.g., of ipif_bound_ill) + * can be completed. Note that we do not want to + * destroy the nce that was created on the ipmp_ill + * for the active copy of the duplicate address in + * use. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + err = EADDRNOTAVAIL; } - if (src_ipif == NULL) - src_ipif = ipif; /* Last resort */ - else - src_ipif_held = B_TRUE; - } else { - src_ipif = ipif; + return (err); + } + + if (ill->ill_ipif_up_count == 1 && !loopback) { + /* Recover any additional IREs entries for this ill */ + (void) ill_recover_saved_ire(ill); } + if (ill->ill_need_recover_multicast) { + /* + * Need to recover all multicast memberships in the driver. + * This had to be deferred until we had attached. + */ + ill_recover_multicast(ill); + } + + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + + /* Join the allhosts multicast address and the solicited node MC */ + ipif_multicast_up(ipif); + + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); + + if (ipif->ipif_addr_ready) + ipif_up_notify(ipif); + + return (0); +} + +/* + * Add the IREs associated with the ipif. + * Those MUST be explicitly removed in ipif_delete_ires_v6. 
+ */ +static int +ipif_add_ires_v6(ipif_t *ipif, boolean_t loopback) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *ire_array[20]; + ire_t **irep = ire_array; + ire_t **irep1; + in6_addr_t v6addr; + in6_addr_t route_mask; + int err; + char buf[INET6_ADDRSTRLEN]; + ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ + if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && !(ipif->ipif_flags & IPIF_NOLOCAL)) { @@ -3024,45 +2507,38 @@ ipif_up_done_v6(ipif_t *ipif) err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); if (err != 0) { - ip0dbg(("ipif_up_done_v6: srcid_insert %d\n", err)); - if (src_ipif_held) - ipif_refrele(src_ipif); - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); + ip0dbg(("ipif_add_ires_v6: srcid_insert %d\n", err)); return (err); } /* * If the interface address is set, create the LOCAL * or LOOPBACK IRE. */ - ip1dbg(("ipif_up_done_v6: creating IRE %d for %s\n", + ip1dbg(("ipif_add_ires_v6: creating IRE %d for %s\n", ipif->ipif_ire_type, inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, buf, sizeof (buf)))); - *irep++ = ire_create_v6( + ire_local = ire_create_v6( &ipif->ipif_v6lcl_addr, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ NULL, /* no gateway */ - &ip_loopback_mtu_v6plus, /* max frag size */ - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ - ipif, /* interface */ - NULL, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, /* interface */ + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); + if (ire_local == NULL) { + ip1dbg(("ipif_up_done_v6: NULL ire_local\n")); + err = ENOMEM; + goto bad; + } } /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ - if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) { /* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */ @@ -3074,27 +2550,19 @@ ipif_up_done_v6(ipif_t *ipif) route_mask = ipif->ipif_v6net_mask; } - ip1dbg(("ipif_up_done_v6: creating if IRE %d for %s\n", + ip1dbg(("ipif_add_ires_v6: creating if IRE %d for %s\n", ill->ill_net_type, inet_ntop(AF_INET6, &v6addr, buf, sizeof (buf)))); *irep++ = ire_create_v6( &v6addr, /* dest pref */ &route_mask, /* mask */ - &src_ipif->ipif_v6src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ + &ipif->ipif_v6lcl_addr, /* gateway */ ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - NULL, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); } @@ -3103,15 +2571,13 @@ ipif_up_done_v6(ipif_t *ipif) for (irep1 = irep; irep1 > ire_array; ) { irep1--; if (*irep1 == NULL) { - ip1dbg(("ipif_up_done_v6: NULL ire found in" + ip1dbg(("ipif_add_ires_v6: NULL ire found in" " ire_array\n")); err = ENOMEM; goto bad; } } - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* * Need to atomically check for IP address availability under * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new @@ -3132,20 +2598,12 @@ ipif_up_done_v6(ipif_t *ipif) * the other ipif. So we don't want to delete it (otherwise the * other ipif would be unable to send packets). * ip_addr_availability_check() identifies this case for us and - * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL + * returns EADDRINUSE; Caller must turn it into EADDRNOTAVAIL * which is the expected error code. 
* - * Note that, for the non-XRESOLV case, ipif_ndp_down() will - * only delete the nce in the case when the nce_ipif_cnt drops - * to 0. + * Note that ipif_ndp_down() will only delete the nce in the + * case when the nce_ipif_cnt drops to 0. */ - if (err == EADDRINUSE) { - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) { - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - } - err = EADDRNOTAVAIL; - } ill->ill_ipif_up_count--; ipif->ipif_flags &= ~IPIF_UP; goto bad; @@ -3153,91 +2611,42 @@ ipif_up_done_v6(ipif_t *ipif) /* * Add in all newly created IREs. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ + if (ire_local != NULL) { + ire_local = ire_add(ire_local); +#ifdef DEBUG + if (ire_local != NULL) { + ire_refhold_notr(ire_local); + ire_refrele(ire_local); + } +#endif + } + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ire_local != NULL) + ipif->ipif_ire_local = ire_local; + rw_exit(&ipst->ips_ill_g_lock); + ire_local = NULL; + for (irep1 = irep; irep1 > ire_array; ) { irep1--; /* Shouldn't be adding any bcast ire's */ ASSERT((*irep1)->ire_type != IRE_BROADCAST); ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* - * refheld by ire_add. refele towards the end of the func - */ - (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ - ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; - ipif_saved_irep = ipif_recover_ire_v6(ipif); - - if (ill->ill_need_recover_multicast) { - /* - * Need to recover all multicast memberships in the driver. - * This had to be deferred until we had attached. - */ - ill_recover_multicast(ill); - } - - if (ill->ill_ipif_up_count == 1) { - /* - * Since the interface is now up, it may now be active. 
- */ - if (IS_UNDER_IPMP(ill)) - ipmp_ill_refresh_active(ill); - } - - /* Join the allhosts multicast address and the solicited node MC */ - ipif_multicast_up(ipif); - - /* - * See if anybody else would benefit from our new ipif. - */ - if (!loopback && - !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - ill_update_source_selection(ill); - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; + /* refheld by ire_add */ + *irep1 = ire_add(*irep1); if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - cnt = ipif_saved_ire_cnt; - for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { - if (*irep1 != NULL) { - /* was held in ire_add */ ire_refrele(*irep1); + *irep1 = NULL; } } if (ipif->ipif_addr_ready) ipif_up_notify(ipif); - - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - - if (src_ipif_held) - ipif_refrele(src_ipif); - return (0); bad: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - + if (ire_local != NULL) + ire_delete(ire_local); while (irep > ire_array) { irep--; if (*irep != NULL) @@ -3245,21 +2654,85 @@ bad: } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); + return (err); +} + +/* Remove all the IREs created by ipif_add_ires_v6 */ +void +ipif_delete_ires_v6(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t v6addr; + in6_addr_t route_mask; + ire_t *ire; + int match_args; + boolean_t loopback; + + /* Check if this is a loopback interface */ + loopback = (ipif->ipif_ill->ill_wq == NULL); + + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK | + MATCH_IRE_ZONEONLY; + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if ((ire = ipif->ipif_ire_local) != NULL) { + ipif->ipif_ire_local = NULL; + rw_exit(&ipst->ips_ill_g_lock); + /* + * Move count to ipif so 
we don't loose the count due to + * a down/up dance. + */ + atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); + + ire_delete(ire); + ire_refrele_notr(ire); + } else { + rw_exit(&ipst->ips_ill_g_lock); } - if (src_ipif_held) - ipif_refrele(src_ipif); - ipif_ndp_down(ipif); - ipif_resolver_down(ipif); + match_args |= MATCH_IRE_GW; - return (err); + /* + * Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. + * Note that atun interfaces have an all-zero ipif_v6subnet. + * Thus we allow a zero subnet as long as the mask is non-zero. + */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; + + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && + !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && + IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) { + /* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */ + v6addr = ipif->ipif_v6subnet; + + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + route_mask = ipv6_all_ones; + } else { + route_mask = ipif->ipif_v6net_mask; + } + + ire = ire_ftable_lookup_v6( + &v6addr, /* dest pref */ + &route_mask, /* mask */ + &ipif->ipif_v6lcl_addr, /* gateway */ + ill->ill_net_type, /* IF_[NO]RESOLVER */ + ipif->ipif_ill, + ipif->ipif_zoneid, + NULL, + match_args, + 0, + ipst, + NULL); + ASSERT(ire != NULL); + ire_delete(ire); + ire_refrele(ire); + } } /* - * Delete an ND entry and the corresponding IRE_CACHE entry if it exists. + * Delete an ND entry if it exists. 
*/ /* ARGSUSED */ int @@ -3267,11 +2740,10 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { sin6_t *sin6; - nce_t *nce; struct lifreq *lifr; lif_nd_req_t *lnr; ill_t *ill = ipif->ipif_ill; - ire_t *ire; + nce_t *nce; lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; @@ -3289,29 +2761,27 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * Since ND mappings must be consistent across an IPMP group, prohibit - * deleting ND mappings on underlying interfaces. Also, since ND - * mappings for IPMP data addresses are owned by IP itself, prohibit - * deleting them. + * deleting ND mappings on underlying interfaces. + * Don't allow deletion of mappings for local addresses. */ if (IS_UNDER_IPMP(ill)) return (EPERM); - if (IS_IPMP(ill)) { - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, - ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, - ill->ill_ipst); - if (ire != NULL) { - ire_refrele(ire); - return (EPERM); - } - } - - /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ - nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE); + nce = nce_lookup_v6(ill, &sin6->sin6_addr); if (nce == NULL) return (ESRCH); - ndp_delete(nce); - NCE_REFRELE(nce); + + if (NCE_MYADDR(nce->nce_common)) { + nce_refrele(nce); + return (EPERM); + } + + /* + * delete the nce_common which will also delete the nces on any + * under_ill in the case of ipmp. 
+ */ + ncec_delete(nce->nce_common); + nce_refrele(nce); return (0); } @@ -3383,9 +2853,9 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, return (EPERM); if (IS_IPMP(ill)) { - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, - ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, - ill->ill_ipst); + ire = ire_ftable_lookup_v6(&sin6->sin6_addr, NULL, NULL, + IRE_LOCAL, ill, ALL_ZONES, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ILL, 0, ill->ill_ipst, NULL); if (ire != NULL) { ire_refrele(ire); return (EPERM); diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c new file mode 100644 index 0000000000..cee5344bf6 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip6_input.c @@ -0,0 +1,2749 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/stropts.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strlog.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/modctl.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/priv.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/vtrace.h> +#include <sys/isa_defs.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/optcom.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <inet/ilb_ip.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <sys/ethernet.h> +#include <net/if_types.h> +#include <sys/cpuvar.h> + +#include <ipp/ipp.h> +#include <ipp/ipp_impl.h> +#include <ipp/ipgpc/ipgpc.h> + 
+#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#include <rpc/pmap_prot.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static void ip_input_local_v6(ire_t *, mblk_t *, ip6_t *, ip_recv_attr_t *); + +static void ip_input_multicast_v6(ire_t *, mblk_t *, ip6_t *, + ip_recv_attr_t *); + +#pragma inline(ip_input_common_v6, ip_input_local_v6, ip_forward_xmit_v6) + +/* + * Direct read side procedure capable of dealing with chains. GLDv3 based + * drivers call this function directly with mblk chains while STREAMS + * read side procedure ip_rput() calls this for single packet with ip_ring + * set to NULL to process one packet at a time. + * + * The ill will always be valid if this function is called directly from + * the driver. + * + * If ip_input_v6() is called from GLDv3: + * + * - This must be a non-VLAN IP stream. + * - 'mp' is either an untagged or a special priority-tagged packet. + * - Any VLAN tag that was in the MAC header has been stripped. + * + * If the IP header in packet is not 32-bit aligned, every message in the + * chain will be aligned before further operations. This is required on SPARC + * platform. + */ +void +ip_input_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip) +{ + (void) ip_input_common_v6(ill, ip_ring, mp_chain, mhip, NULL, NULL, + NULL); +} + +/* + * ip_accept_tcp_v6() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. 
+ * + * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v6 + * will pass back any TCP packets matching the target sqp to + * ip_input_common_v6 using ira_target_sqp_mp. Other packets are handled by + * ip_input_v6 and ip_fanout_v6 as normal. + * The TCP packets that match the target squeue are returned to the caller + * as a b_next chain after each packet has been prepend with an mblk + * from ip_recv_attr_to_mblk. + */ +mblk_t * +ip_accept_tcp_v6(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + return (ip_input_common_v6(ill, ip_ring, mp_chain, NULL, target_sqp, + last, cnt)); +} + +/* + * Used by ip_input_v6 and ip_accept_tcp_v6 + * The last three arguments are only used by ip_accept_tcp_v6, and mhip is + * only used by ip_input_v6. + */ +mblk_t * +ip_input_common_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip, squeue_t *target_sqp, + mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ip6_t *ip6h; + ip_recv_attr_t iras; /* Receive attributes */ + rtc_t rtc; + iaflags_t chain_flags = 0; /* Fixed for chain */ + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + + ASSERT(mp_chain != NULL); + ASSERT(ill != NULL); + + /* These ones do not change as we loop over packets */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_sqp = NULL; + iras.ira_ring = ip_ring; + /* For ECMP and outbound transmit ring selection */ + iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); + + iras.ira_target_sqp = target_sqp; + iras.ira_target_sqp_mp = NULL; + if (target_sqp != NULL) + chain_flags |= IRAF_TARGET_SQP; + + /* + * We try to have a mhip pointer when possible, but + * it might be NULL in some cases. In those cases we + * have to assume unicast. 
+ */ + iras.ira_mhip = mhip; + iras.ira_flags = 0; + if (mhip != NULL) { + switch (mhip->mhi_dsttype) { + case MAC_ADDRTYPE_MULTICAST : + chain_flags |= IRAF_L2DST_MULTICAST; + break; + case MAC_ADDRTYPE_BROADCAST : + chain_flags |= IRAF_L2DST_BROADCAST; + break; + } + } + + /* + * Initialize the one-element route cache. + * + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_route_recursive for each of the succeeding + * packets in a packet chain. + */ + rtc.rtc_ire = NULL; + rtc.rtc_ip6addr = ipv6_all_zeros; + + /* Loop over b_next */ + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * if db_ref > 1 then copymsg and free original. Packet + * may be changed and we do not want the other entity + * who has a reference to this message to trip over the + * changes. This is a blind change because trying to + * catch all places that might change the packet is too + * difficult. + * + * This corresponds to the fast path case, where we have + * a chain of M_DATA mblks. We check the db_ref count + * of only the 1st data block in the mblk chain. There + * doesn't seem to be a reason why a device driver would + * send up data with varying db_ref counts in the mblk + * chain. In any case the Fast path is a private + * interface, and our drivers don't do such a thing. + * Given the above assumption, there is no need to walk + * down the entire mblk chain (which could have a + * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. 
So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). + */ + if (DB_REF(mp) > 1) { + if ((mp = ip_fix_dbref(mp, &iras)) == NULL) + continue; + } + + /* + * IP header ptr not aligned? + * OR IP header not complete in first mblk + */ + ip6h = (ip6_t *)mp->b_rptr; + if (!OK_32PTR(ip6h) || MBLKL(mp) < IPV6_HDR_LEN) { + mp = ip_check_and_align_header(mp, IPV6_HDR_LEN, &iras); + if (mp == NULL) + continue; + ip6h = (ip6_t *)mp->b_rptr; + } + + /* Protect against a mix of Ethertypes and IP versions */ + if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Check for Martian addrs; we have to explicitly + * test for for zero dst since this is also used as + * an indication that the rtc is not used. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_dst)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + /* + * Keep L2SRC from a previous packet in chain since mhip + * might point into an earlier packet in the chain. + */ + chain_flags |= (iras.ira_flags & IRAF_L2SRC_SET); + + iras.ira_flags = IRAF_VERIFY_ULP_CKSUM | chain_flags; + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ + + /* + * We must count all incoming packets, even if they end + * up being dropped later on. Defer counting bytes until + * we have the whole IP header in first mblk. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + iras.ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + iras.ira_pktlen); + + /* + * Call one of: + * ill_input_full_v6 + * ill_input_short_v6 + * The former is used in the case of TX. See ill_set_inputfn(). + */ + (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); + + /* Any references to clean up? No hold on ira_ill */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); + + if (iras.ira_target_sqp_mp != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + + /* Found one packet to accept */ + mp = iras.ira_target_sqp_mp; + iras.ira_target_sqp_mp = NULL; + ASSERT(ip_recv_attr_is_mblk(mp)); + + if (atail != NULL) + atail->b_next = mp; + else + ahead = mp; + atail = mp; + acnt++; + mp = NULL; + } + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + } + /* Any remaining references to the route cache? */ + if (rtc.rtc_ire != NULL) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); + ire_refrele(rtc.rtc_ire); + } + + if (ahead != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); +} + +/* + * This input function is used when + * - is_system_labeled() + * + * Note that for IPv6 CGTP filtering is handled only when receiving fragment + * headers, and RSVP uses router alert options, thus we don't need anything + * extra for them. 
+ */ +void +ill_input_full_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + in6_addr_t *nexthop = (in6_addr_t *)nexthop_arg; + ill_t *ill = ira->ira_ill; + + ASSERT(ira->ira_tsl == NULL); + + /* + * Attach any necessary label information to + * this packet + */ + if (is_system_labeled()) { + ira->ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. + */ + if (!tsol_get_pkt_label(mp, IPV6_VERSION, ira)) { + if (ip6opt_ls != 0) + ip0dbg(("tsol_get_pkt_label v6 failed\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* Note that ira_tsl can be NULL here. */ + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ip6h = (ip6_t *)mp->b_rptr; + } + ill_input_short_v6(mp, ip6h, nexthop, ira, rtc); +} + +/* + * Check for IPv6 addresses that should not appear on the wire + * as either source or destination. + * If we ever implement Stateless IPv6 Translators (SIIT) we'd have + * to revisit the IPv4-mapped part. + */ +static boolean_t +ip6_bad_address(in6_addr_t *addr, boolean_t is_src) +{ + if (IN6_IS_ADDR_V4MAPPED(addr)) { + ip1dbg(("ip_input_v6: pkt with IPv4-mapped addr")); + return (B_TRUE); + } + if (IN6_IS_ADDR_LOOPBACK(addr)) { + ip1dbg(("ip_input_v6: pkt with loopback addr")); + return (B_TRUE); + } + if (!is_src && IN6_IS_ADDR_UNSPECIFIED(addr)) { + /* + * having :: in the src is ok: it's used for DAD. + */ + ip1dbg(("ip_input_v6: pkt with unspecified addr")); + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Routing lookup for IPv6 link-locals. + * First we look on the inbound interface, then we check for IPMP and + * look on the upper interface. + * We update ira_ruifindex if we find the IRE on the upper interface. 
+ */ +static ire_t * +ire_linklocal(const in6_addr_t *nexthop, ill_t *ill, ip_recv_attr_t *ira, + boolean_t allocate, ip_stack_t *ipst) +{ + int match_flags = MATCH_IRE_SECATTR | MATCH_IRE_ILL; + ire_t *ire; + + ASSERT(IN6_IS_ADDR_LINKLOCAL(nexthop)); + ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl, + match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !IS_UNDER_IPMP(ill)) + return (ire); + + /* + * When we are using IMP we need to look for an IRE on both the + * under and upper interfaces since there are different + * link-local addresses for the under and upper. + */ + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ire); + + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + + ire_refrele(ire); + ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl, + match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + ill_refrele(ill); + return (ire); +} + +/* + * This is the tail-end of the full receive side packet handling. + * It can be used directly when the configuration is simple. + */ +void +ill_input_short_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ire_t *ire; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t pkt_len; + ssize_t len; + ip6_t *ip6h = (ip6_t *)iph_arg; + in6_addr_t nexthop = *(in6_addr_t *)nexthop_arg; + ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; +#define rptr ((uchar_t *)ip6h) + + ASSERT(DB_TYPE(mp) == M_DATA); + + /* + * Check for source/dest being a bad address: loopback, any, or + * v4mapped. All of them start with a 64 bits of zero. 
+ */ + if (ip6h->ip6_src.s6_addr32[0] == 0 && + ip6h->ip6_src.s6_addr32[1] == 0) { + if (ip6_bad_address(&ip6h->ip6_src, B_TRUE)) { + ip1dbg(("ip_input_v6: pkt with bad src addr\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + if (ip6h->ip6_dst.s6_addr32[0] == 0 && + ip6h->ip6_dst.s6_addr32[1] == 0) { + if (ip6_bad_address(&ip6h->ip6_dst, B_FALSE)) { + ip1dbg(("ip_input_v6: pkt with bad dst addr\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + + len = mp->b_wptr - rptr; + pkt_len = ira->ira_pktlen; + + /* multiple mblk or too short */ + len -= pkt_len; + if (len != 0) { + mp = ip_check_length(mp, rptr, len, pkt_len, IPV6_HDR_LEN, ira); + if (mp == NULL) + return; + ip6h = (ip6_t *)mp->b_rptr; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 0); + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. 
+ */ + DTRACE_PROBE4(ip6__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_PHYSICAL_IN(ipst)) { + int ll_multicast = 0; + int error; + in6_addr_t orig_dst = ip6h->ip6_dst; + + if (ira->ira_flags & IRAF_L2DST_MULTICAST) + ll_multicast = HPE_MULTICAST; + else if (ira->ira_flags & IRAF_L2DST_BROADCAST) + ll_multicast = HPE_BROADCAST; + + FW_HOOKS6(ipst->ips_ip6_physical_in_event, + ipst->ips_ipv6firewall_physical_in, + ill, NULL, ip6h, mp, mp, ll_multicast, ipst, error); + + DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, mp); + + if (mp == NULL) + return; + + /* The length could have changed */ + ip6h = (ip6_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + pkt_len = ira->ira_pktlen; + + /* + * In case the destination changed we override any previous + * change to nexthop. + */ + if (!IN6_ARE_ADDR_EQUAL(&orig_dst, &ip6h->ip6_dst)) + nexthop = ip6h->ip6_dst; + + if (IN6_IS_ADDR_UNSPECIFIED(&nexthop)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + + } + + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t dzone; + + /* + * On the inbound path the src zone will be unknown as + * this packet has come from the wire. + */ + dzone = ip_get_zoneid_v6(&nexthop, mp, ill, ira, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); + } + + if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) != + IPV6_DEFAULT_VERS_AND_FLOW) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); + ip_drop_input("ipIfStatsInWrongIPVersion", mp, ill); + freemsg(mp); + return; + } + + /* + * For IPv6 we update ira_ip_hdr_length and ira_protocol as + * we parse the headers, starting with the hop-by-hop options header. 
+ */ + ira->ira_ip_hdr_length = IPV6_HDR_LEN; + if ((ira->ira_protocol = ip6h->ip6_nxt) == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbhhdr; + uint_t ehdrlen; + uint8_t *optptr; + + if (pkt_len < IPV6_HDR_LEN + MIN_EHDR_LEN) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + MIN_EHDR_LEN, ira); + if (ip6h == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + hbhhdr = (ip6_hbh_t *)&ip6h[1]; + ehdrlen = 8 * (hbhhdr->ip6h_len + 1); + + if (pkt_len < IPV6_HDR_LEN + ehdrlen) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + hbhhdr = (ip6_hbh_t *)&ip6h[1]; + } + + /* + * Update ira_ip_hdr_length to skip the hop-by-hop header + * once we get to ip_fanout_v6 + */ + ira->ira_ip_hdr_length += ehdrlen; + ira->ira_protocol = hbhhdr->ip6h_nxt; + + optptr = (uint8_t *)&hbhhdr[1]; + switch (ip_process_options_v6(mp, ip6h, optptr, + ehdrlen - 2, IPPROTO_HOPOPTS, ira)) { + case -1: + /* + * Packet has been consumed and any + * needed ICMP messages sent. + */ + return; + case 0: + /* no action needed */ + break; + case 1: + /* + * Known router alert. Make use handle it as local + * by setting the nexthop to be the all-host multicast + * address, and skip multicast membership filter by + * marking as a router alert. 
+ */ + ira->ira_flags |= IRAF_ROUTER_ALERT; + nexthop = ipv6_all_hosts_mcast; + break; + } + } + + /* + * Here we check to see if we machine is setup as + * L3 loadbalancer and if the incoming packet is for a VIP + * + * Check the following: + * - there is at least a rule + * - protocol of the packet is supported + * + * We don't load balance IPv6 link-locals. + */ + if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ira->ira_protocol) && + !IN6_IS_ADDR_LINKLOCAL(&nexthop)) { + in6_addr_t lb_dst; + int lb_ret; + + /* For convenience, we just pull up the mblk. */ + if (mp->b_cont != NULL) { + if (pullupmsg(mp, -1) == 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", + mp, ill); + freemsg(mp); + return; + } + ip6h = (ip6_t *)mp->b_rptr; + } + lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, ira->ira_protocol, + (uint8_t *)ip6h + ira->ira_ip_hdr_length, &lb_dst); + if (lb_ret == ILB_DROPPED) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB_DROPPED", mp, ill); + freemsg(mp); + return; + } + if (lb_ret == ILB_BALANCED) { + /* Set the dst to that of the chosen server */ + nexthop = lb_dst; + DB_CKSUMFLAGS(mp) = 0; + } + } + + /* Can not use route cache with TX since the labels can differ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (IN6_IS_ADDR_MULTICAST(&nexthop)) { + ire = ire_multicast(ill); + } else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) { + ire = ire_linklocal(&nexthop, ill, ira, + (ill->ill_flags & ILLF_ROUTER), ipst); + } else { + /* Match destination and label */ + ire = ire_route_recursive_v6(&nexthop, 0, NULL, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, + ipst, NULL, NULL, NULL); + } + /* Update the route cache so we do the ire_refrele */ + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ip6addr = nexthop; + } else if (IN6_ARE_ADDR_EQUAL(&nexthop, &rtc->rtc_ip6addr)) { + /* 
Use the route cache */
		ASSERT(rtc->rtc_ire != NULL);
		ire = rtc->rtc_ire;
	} else {
		/* Update the route cache */
		if (IN6_IS_ADDR_MULTICAST(&nexthop)) {
			ire = ire_multicast(ill);
		} else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) {
			ire = ire_linklocal(&nexthop, ill, ira,
			    (ill->ill_flags & ILLF_ROUTER), ipst);
		} else {
			ire = ire_route_recursive_dstonly_v6(&nexthop,
			    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
			    ipst);
		}
		ASSERT(ire != NULL);
		if (rtc->rtc_ire != NULL)
			ire_refrele(rtc->rtc_ire);
		rtc->rtc_ire = ire;
		rtc->rtc_ip6addr = nexthop;
	}

	ire->ire_ib_pkt_count++;

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_recv_local_v6 - for IRE_LOCAL
	 *	ire_recv_loopback_v6 - for IRE_LOOPBACK
	 *	ire_recv_multirt_v6 - if RTF_MULTIRT
	 *	ire_recv_noroute_v6 - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_recv_multicast_v6 - for IRE_MULTICAST
	 *	ire_recv_noaccept_v6 - for ire_noaccept ones
	 *	ire_recv_forward_v6 - for the rest.
	 */

	(*ire->ire_recvfn)(ire, mp, ip6h, ira);
}
#undef rptr

/*
 * ire_recvfn for IREs that need forwarding i.e., the packet is not
 * destined to this node. Resolves the outgoing nce and hands the packet
 * to ip_forward_xmit_v6(). Consumes mp on all paths.
 */
void
ire_recv_forward_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	iaflags_t	iraflags = ira->ira_flags;
	ill_t		*dst_ill;
	nce_t		*nce;
	uint32_t	added_tx_len;
	uint32_t	mtu, iremtu;

	/* Never forward a packet that arrived as a link-layer multicast */
	if (iraflags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("l2 multicast not forwarded", mp, ill);
		freemsg(mp);
		return;
	}

	if (!(ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		return;
	}

	/*
	 * Either ire_nce_capable or ire_dep_parent would be set for the IRE
	 * when it is found by ire_route_recursive, but some other thread
	 * could have changed the routes with the effect of clearing
	 * ire_dep_parent. In that case we'd end up dropping the packet, or
	 * finding a new nce below.
	 * Get, allocate, or update the nce.
	 * We get a refhold on ire_nce_cache as a result of this to avoid races
	 * where ire_nce_cache is deleted.
	 *
	 * This ensures that we don't forward if the interface is down since
	 * ipif_down removes all the nces.
	 */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce == NULL) {
		/* Not yet set up - try to set one up */
		mutex_exit(&ire->ire_lock);
		(void) ire_revalidate_nce(ire);
		mutex_enter(&ire->ire_lock);
		nce = ire->ire_nce_cache;
		if (nce == NULL) {
			mutex_exit(&ire->ire_lock);
			/* The ire_dep_parent chain went bad, or no memory */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("No ire_dep_parent", mp, ill);
			freemsg(mp);
			return;
		}
	}
	nce_refhold(nce);
	mutex_exit(&ire->ire_lock);

	if (nce->nce_is_condemned) {
		nce_t *nce1;

		nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_FALSE);
		nce_refrele(nce);
		if (nce1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("No nce", mp, ill);
			freemsg(mp);
			return;
		}
		nce = nce1;
	}
	dst_ill = nce->nce_ill;

	/*
	 * Unless we are forwarding, drop the packet.
	 * Unlike IPv4 we don't allow source routed packets out the same
	 * interface when we are not a router.
	 * Note that ill_forward_set() will set the ILLF_ROUTER on
	 * all the group members when it gets an ipmp-ill or under-ill.
	 */
	if (!(dst_ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		nce_refrele(nce);
		return;
	}

	if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
		ire->ire_ib_pkt_count--;
		/*
		 * Should only use IREs that are visible from the
		 * global zone for forwarding.
		 * For IPv6 any source route would have already been
		 * advanced in ip_fanout_v6
		 */
		ire = ire_route_recursive_v6(&ip6h->ip6_dst, 0, NULL,
		    GLOBAL_ZONEID, ira->ira_tsl, MATCH_IRE_SECATTR,
		    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst,
		    NULL, NULL, NULL);
		ire->ire_ib_pkt_count++;
		(*ire->ire_recvfn)(ire, mp, ip6h, ira);
		ire_refrele(ire);
		nce_refrele(nce);
		return;
	}
	/*
	 * ipIfStatsHCInForwDatagrams should only be incremented if there
	 * will be an attempt to forward the packet, which is why we
	 * increment after the above condition has been checked.
	 */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

	/* Initiate Read side IPPF processing */
	if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_FWD_IN, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred "
			    "during IPPF processing\n"));
			nce_refrele(nce);
			return;
		}
	}

	DTRACE_PROBE4(ip6__forwarding__start,
	    ill_t *, ill, ill_t *, dst_ill, ip6_t *, ip6h, mblk_t *, mp);

	if (HOOKS6_INTERESTED_FORWARDING(ipst)) {
		int error;

		FW_HOOKS(ipst->ips_ip6_forwarding_event,
		    ipst->ips_ipv6firewall_forwarding,
		    ill, dst_ill, ip6h, mp, mp, 0, ipst, error);

		DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp);

		if (mp == NULL) {
			nce_refrele(nce);
			return;
		}
		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_input.
		 */

		/* Might have changed */
		ip6h = (ip6_t *)mp->b_rptr;
		ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	}

	/* Packet is being forwarded. Turning off hwcksum flag. */
	DB_CKSUMFLAGS(mp) = 0;

	/*
	 * Per RFC 3513 section 2.5.2, we must not forward packets with
	 * an unspecified source address.
	 * The loopback address check for both src and dst has already
	 * been checked in ip_input_v6
	 * In the future one can envision adding RPF checks using number 3.
	 */
	switch (ipst->ips_src_check) {
	case 0:
		break;
	case 1:
	case 2:
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
			nce_refrele(nce);
			freemsg(mp);
			return;
		}
		break;
	}

	/*
	 * Check to see if we're forwarding the packet to a
	 * different link from which it came.  If so, check the
	 * source and destination addresses since routers must not
	 * forward any packets with link-local source or
	 * destination addresses to other links.  Otherwise (if
	 * we're forwarding onto the same link), conditionally send
	 * a redirect message.
	 */
	if (!IS_ON_SAME_LAN(dst_ill, ill)) {
		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
			freemsg(mp);
			nce_refrele(nce);
			return;
		}
		/* TBD add site-local check at site boundary? */
	} else if (ipst->ips_ipv6_send_redirects) {
		ip_send_potential_redirect_v6(mp, ip6h, ire, ira);
	}

	added_tx_len = 0;
	if (iraflags & IRAF_SYSTEM_LABELED) {
		mblk_t		*mp1;
		uint32_t	old_pkt_len = ira->ira_pktlen;

		/*
		 * Check if it can be forwarded and add/remove
		 * CIPSO options as needed.
		 */
		if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			ip_drop_input("tsol_ip_forward", mp, ill);
			freemsg(mp);
			nce_refrele(nce);
			return;
		}
		/*
		 * Size may have changed. Remember amount added in case
		 * ip_fragment needs to send an ICMP too big.
		 */
		mp = mp1;
		ip6h = (ip6_t *)mp->b_rptr;
		ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
		ira->ira_ip_hdr_length = IPV6_HDR_LEN;
		if (ira->ira_pktlen > old_pkt_len)
			added_tx_len = ira->ira_pktlen - old_pkt_len;
	}

	/* The effective MTU is the min of the interface and route MTU */
	mtu = dst_ill->ill_mtu;
	if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
		mtu = iremtu;
	ip_forward_xmit_v6(nce, mp, ip6h, ira, mtu, added_tx_len);
	nce_refrele(nce);
	return;

}

/*
 * Used for sending out unicast and multicast packets that are
 * forwarded. Handles hop limit decrement, outbound IPPF, and the
 * path-MTU check before transmitting via the nce. Consumes mp.
 */
void
ip_forward_xmit_v6(nce_t *nce, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira,
    uint32_t mtu, uint32_t added_tx_len)
{
	ill_t		*dst_ill = nce->nce_ill;
	uint32_t	pkt_len;
	iaflags_t	iraflags = ira->ira_flags;
	ip_stack_t	*ipst = dst_ill->ill_ipst;

	/* Decrement and check the hop limit; expired packets elicit ICMPv6 */
	if (ip6h->ip6_hops-- <= 1) {
		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ICMP6_TIME_EXCEED_TRANSIT", mp, ira->ira_ill);
		icmp_time_exceeded_v6(mp, ICMP6_TIME_EXCEED_TRANSIT, B_FALSE,
		    ira);
		return;
	}

	/* Initiate Write side IPPF processing before any fragmentation */
	if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred" \
			    " during IPPF processing\n"));
			return;
		}
	}

	pkt_len = ira->ira_pktlen;

	BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);

	if (pkt_len > mtu) {
		BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
		ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
		if (iraflags & IRAF_SYSTEM_LABELED) {
			/*
			 * Remove any CIPSO option added by
			 * tsol_ip_forward, and make sure we report
			 * a path MTU so that there
			 * is room to add such a CIPSO option for future
			 * packets.
			 */
			mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
			    AF_INET6);
		}
		icmp_pkt2big_v6(mp, mtu, B_TRUE, ira);
		return;
	}

	ASSERT(pkt_len ==
	    ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);

	if (iraflags & IRAF_LOOPBACK_COPY) {
		/*
		 * IXAF_NO_LOOP_ZONEID is not set hence 6th arg
		 * is don't care
		 */
		(void) ip_postfrag_loopcheck(mp, nce,
		    (IXAF_LOOPBACK_COPY | IXAF_NO_DEV_FLOW_CTL),
		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
	} else {
		(void) ip_xmit(mp, nce, IXAF_NO_DEV_FLOW_CTL,
		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
	}
}

/*
 * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
 * which is what ire_route_recursive returns when there is no matching ire.
 * Send ICMP unreachable unless blackhole. Consumes mp.
 */
void
ire_recv_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Would we have forwarded this packet if we had a route? */
	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("l2 multicast not forwarded", mp, ill);
		freemsg(mp);
		return;
	}

	if (!(ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		return;
	}
	/*
	 * If we had a route this could have been forwarded. Count as such.
	 *
	 * ipIfStatsHCInForwDatagrams should only be incremented if there
	 * will be an attempt to forward the packet, which is why we
	 * increment after the above condition has been checked.
	 */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);

	/* Let routing sockets / listeners know about the routing miss */
	ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
	    ipst);

	if (ire->ire_flags & RTF_BLACKHOLE) {
		/* Blackhole routes drop silently - no ICMP error */
		ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
		freemsg(mp);
	} else {
		ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);

		icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE,
		    ira);
	}
}

/*
 * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
 * VRRP when in noaccept mode.
 * We silently drop packets except for Neighbor Solicitations and
 * Neighbor Advertisements.
 */
void
ire_recv_noaccept_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	icmp6_t		*icmp6;
	int		ip_hdr_length;

	/* Anything other than ICMPv6 is dropped outright */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
		freemsg(mp);
		return;
	}
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return;
		}
		/* Make the ICMPv6 header contiguous before inspecting it */
		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
		if (ip6h == NULL) {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
			freemsg(mp);
			return;
		}
	}
	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);

	if (icmp6->icmp6_type != ND_NEIGHBOR_SOLICIT &&
	    icmp6->icmp6_type != ND_NEIGHBOR_ADVERT) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
		freemsg(mp);
		return;
	}
	ire_recv_local_v6(ire, mp, ip6h, ira);
}

/*
 * ire_recvfn for IRE_MULTICAST.
 */
void
ire_recv_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;

	ASSERT(ire->ire_ill == ira->ira_ill);

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);

	/* Tag for higher-level protocols */
	ira->ira_flags |= IRAF_MULTICAST;

	/*
	 * So that we don't end up with dups, only one ill in an IPMP group is
	 * nominated to receive multicast traffic.
	 * If we have no cast_ill we are liberal and accept everything.
	 */
	if (IS_UNDER_IPMP(ill)) {
		ip_stack_t	*ipst = ill->ill_ipst;

		/* For an under ill_grp can change under lock */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
		    ill->ill_grp->ig_cast_ill != NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			ip_drop_input("not on cast ill", mp, ill);
			freemsg(mp);
			return;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * We switch to the upper ill so that mrouter and hasmembers
		 * can operate on upper here and in ip_input_multicast.
		 */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill != NULL) {
			ASSERT(ill != ira->ira_ill);
			ASSERT(ire->ire_ill == ira->ira_ill);
			ira->ira_ill = ill;
			ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
		} else {
			ill = ira->ira_ill;
		}
	}

#ifdef notdef
	/*
	 * Check if we are a multicast router - send ip_mforward a copy of
	 * the packet.
	 * Due to mroute_decap tunnels we consider forwarding packets even if
	 * mrouted has not joined the allmulti group on this interface.
	 */
	if (ipst->ips_ip_g_mrouter) {
		int retval;

		/*
		 * Clear the indication that this may have hardware
		 * checksum as we are not using it for forwarding.
		 */
		DB_CKSUMFLAGS(mp) = 0;

		/*
		 * ip_mforward helps us make these distinctions: If received
		 * on tunnel and not IGMP, then drop.
		 * If IGMP packet, then don't check membership
		 * If received on a phyint and IGMP or PIM, then
		 * don't check membership
		 */
		retval = ip_mforward_v6(mp, ira);
		/* ip_mforward updates mib variables if needed */

		switch (retval) {
		case 0:
			/*
			 * pkt is okay and arrived on phyint.
			 */
			break;
		case -1:
			/* pkt is mal-formed, toss it */
			freemsg(mp);
			goto done;
		case 1:
			/*
			 * pkt is okay and arrived on a tunnel
			 *
			 * If we are running a multicast router
			 * we need to see all mld packets, which
			 * are marked with router alerts.
			 */
			if (ira->ira_flags & IRAF_ROUTER_ALERT)
				goto forus;
			ip_drop_input("Multicast on tunnel ignored", mp, ill);
			freemsg(mp);
			goto done;
		}
	}
#endif /* notdef */

	/*
	 * If this was a router alert we skip the group membership check.
	 */
	if (ira->ira_flags & IRAF_ROUTER_ALERT)
		goto forus;

	/*
	 * Check if we have members on this ill. This is not necessary for
	 * correctness because even if the NIC/GLD had a leaky filter, we
	 * filter before passing to each conn_t.
	 */
	if (!ill_hasmembers_v6(ill, &ip6h->ip6_dst)) {
		/*
		 * Nobody interested
		 *
		 * This might just be caused by the fact that
		 * multiple IP Multicast addresses map to the same
		 * link layer multicast - no need to increment counter!
		 */
		ip_drop_input("Multicast with no members", mp, ill);
		freemsg(mp);
		goto done;
	}
forus:
	ip2dbg(("ire_recv_multicast_v6: multicast for us\n"));

	/*
	 * After reassembly and IPsec we will need to duplicate the
	 * multicast packet for all matching zones on the ill.
	 */
	ira->ira_zoneid = ALL_ZONES;

	/* Reassemble on the ill on which the packet arrived */
	ip_input_local_v6(ire, mp, ip6h, ira);
done:
	/* Undo the switch to the upper IPMP ill, if we made one above */
	if (ill != ire->ire_ill) {
		ill_refrele(ill);
		ira->ira_ill = ire->ire_ill;
		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
	}
}

/*
 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
 * Drop packets since we don't forward out multirt routes.
 */
/* ARGSUSED */
void
ire_recv_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
	ip_drop_input("Not forwarding out MULTIRT", mp, ill);
	freemsg(mp);
}

/*
 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
 * has rewritten the packet to have a loopback destination address (We
 * filter out packets with a loopback destination from arriving over the wire).
 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
 */
void
ire_recv_loopback_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ill_t		*ire_ill = ire->ire_ill;

	ira->ira_zoneid = GLOBAL_ZONEID;

	/* Switch to the lo0 ill for further processing  */
	if (ire_ill != ill) {
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted.
		 * No need to hold the ill since we have a hold on the ire
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = ire_ill;

		ip_input_local_v6(ire, mp, ip6h, ira);

		/* Restore */
		ASSERT(ira->ira_ill == ire_ill);
		ira->ira_ill = ill;
		return;

	}
	ip_input_local_v6(ire, mp, ip6h, ira);
}

/*
 * ire_recvfn for IRE_LOCAL i.e., the packet is destined to an address
 * configured on this node.
 */
void
ire_recv_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ill_t		*ire_ill = ire->ire_ill;

	/* Make a note for DAD that this address is in use */
	ire->ire_last_used_time = lbolt;

	/* Only target the IRE_LOCAL with the right zoneid. */
	ira->ira_zoneid = ire->ire_zoneid;

	/*
	 * If the packet arrived on the wrong ill, we check that
	 * this is ok.
	 * If it is, then we ensure that we do the reassembly on
	 * the ill on which the address is hosted. We keep ira_rill as
	 * the one on which the packet arrived, so that IP_PKTINFO and
	 * friends can report this.
	 */
	if (ire_ill != ill) {
		ire_t *new_ire;

		new_ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill);
		if (new_ire == NULL) {
			/* Drop packet */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
			freemsg(mp);
			return;
		}
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted. No need to hold the ill since we have a
		 * hold on the ire. Note that we do the switch even if
		 * new_ire == ire (for IPMP, ire would be the one corresponding
		 * to the IPMP ill).
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = new_ire->ire_ill;

		/* ira_ruifindex tracks the upper for ira_rill */
		if (IS_UNDER_IPMP(ill))
			ira->ira_ruifindex = ill_get_upper_ifindex(ill);

		ip_input_local_v6(new_ire, mp, ip6h, ira);

		/* Restore */
		ASSERT(ira->ira_ill == new_ire->ire_ill);
		ira->ira_ill = ill;
		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;

		if (new_ire != ire)
			ire_refrele(new_ire);
		return;
	}

	ip_input_local_v6(ire, mp, ip6h, ira);
}

/*
 * Common function for packets arriving for the host. Handles
 * checksum verification, reassembly checks, etc.
 */
static void
ip_input_local_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
{
	iaflags_t	iraflags = ira->ira_flags;

	/*
	 * For multicast we need some extra work before
	 * we call ip_fanout_v6(), since in the case of shared-IP zones
	 * we need to pretend that a packet arrived for each zoneid.
	 */
	if (iraflags & IRAF_MULTICAST) {
		ip_input_multicast_v6(ire, mp, ip6h, ira);
		return;
	}
	ip_fanout_v6(mp, ip6h, ira);
}

/*
 * Handle multiple zones which want to receive the same multicast packets
 * on this ill by delivering a packet to each of them.
 *
 * Note that for packets delivered to transports we could instead do this
 * as part of the fanout code, but since we need to handle icmp_inbound
 * it is simpler to have multicast work the same as IPv4 broadcast.
 *
 * The ip_fanout matching for multicast matches based on ilm independent of
 * zoneid since the zoneid restriction is applied when joining a multicast
 * group.
 */
/* ARGSUSED */
static void
ip_input_multicast_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	iaflags_t	iraflags = ira->ira_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	netstack_t	*ns = ipst->ips_netstack;
	zoneid_t	zoneid;
	mblk_t		*mp1;
	ip6_t		*ip6h1;

	/* ire_recv_multicast has switched to the upper ill for IPMP */
	ASSERT(!IS_UNDER_IPMP(ill));

	/*
	 * If we don't have more than one shared-IP zone, or if
	 * there are no members in anything but the global zone,
	 * then just set the zoneid and proceed.
	 */
	if (ns->netstack_numzones == 1 ||
	    !ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
	    GLOBAL_ZONEID)) {
		ira->ira_zoneid = GLOBAL_ZONEID;

		/* If sender didn't want this zone to receive it, drop */
		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
		    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
			ip_drop_input("Multicast but wrong zoneid", mp, ill);
			freemsg(mp);
			return;
		}
		ip_fanout_v6(mp, ip6h, ira);
		return;
	}

	/*
	 * Here we loop over all zoneids that have members in the group
	 * and deliver a packet to ip_fanout for each zoneid.
	 *
	 * First find any members in the lowest numeric zoneid by looking for
	 * first zoneid larger than -1 (ALL_ZONES).
	 * We terminate the loop when we receive -1 (ALL_ZONES).
	 */
	zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
	for (; zoneid != ALL_ZONES;
	    zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, zoneid)) {
		/*
		 * Avoid an extra copymsg/freemsg by skipping global zone here
		 * and doing that at the end.
		 */
		if (zoneid == GLOBAL_ZONEID)
			continue;

		ira->ira_zoneid = zoneid;

		/* If sender didn't want this zone to receive it, skip */
		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
		    ira->ira_no_loop_zoneid == ira->ira_zoneid)
			continue;

		/* Each non-global zone gets its own copy of the packet */
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Failed to deliver to one zone */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards", mp, ill);
			continue;
		}
		ip6h1 = (ip6_t *)mp1->b_rptr;
		ip_fanout_v6(mp1, ip6h1, ira);
	}

	/* Do the main ire */
	ira->ira_zoneid = GLOBAL_ZONEID;
	/* If sender didn't want this zone to receive it, drop */
	if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
	    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
		ip_drop_input("Multicast but wrong zoneid", mp, ill);
		freemsg(mp);
	} else {
		ip_fanout_v6(mp, ip6h, ira);
	}
}


/*
 * Determine the zoneid and IRAF_TX_MAC_EXEMPTABLE if trusted extensions
 * is in use. Updates ira_zoneid and ira_flags as a result.
 */
static void
ip_fanout_tx_v6(mblk_t *mp, ip6_t *ip6h, uint8_t protocol, uint_t ip_hdr_length,
    ip_recv_attr_t *ira)
{
	uint16_t	*up;
	uint16_t	lport;
	zoneid_t	zoneid;

	ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);

	/*
	 * If the packet is unlabeled we might allow read-down
	 * for MAC_EXEMPT. Below we clear this if it is a multi-level
	 * port (MLP).
	 * Note that ira_tsl can be NULL here.
	 */
	if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
		ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;

	if (ira->ira_zoneid != ALL_ZONES)
		return;

	ira->ira_flags |= IRAF_TX_SHARED_ADDR;

	up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length);
	switch (protocol) {
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_UDP:
		/* Caller ensures this */
		ASSERT(((uchar_t *)ip6h) + ip_hdr_length +4 <= mp->b_wptr);

		/*
		 * Only these transports support MLP.
		 * We know their destination port number is in
		 * the same place in the header.
		 */
		lport = up[1];

		/*
		 * No need to handle exclusive-stack zones
		 * since ALL_ZONES only applies to the shared IP instance.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and
		 * search for the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a
		 * connection in it.  Otherwise, we look for a
		 * MAC-exempt connection in any zone whose label
		 * dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_attr_to_zoneid(ira);
		else
			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
		break;
	default:
		/* Handle shared address for other protocols */
		zoneid = tsol_attr_to_zoneid(ira);
		break;
	}
	ira->ira_zoneid = zoneid;
}

/*
 * Increment checksum failure statistics for the given ULP protocol,
 * distinguishing full/partial hardware offload from software checksums.
 */
static void
ip_input_cksum_err_v6(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	switch (protocol) {
	case IPPROTO_TCP:
		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);

		if (hck_flags & HCK_FULLCKSUM)
			IP6_STAT(ipst, ip6_tcp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP6_STAT(ipst, ip6_tcp_in_part_hw_cksum_err);
		else
			IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err);
		break;
	case IPPROTO_UDP:
		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
		if (hck_flags & HCK_FULLCKSUM)
			IP6_STAT(ipst, ip6_udp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP6_STAT(ipst, ip6_udp_in_part_hw_cksum_err);
		else
			IP6_STAT(ipst, ip6_udp_in_sw_cksum_err);
		break;
	case IPPROTO_ICMPV6:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		break;
	default:
		ASSERT(0);
		break;
	}
}

/* Calculate the IPv6 pseudo-header checksum for TCP, UDP, and ICMPV6 */
uint32_t
ip_input_cksum_pseudo_v6(ip6_t *ip6h, ip_recv_attr_t *ira)
{
	uint_t		ulp_len;
	uint32_t	cksum;
	uint8_t		protocol = ira->ira_protocol;
	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;

/* View the IPv6 header as an array of 16-bit words for summing */
#define	iphs	((uint16_t *)ip6h)

	switch (protocol) {
	case IPPROTO_TCP:
		ulp_len = ira->ira_pktlen - ip_hdr_length;

		/* Protocol and length */
		cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
		/* IP addresses (words 4-19 are src and dst) */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;

	case IPPROTO_UDP: {
		udpha_t	*udpha;

		udpha = (udpha_t *)((uchar_t *)ip6h + ip_hdr_length);

		/* Protocol and length (UDP carries its own length field) */
		cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
		/* IP addresses */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;
	}
	case IPPROTO_ICMPV6:
		ulp_len = ira->ira_pktlen - ip_hdr_length;

		/* Protocol and length */
		cksum = htons(ulp_len) + IP_ICMPV6_CSUM_COMP;
		/* IP addresses */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;
	default:
		cksum = 0;
		break;
	}
#undef	iphs
	return (cksum);
}


/*
 * Software verification of the ULP checksums.
 * Returns B_TRUE if ok.
 * Increments statistics if failed.
+ */ +static boolean_t +ip_input_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) +{ + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + uint32_t cksum; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + IP6_STAT(ipst, ip6_in_sw_cksum); + + ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP || + protocol == IPPROTO_ICMPV6); + + cksum = ip_input_cksum_pseudo_v6(ip6h, ira); + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + if (cksum == 0) + return (B_TRUE); + + ip_input_cksum_err_v6(protocol, 0, ira->ira_ill); + return (B_FALSE); +} + +/* + * Verify the ULP checksums. + * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum + * algorithm. + * Increments statistics if failed. + */ +static boolean_t +ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_rill; + uint16_t hck_flags; + uint32_t cksum; + mblk_t *mp1; + uint_t len; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_ICMPV6: + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ip6h + ip_hdr_length); + /* + * Before going through the regular checksum + * calculation, make sure the received checksum + * is non-zero. RFC 2460 says, a 0x0000 checksum + * in a UDP packet (within IPv6 packet) is invalid + * and should be replaced by 0xffff. This makes + * sense as regular checksum calculation will + * pass for both the cases i.e. 0x0000 and 0xffff. + * Removing one of the case makes error detection + * stronger. 
+ */ + if (udpha->uha_checksum == 0) { + /* 0x0000 checksum is invalid */ + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); + return (B_FALSE); + } + break; + } + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t pktsum; + + sctph = (sctp_hdr_t *)((uchar_t *)ip6h + ip_hdr_length); +#ifdef DEBUG + if (skip_sctp_cksum) + return (B_TRUE); +#endif + pktsum = sctph->sh_chksum; + sctph->sh_chksum = 0; + cksum = sctp_cksum(mp, ip_hdr_length); + sctph->sh_chksum = pktsum; + if (cksum == pktsum) + return (B_TRUE); + + /* + * Defer until later whether a bad checksum is ok + * in order to allow RAW sockets to use Adler checksum + * with SCTP. + */ + ira->ira_flags |= IRAF_SCTP_CSUM_ERR; + return (B_TRUE); + } + + default: + /* No ULP checksum to verify. */ + return (B_TRUE); + } + + /* + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload. + * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. + * Note: IRAF_NO_HW_CKSUM is not currently used. + */ + ASSERT(!IS_IPMP(ill)); + if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_input_sw_cksum_v6(mp, ip6h, ira)); + } + + /* + * We apply this for all ULP protocols. Does the HW know to + * not set the flags for SCTP and other protocols. + */ + + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HCK_FULLCKSUM) { + /* + * Full checksum has been computed by the hardware + * and has been attached. If the driver wants us to + * verify the correctness of the attached value, in + * order to protect against faulty hardware, compare + * it against -0 (0xFFFF) to see if it's valid. 
+ */ + if (hck_flags & HCK_FULLCKSUM_OK) + return (B_TRUE); + + cksum = DB_CKSUM16(mp); + if (cksum == 0xFFFF) + return (B_TRUE); + ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + + mp1 = mp->b_cont; + if ((hck_flags & HCK_PARTIALCKSUM) && + (mp1 == NULL || mp1->b_cont == NULL) && + ip_hdr_length >= DB_CKSUMSTART(mp) && + ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { + uint32_t adj; + uchar_t *cksum_start; + + cksum = ip_input_cksum_pseudo_v6(ip6h, ira); + + cksum_start = ((uchar_t *)ip6h + DB_CKSUMSTART(mp)); + + /* + * Partial checksum has been calculated by hardware + * and attached to the packet; in addition, any + * prepended extraneous data is even byte aligned, + * and there are at most two mblks associated with + * the packet. If any such data exists, we adjust + * the checksum; also take care any postpended data. + */ + IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); + /* + * One's complement subtract extraneous checksum + */ + cksum += DB_CKSUM16(mp); + if (adj >= cksum) + cksum = ~(adj - cksum) & 0xFFFF; + else + cksum -= adj; + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + if (!(~cksum & 0xFFFF)) + return (B_TRUE); + + ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + return (ip_input_sw_cksum_v6(mp, ip6h, ira)); +} + + +/* + * Handle fanout of received packets. + * Unicast packets that are looped back (from ire_send_local_v6) and packets + * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. + * + * IPQoS Notes + * Before sending it to the client, invoke IPPF processing. Policy processing + * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. 
+ */ +void +ip_fanout_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + uint8_t protocol; + conn_t *connp; +#define rptr ((uchar_t *)ip6h) + uint_t ip_hdr_length; + uint_t min_ulp_header_length; + int offset; + ssize_t len; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); + + /* + * We repeat this as we parse over destination options header and + * fragment headers (earlier we've handled any hop-by-hop options + * header.) + * We update ira_protocol and ira_ip_hdr_length as we skip past + * the intermediate headers; they already point past any + * hop-by-hop header. + */ +repeat: + protocol = ira->ira_protocol; + ip_hdr_length = ira->ira_ip_hdr_length; + + /* + * Time for IPP once we've done reassembly and IPsec. + * We skip this for loopback packets since we don't do IPQoS + * on loopback. + */ + if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && + !(iraflags & IRAF_LOOPBACK) && + (protocol != IPPROTO_ESP || protocol != IPPROTO_AH || + protocol != IPPROTO_DSTOPTS || protocol != IPPROTO_ROUTING || + protocol != IPPROTO_FRAGMENT)) { + /* + * Use the interface on which the packet arrived - not where + * the IP address is hosted. + */ + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return; + } + } + + /* Determine the minimum required size of the upper-layer header */ + /* Need to do this for at least the set of ULPs that TX handles. 
*/ + switch (protocol) { + case IPPROTO_TCP: + min_ulp_header_length = TCP_MIN_HEADER_LENGTH; + break; + case IPPROTO_SCTP: + min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; + break; + case IPPROTO_UDP: + min_ulp_header_length = UDPH_SIZE; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + min_ulp_header_length = ICMPH_SIZE; + break; + case IPPROTO_FRAGMENT: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + min_ulp_header_length = MIN_EHDR_LEN; + break; + default: + min_ulp_header_length = 0; + break; + } + /* Make sure we have the min ULP header length */ + len = mp->b_wptr - rptr; + if (len < ip_hdr_length + min_ulp_header_length) { + if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) + goto pkt_too_short; + + IP6_STAT(ipst, ip6_recv_pullup); + ip6h = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, + ira); + if (ip6h == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + + /* + * If trusted extensions then determine the zoneid and TX specific + * ira_flags. + */ + if (iraflags & IRAF_SYSTEM_LABELED) { + /* This can update ira->ira_flags and ira->ira_zoneid */ + ip_fanout_tx_v6(mp, ip6h, protocol, ip_hdr_length, ira); + iraflags = ira->ira_flags; + } + + + /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ + if (iraflags & IRAF_VERIFY_ULP_CKSUM) { + if (!ip_input_cksum_v6(iraflags, mp, ip6h, ira)) { + /* Bad checksum. Stats are already incremented */ + ip_drop_input("Bad ULP checksum", mp, ill); + freemsg(mp); + return; + } + /* IRAF_SCTP_CSUM_ERR could have been set */ + iraflags = ira->ira_flags; + } + switch (protocol) { + case IPPROTO_TCP: + /* For TCP, discard multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* First mblk contains IP+TCP headers per above check */ + ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); + + /* TCP options present? */ + offset = ((uchar_t *)ip6h)[ip_hdr_length + 12] >> 4; + if (offset != 5) { + if (offset < 5) + goto discard; + + /* + * There must be TCP options. 
+ * Make sure we can grab them. + */ + offset <<= 2; + offset += ip_hdr_length; + if (len < offset) { + if (ira->ira_pktlen < offset) + goto pkt_too_short; + + IP6_STAT(ipst, ip6_recv_pullup); + ip6h = ip_pullup(mp, offset, ira); + if (ip6h == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + } + + /* + * Pass up a squeue hint to tcp. + * If ira_sqp is already set (this is loopback) we leave it + * alone. + */ + if (ira->ira_sqp == NULL) { + ira->ira_sqp = ip_squeue_get(ira->ira_ring); + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v6(mp, IPPROTO_TCP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + if (!IPCL_IS_TCP(connp)) { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + + /* + * We do different processing whether called from + * ip_accept_tcp and we match the target, don't match + * the target, and when we are called by ip_input. 
+ */ + if (iraflags & IRAF_TARGET_SQP) { + if (ira->ira_target_sqp == connp->conn_sqp) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", + mp, ill); + freemsg(mp); + CONN_DEC_REF(connp); + } else { + SET_SQUEUE(attrmp, connp->conn_recv, + connp); + attrmp->b_cont = mp; + ASSERT(ira->ira_target_sqp_mp == NULL); + ira->ira_target_sqp_mp = attrmp; + /* + * Conn ref release when drained from + * the squeue. + */ + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, SQ_FILL, + SQTAG_IP6_TCP_INPUT); + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, + connp, ira, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t ports; /* Source and destination ports */ + sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; + + /* For SCTP, discard multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* + * Since there is no SCTP h/w cksum support yet, just + * clear the flag. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* Length ensured above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); + sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); + + /* get the ports */ + ports = *(uint32_t *)&sctph->sh_sport; + + if (iraflags & IRAF_SCTP_CSUM_ERR) { + /* + * No potential sctp checksum errors go to the Sun + * sctp stack however they might be Adler-32 summed + * packets a userland stack bound to a raw IP socket + * could reasonably use. Note though that Adler-32 is + * a long deprecated algorithm and customer sctp + * networks should eventually migrate to CRC-32 at + * which time this facility should be removed. 
+ */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst, ports, + ira, mp, sctps); + if (connp == NULL) { + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + sctp_input(connp, NULL, ip6h, mp, ira); + /* sctp_input does a rele of the sctp_t */ + return; + } + + case IPPROTO_UDP: + /* First mblk contains IP+UDP headers as checked above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); + + if (iraflags & IRAF_MULTIBROADCAST) { + uint16_t *up; /* Pointer to ports in ULP header */ + + up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length); + + ip_fanout_udp_multi_v6(mp, ip6h, up[1], up[0], ira); + return; + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v6(mp, IPPROTO_UDP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + no_udp_match: + if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP]. + connf_head != NULL) { + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v6(mp, ip6h, ira); + } else { + ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_NOPORT, ira); + } + return; + + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + goto no_udp_match; + } + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { + CONN_DEC_REF(connp); + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + ip_drop_input("udpIfStatsInOverflows", mp, ill); + freemsg(mp); + return; + } + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + /* Found a client; up it goes */ + IP6_STAT(ipst, ip6_udp_fannorm); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + default: + break; + } + + /* + * Clear hardware checksumming flag as it is currently only + * used by TCP and UDP. + */ + DB_CKSUMFLAGS(mp) = 0; + + switch (protocol) { + case IPPROTO_ICMPV6: + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); + + /* Check variable for testing applications */ + if (ipst->ips_ipv6_drop_inbound_icmpv6) { + ip_drop_input("ipv6_drop_inbound_icmpv6", mp, ill); + freemsg(mp); + return; + } + /* + * We need to accomodate icmp messages coming in clear + * until we get everything secure from the wire. If + * icmp_accept_clear_messages is zero we check with + * the global policy and act accordingly. If it is + * non-zero, we accept the message without any checks. + * But *this does not mean* that this will be delivered + * to RAW socket clients. By accepting we might send + * replies back, change our MTU value etc., + * but delivery to the ULP/clients depends on their + * policy dispositions. 
+ */ + if (ipst->ips_icmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + NULL, ip6h, ira, ns); + if (mp == NULL) + return; + } + + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_icmp6_mib, + ipv6IfIcmpInErrors); + ip_drop_input("tsol_can_accept_raw", mp, ill); + freemsg(mp); + return; + } + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = icmp_inbound_v6(mp, ira); + if (mp == NULL) { + /* No need to pass to RAW sockets */ + return; + } + break; + + case IPPROTO_DSTOPTS: { + ip6_dest_t *desthdr; + uint_t ehdrlen; + uint8_t *optptr; + + /* We already check for MIN_EHDR_LEN above */ + + /* Check if AH is present and needs to be processed. */ + mp = ipsec_early_ah_v6(mp, ira); + if (mp == NULL) + return; + + /* + * Reinitialize pointers, as ipsec_early_ah_v6() does + * complete pullups. We don't have to do more pullups + * as a result. + */ + ip6h = (ip6_t *)mp->b_rptr; + + if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira); + if (ip6h == NULL) + goto discard; + } + desthdr = (ip6_dest_t *)(rptr + ip_hdr_length); + ehdrlen = 8 * (desthdr->ip6d_len + 1); + if (ira->ira_pktlen - ip_hdr_length < ehdrlen) + goto pkt_too_short; + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) + goto discard; + + desthdr = (ip6_dest_t *)(rptr + ip_hdr_length); + } + optptr = (uint8_t *)&desthdr[1]; + + /* + * Update ira_ip_hdr_length to skip the destination header + * when we repeat. 
+ */ + ira->ira_ip_hdr_length += ehdrlen; + + ira->ira_protocol = desthdr->ip6d_nxt; + + /* + * Note: XXX This code does not seem to make + * distinction between Destination Options Header + * being before/after Routing Header which can + * happen if we are at the end of source route. + * This may become significant in future. + * (No real significant Destination Options are + * defined/implemented yet ). + */ + switch (ip_process_options_v6(mp, ip6h, optptr, + ehdrlen - 2, IPPROTO_DSTOPTS, ira)) { + case -1: + /* + * Packet has been consumed and any needed + * ICMP errors sent. + */ + return; + case 0: + /* No action needed, continue */ + break; + case 1: + /* + * Unexpected return value + * (Router alert is a Hop-by-Hop option) + */ +#ifdef DEBUG + panic("ip_fanout_v6: router " + "alert hbh opt indication in dest opt"); + /*NOTREACHED*/ +#else + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; +#endif + } + goto repeat; + } + case IPPROTO_FRAGMENT: { + ip6_frag_t *fraghdr; + + if (ira->ira_pktlen - ip_hdr_length < sizeof (ip6_frag_t)) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + sizeof (ip6_frag_t) > mp->b_wptr) { + ip6h = ip_pullup(mp, + ip_hdr_length + sizeof (ip6_frag_t), ira); + if (ip6h == NULL) + goto discard; + } + + fraghdr = (ip6_frag_t *)(rptr + ip_hdr_length); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); + + /* + * Invoke the CGTP (multirouting) filtering module to + * process the incoming packet. Packets identified as + * duplicates must be discarded. Filtering is active + * only if the ip_cgtp_filter ndd variable is + * non-zero. + */ + if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) { + int cgtp_flt_pkt; + netstackid_t stackid; + + stackid = ipst->ips_netstack->netstack_stackid; + + /* + * CGTP and IPMP are mutually exclusive so + * phyint_ifindex is fine here. 
+ */ + cgtp_flt_pkt = + ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6( + stackid, ill->ill_phyint->phyint_ifindex, + ip6h, fraghdr); + if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { + ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); + freemsg(mp); + return; + } + } + + /* + * Update ip_hdr_length to skip the frag header + * ip_input_fragment_v6 will determine the extension header + * prior to the fragment header and update its nexthdr value, + * and also set ira_protocol to the nexthdr that follows the + * completed fragment. + */ + ip_hdr_length += sizeof (ip6_frag_t); + + /* + * Make sure we have ira_l2src before we lose the original + * mblk + */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp = ip_input_fragment_v6(mp, ip6h, fraghdr, + ira->ira_pktlen - ip_hdr_length, ira); + if (mp == NULL) { + /* Reassembly is still pending */ + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); + + /* + * The mblk chain has the frag header removed and + * ira_protocol, ira_pktlen, ira_ip_hdr_length as well as the + * IP header has been updated to reflect the result. + */ + ip6h = (ip6_t *)mp->b_rptr; + ip_hdr_length = ira->ira_ip_hdr_length; + goto repeat; + } + case IPPROTO_HOPOPTS: + /* + * Illegal header sequence. + * (Hop-by-hop headers are processed above + * and required to immediately follow IPv6 header) + */ + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira); + return; + + case IPPROTO_ROUTING: { + uint_t ehdrlen; + ip6_rthdr_t *rthdr; + + /* Check if AH is present and needs to be processed. */ + mp = ipsec_early_ah_v6(mp, ira); + if (mp == NULL) + return; + + /* + * Reinitialize pointers, as ipsec_early_ah_v6() does + * complete pullups. We don't have to do more pullups + * as a result. 
+ */ + ip6h = (ip6_t *)mp->b_rptr; + + if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira); + if (ip6h == NULL) + goto discard; + } + rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length); + protocol = ira->ira_protocol = rthdr->ip6r_nxt; + ehdrlen = 8 * (rthdr->ip6r_len + 1); + if (ira->ira_pktlen - ip_hdr_length < ehdrlen) + goto pkt_too_short; + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) + goto discard; + rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length); + } + if (rthdr->ip6r_segleft != 0) { + /* Not end of source route */ + if (ira->ira_flags & + (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsInForwProhibits", + mp, ill); + freemsg(mp); + return; + } + ip_process_rthdr(mp, ip6h, rthdr, ira); + return; + } + ira->ira_ip_hdr_length += ehdrlen; + goto repeat; + } + + case IPPROTO_AH: + case IPPROTO_ESP: { + /* + * Fast path for AH/ESP. 
+ */ + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + IP_STAT(ipst, ipsec_proto_ahesp); + + if (!ipsec_loaded(ipss)) { + ip_proto_not_sup(mp, ira); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* select inbound SA and have IPsec process the pkt */ + if (protocol == IPPROTO_ESP) { + esph_t *esph; + + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); + + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, + ira); + } else { + ah_t *ah; + + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, + ira); + } + + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; + } + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return; + } + case IPPROTO_NONE: + /* All processing is done. Count as "delivered". */ + freemsg(mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + return; + + case IPPROTO_ENCAP: + case IPPROTO_IPV6: + /* iptun will verify trusted label */ + connp = ipcl_classify_v6(mp, protocol, ip_hdr_length, + ira, ipst); + if (connp != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + connp->conn_recv(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + /* FALLTHRU */ + default: + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. 
+ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + break; + } + + /* + * The above input functions may have returned the pulled up message. + * So ip6h needs to be reinitialized. + */ + ip6h = (ip6_t *)mp->b_rptr; + ira->ira_protocol = protocol; + if (ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head == NULL) { + /* No user-level listener for these packets */ + ip_proto_not_sup(mp, ira); + return; + } + + /* + * Handle fanout to raw sockets. There + * can be more than one stream bound to a particular + * protocol. When this is the case, each one gets a copy + * of any incoming packets. + */ + ASSERT(ira->ira_protocol == protocol); + ip_fanout_proto_v6(mp, ip6h, ira); + return; + +pkt_too_short: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + +discard: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); +#undef rptr +} diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index c13a66fcc2..7697ca20c7 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -60,122 +60,122 @@ #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> +#define IS_DEFAULT_ROUTE_V6(ire) \ + (((ire)->ire_type & IRE_DEFAULT) || \ + (((ire)->ire_type & IRE_INTERFACE) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) + static ire_t ire_null; -static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire); -static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *gateway, int type, - const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, - const ts_label_t *tsl, int match_flags); -static ire_t *ire_init_v6(ire_t *, const in6_addr_t 
*, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *, - ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t, - const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); -static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); +static ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst); /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. + * Returns zero or errno. */ -static ire_t * +int ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle, - uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, + zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { + int error; /* - * Reject IRE security attribute creation/initialization + * Reject IRE security attribute creation/initialization * if system is not running in Trusted mode. 
*/ - if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) - return (NULL); - + if (gc != NULL && !is_system_labeled()) + return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); - ire->ire_addr_v6 = *v6addr; - - if (v6src_addr != NULL) - ire->ire_src_addr_v6 = *v6src_addr; - if (v6mask != NULL) { - ire->ire_mask_v6 = *v6mask; - ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); - } + if (v6addr != NULL) + ire->ire_addr_v6 = *v6addr; if (v6gateway != NULL) ire->ire_gateway_addr_v6 = *v6gateway; - if (type == IRE_CACHE && v6cmask != NULL) - ire->ire_cmask_v6 = *v6cmask; - - /* - * Multirouted packets need to have a fragment header added so that - * the receiver is able to discard duplicates according to their - * fragment identifier. - */ - if (type == IRE_CACHE && (flags & RTF_MULTIRT)) { - ire->ire_frag_flag = IPH_FRAG_HDR; + /* Make sure we don't have stray values in some fields */ + switch (type) { + case IRE_LOOPBACK: + ire->ire_gateway_addr_v6 = ire->ire_addr_v6; + /* FALLTHRU */ + case IRE_HOST: + case IRE_LOCAL: + case IRE_IF_CLONE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + break; + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + if (v6mask != NULL) { + ire->ire_mask_v6 = *v6mask; + ire->ire_masklen = + ip_mask_to_plen_v6(&ire->ire_mask_v6); + } + break; + case IRE_MULTICAST: + case IRE_NOROUTE: + ASSERT(v6mask == NULL); + break; + default: + ASSERT(0); + return (EINVAL); } - /* ire_init_common will free the mblks upon encountering any failure */ - if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif, - phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst)) - return (NULL); - - return (ire); -} - -/* - * Similar to ire_create_v6 except that it is called only when - * we want to allocate ire as an mblk e.g. we have a external - * resolver. Do we need this in IPv6 ? 
- * - * IPv6 initializes the ire_nce in ire_add_v6, which expects to - * find the ire_nce to be null when it is called. So, although - * we have a src_nce parameter (in the interest of matching up with - * the argument list of the v4 version), we ignore the src_nce - * argument here. - */ -/* ARGSUSED */ -ire_t * -ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) -{ - ire_t *ire; - ire_t *ret_ire; - mblk_t *mp; + error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, + gc, ipst); + if (error != NULL) + return (error); - ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); + /* Determine which function pointers to use */ + ire->ire_postfragfn = ip_xmit; /* Common case */ - /* Allocate the new IRE. */ - mp = allocb(sizeof (ire_t), BPRI_MED); - if (mp == NULL) { - ip1dbg(("ire_create_mp_v6: alloc failed\n")); - return (NULL); + switch (ire->ire_type) { + case IRE_LOCAL: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_local_v6; +#ifdef SO_VRRP + ASSERT(ire->ire_ill != NULL); + if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { + ire->ire_noaccept = B_TRUE; + ire->ire_recvfn = ire_recv_noaccept_v6; + } +#endif + break; + case IRE_LOOPBACK: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_loopback_v6; + break; + case IRE_MULTICAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_multicast_v6; + ire->ire_recvfn = ire_recv_multicast_v6; + break; + default: + /* + * For IRE_IF_ALL and IRE_OFFLINK we forward received + * packets by default. 
+ */ + ire->ire_sendfn = ire_send_wire_v6; + ire->ire_recvfn = ire_recv_forward_v6; + break; } - - ire = (ire_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&ire[1]; - - /* Start clean. */ - *ire = ire_null; - ire->ire_mp = mp; - mp->b_datap->db_type = IRE_DB_TYPE; - - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - NULL, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); - - if (ret_ire == NULL) { - freeb(ire->ire_mp); - return (NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire->ire_sendfn = ire_send_noroute_v6; + ire->ire_recvfn = ire_recv_noroute_v6; + } else if (ire->ire_flags & RTF_MULTIRT) { + ire->ire_postfragfn = ip_postfrag_multirt_v6; + ire->ire_sendfn = ire_send_multirt_v6; + ire->ire_recvfn = ire_recv_multirt_v6; } - return (ire); + ire->ire_nce_capable = ire_determine_nce_capable(ire); + return (0); } /* @@ -183,153 +183,76 @@ ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, * * NOTE : This is called as writer sometimes though not required * by this function. - * - * See comments above ire_create_mp_v6() for the rationale behind the - * unused src_nce argument. 
*/ /* ARGSUSED */ ire_t * ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, - ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, + uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { ire_t *ire; - ire_t *ret_ire; + int error; ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { - ip1dbg(("ire_create_v6: alloc failed\n")); + DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - max_fragp, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); + error = ire_init_v6(ire, v6addr, v6mask, v6gateway, + type, ill, zoneid, flags, gc, ipst); - if (ret_ire == NULL) { + if (error != 0) { + DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } - ASSERT(ret_ire == ire); return (ire); } /* - * Find an IRE_INTERFACE for the multicast group. + * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to FF::0/8 but could be more specific) * which point at different interfaces. This is used when IPV6_MULTICAST_IF * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't * specify the interface to join on. * - * Supports link-local addresses by following the ipif/ill when recursing. + * Supports link-local addresses by using ire_route_recursive which follows + * the ill when recursing. 
+ * + * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group + * and the MULTIRT property can be different for different groups, we + * extract RTF_MULTIRT from the special unicast route added for a group + * with CGTP and pass that back in the multirtp argument. + * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. + * We have a setsrcp argument for the same reason. */ -ire_t * -ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) { ire_t *ire; - ipif_t *ipif = NULL; - int match_flags = MATCH_IRE_TYPE; - in6_addr_t gw_addr_v6; - - ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst); + ill_t *ill; - /* We search a resolvable ire in case of multirouting. */ - if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { - ire_t *cire = NULL; - /* - * If the route is not resolvable, the looked up ire - * may be changed here. In that case, ire_multirt_lookup_v6() - * IRE_REFRELE the original ire and change it. - */ - (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW, - NULL, ipst); - if (cire != NULL) - ire_refrele(cire); - } - if (ire == NULL) - return (NULL); - /* - * Make sure we follow ire_ipif. - * - * We need to determine the interface route through - * which the gateway will be reached. 
- */ - if (ire->ire_ipif != NULL) { - ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL; - } + ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, + MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); + ASSERT(ire != NULL); - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - ire_refrele(ire); - ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0, - IRE_INTERFACE, ipif, NULL, zoneid, 0, - NULL, match_flags, ipst); - return (ire); - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - return (ire); - default: + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } -} -/* - * Return any local address. We use this to target ourselves - * when the src address was specified as 'default'. - * Preference for IRE_LOCAL entries. - */ -ire_t * -ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - irb_t *irb; - ire_t *maybe = NULL; - int i; + if (multirtp != NULL) + *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if (irb->irb_ire == NULL) - continue; - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire; ire = ire->ire_next) { - if ((ire->ire_marks & IRE_MARK_CONDEMNED) || - ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) - continue; - switch (ire->ire_type) { - case IRE_LOOPBACK: - if (maybe == NULL) { - IRE_REFHOLD(ire); - maybe = ire; - } - break; - case IRE_LOCAL: - if (maybe != NULL) { - ire_refrele(maybe); - } - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - return (ire); - } - } - rw_exit(&irb->irb_lock); - } - return (maybe); + ill = ire_nexthop_ill(ire); + ire_refrele(ire); + return (ill); } /* @@ -369,6 +292,8 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) if (plen < 0 || plen > IPV6_ABITS) return (NULL); *bitmask = ipv6_all_zeros; + if 
(plen == 0) + return (bitmask); ptr = (uint32_t *)bitmask; while (plen > 32) { @@ -380,196 +305,78 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) } /* - * Add a fully initialized IRE to an appropriate - * table based on ire_type. - * - * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and - * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. - * - * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK - * and IRE_CACHE. - * - * NOTE : This function is called as writer though not required - * by this function. + * Add a fully initialized IPv6 IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. */ -int -ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) +ire_t * +ire_add_v6(ire_t *ire) { ire_t *ire1; int mask_table_index; irb_t *irb_ptr; ire_t **irep; - int flags; - ire_t *pire = NULL; - ill_t *stq_ill; - boolean_t ndp_g_lock_held = B_FALSE; - ire_t *ire = *ire_p; + int match_flags; int error; ip_stack_t *ipst = ire->ire_ipst; - uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ - ASSERT(ire->ire_nce == NULL); - - /* - * IREs with source addresses hosted on interfaces that are under IPMP - * should be hidden so that applications don't accidentally end up - * sending packets with test addresses as their source addresses, or - * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. - * (We let IREs with unspecified source addresses slip through since - * ire_send_v6() will delete them automatically.) 
- */ - if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { - DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); - marks |= IRE_MARK_TESTHIDDEN; - } - - /* Find the appropriate list head. */ - switch (ire->ire_type) { - case IRE_HOST: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_CACHE: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - break; - case IRE_LOCAL: - case IRE_LOOPBACK: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - break; - case IRE_PREFIX: - case IRE_DEFAULT: - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: - ire->ire_marks |= marks; - break; - default: - printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", - (void *)ire, ire->ire_type); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } /* Make sure the address is properly masked. 
*/ V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); - if ((ire->ire_type & IRE_CACHETABLE) == 0) { - /* IRE goes into Forward Table */ - mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); - if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == - NULL) { - irb_t *ptr; - int i; - - ptr = (irb_t *)mi_zalloc(( - ipst->ips_ip6_ftable_hash_size * sizeof (irb_t))); - if (ptr == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (ENOMEM); - } - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { - rw_init(&ptr[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } - mutex_enter(&ipst->ips_ire_ft_init_lock); - if (ipst->ips_ip_forwarding_table_v6[ - mask_table_index] == NULL) { - ipst->ips_ip_forwarding_table_v6[ - mask_table_index] = ptr; - mutex_exit(&ipst->ips_ire_ft_init_lock); - } else { - /* - * Some other thread won the race in - * initializing the forwarding table at the - * same index. - */ - mutex_exit(&ipst->ips_ire_ft_init_lock); - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; - i++) { - rw_destroy(&ptr[i].irb_lock); - } - mi_free(ptr); - } - } - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ - IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, - ipst->ips_ip6_ftable_hash_size)]); - } else { - irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]); - } - /* - * For xresolv interfaces (v6 interfaces with an external - * address resolver), ip_newroute_v6/ip_newroute_ipif_v6 - * are unable to prevent the deletion of the interface route - * while adding an IRE_CACHE for an on-link destination - * in the IRE_IF_RESOLVER case, since the ire has to go to - * the external resolver and return. We can't do a REFHOLD on the - * associated interface ire for fear of the message being freed - * if the external resolver can't resolve the address. 
- * Here we look up the interface ire in the forwarding table - * and make sure that the interface route has not been deleted. - */ - if (ire->ire_type == IRE_CACHE && - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) { + mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); + if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { + irb_t *ptr; + int i; - pire = ire_ihandle_lookup_onlink_v6(ire); - if (pire == NULL) { + ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * + sizeof (irb_t))); + if (ptr == NULL) { ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + return (NULL); } - /* Prevent pire from getting deleted */ - IRB_REFHOLD(pire->ire_bucket); - /* Has it been removed already? */ - if (pire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); + } + mutex_enter(&ipst->ips_ire_ft_init_lock); + if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == + NULL) { + ipst->ips_ip_forwarding_table_v6[mask_table_index] = + ptr; + mutex_exit(&ipst->ips_ire_ft_init_lock); + } else { + /* + * Some other thread won the race in + * initializing the forwarding table at the + * same index. 
+ */ + mutex_exit(&ipst->ips_ire_ft_init_lock); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_destroy(&ptr[i].irb_lock); + } + mi_free(ptr); } } + irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ + IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, + ipst->ips_ip6_ftable_hash_size)]); - flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + if (ire->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; /* - * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check - * for duplicates because : - * - * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be - * pointing at different ills. A real duplicate is - * a match on both ire_ipif and ire_stq. - * - * 2) We could have multiple packets trying to create - * an IRE_CACHE for the same ill. - * - * Rather than looking at the packet, we depend on the above for - * MATCH_IRE_ILL here. - * - * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have - * multiple IRE_CACHES for an ill for the same destination - * with various scoped addresses i.e represented by ipifs. - * - * MATCH_IRE_ILL is done implicitly below for IRE_CACHES. + * Start the atomic add of the ire. Grab the bucket lock and the + * ill lock. Check for condemned. */ - if (ire->ire_ipif != NULL) - flags |= MATCH_IRE_IPIF; + error = ire_atomic_start(irb_ptr, ire); + if (error != 0) { + ire_delete(ire); + return (NULL); + } /* * If we are creating a hidden IRE, make sure we search for @@ -577,103 +384,36 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * Otherwise, we might find an IRE on some other interface * that's not marked hidden. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) - flags |= MATCH_IRE_MARK_TESTHIDDEN; - - /* - * Start the atomic add of the ire. Grab the ill locks, - * ill_g_usesrc_lock and the bucket lock. Check for condemned. - * To avoid lock order problems, get the ndp6.ndp_g_lock now itself. 
- */ - if (ire->ire_type == IRE_CACHE) { - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - ndp_g_lock_held = B_TRUE; - } - - /* - * If ipif or ill is changing ire_atomic_start() may queue the - * request and return EINPROGRESS. - */ - - error = ire_atomic_start(irb_ptr, ire, q, mp, func); - if (error != 0) { - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - /* - * We don't know whether it is a valid ipif or not. - * So, set it to NULL. This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - return (error); - } - /* - * To avoid creating ires having stale values for the ire_max_frag - * we get the latest value atomically here. For more details - * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE - * in ip_rput_dlpi_writer - */ - if (ire->ire_max_fragp == NULL) { - if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) - ire->ire_max_frag = ire->ire_ipif->ipif_mtu; - else - ire->ire_max_frag = pire->ire_max_frag; - } else { - uint_t max_frag; - - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - } + if (ire->ire_testhidden) + match_flags |= MATCH_IRE_TESTHIDDEN; /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { - if (ire1->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire1)) continue; - - if (ire->ire_type == IRE_CACHE) { - /* - * We do MATCH_IRE_ILL implicitly here for IRE_CACHES. - * As ire_ipif and ire_stq could point to two - * different ills, we can't pass just ire_ipif to - * ire_match_args and get a match on both ills. - * This is just needed for duplicate checks here and - * so we don't add an extra argument to - * ire_match_args for this. Do it locally. 
- * - * NOTE : Currently there is no part of the code - * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL - * match for IRE_CACHEs. Thus we don't want to - * extend the arguments to ire_match_args_v6. - */ - if (ire1->ire_stq != ire->ire_stq) - continue; - /* - * Multiroute IRE_CACHEs for a given destination can - * have the same ire_ipif, typically if their source - * address is forced using RTF_SETSRC, and the same - * send-to queue. We differentiate them using the parent - * handle. - */ - if ((ire1->ire_flags & RTF_MULTIRT) && - (ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_phandle != ire->ire_phandle)) - continue; - } + /* + * Here we need an exact match on zoneid, i.e., + * ire_match_args doesn't fit. + */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; + + if (ire1->ire_type != ire->ire_type) + continue; + + /* + * Note: We do not allow multiple routes that differ only + * in the gateway security attributes; such routes are + * considered duplicates. + * To change that we explicitly have to treat them as + * different here. + */ if (ire_match_args_v6(ire1, &ire->ire_addr_v6, &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, - ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL, - flags)) { + ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, + match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE @@ -683,141 +423,25 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ ip1dbg(("found dup ire existing %p new %p", (void *)ire1, (void *)ire)); - IRE_REFHOLD(ire1); - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + ire_refhold(ire1); + atomic_add_32(&ire1->ire_identical_ref, 1); ire_atomic_end(irb_ptr, ire); ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. 
- */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = ire1; - return (0); + return (ire1); } } - if (ire->ire_type == IRE_CACHE) { - const in6_addr_t *addr_v6; - ill_t *ill = ire_to_ill(ire); - char buf[INET6_ADDRSTRLEN]; - nce_t *nce; - /* - * All IRE_CACHE types must have a nce. If this is - * not the case the entry will not be added. We need - * to make sure that if somebody deletes the nce - * after we looked up, they will find this ire and - * delete the ire. To delete this ire one needs the - * bucket lock which we are still holding here. So, - * even if the nce gets deleted after we looked up, - * this ire will get deleted. - * - * NOTE : Don't need the ire_lock for accessing - * ire_gateway_addr_v6 as it is appearing first - * time on the list and rts_setgwr_v6 could not - * be changing this. - */ - addr_v6 = &ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) - addr_v6 = &ire->ire_addr_v6; - - /* nce fastpath is per-ill; don't match across illgrp */ - nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); - if (nce == NULL) - goto failed; - - /* Pair of refhold, refrele just to get the tracing right */ - NCE_REFHOLD_TO_REFHOLD_NOTR(nce); - /* - * Atomically make sure that new IREs don't point - * to an NCE that is logically deleted (CONDEMNED). - * ndp_delete() first marks the NCE CONDEMNED. - * This ensures that the nce_refcnt won't increase - * due to new nce_lookups or due to addition of new IREs - * pointing to this NCE. Then ndp_delete() cleans up - * existing references. If we don't do it atomically here, - * ndp_delete() -> nce_ire_delete() will not be able to - * clean up the IRE list completely, and the nce_refcnt - * won't go down to zero. - */ - mutex_enter(&nce->nce_lock); - if (ill->ill_flags & ILLF_XRESOLV) { - /* - * If we used an external resolver, we may not - * have gone through neighbor discovery to get here. - * Must update the nce_state before the next check. 
- */ - if (nce->nce_state == ND_INCOMPLETE) - nce->nce_state = ND_REACHABLE; - } - if (nce->nce_state == ND_INCOMPLETE || - (nce->nce_flags & NCE_F_CONDEMNED) || - (nce->nce_state == ND_UNREACHABLE)) { -failed: - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - if (nce != NULL) - mutex_exit(&nce->nce_lock); - ire_atomic_end(irb_ptr, ire); - ip1dbg(("ire_add_v6: No nce for dst %s \n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - buf, sizeof (buf)))); - ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - if (nce != NULL) - NCE_REFRELE_NOTR(nce); - *ire_p = NULL; - return (EINVAL); - } else { - ire->ire_nce = nce; - } - mutex_exit(&nce->nce_lock); - } /* - * Find the first entry that matches ire_addr - provides - * tail insertion. *irep will be null if no match. + * Normally we do head insertion since most things do not care about + * the order of the IREs in the bucket. + * However, due to shared-IP zones (and restrict_interzone_loopback) + * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same + * address. For that reason we do tail insertion for IRE_IF_CLONE. */ irep = (ire_t **)irb_ptr; - while ((ire1 = *irep) != NULL && - !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) - irep = &ire1->ire_next; - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - - if (*irep != NULL) { - /* - * Find the last ire which matches ire_addr_v6. - * Needed to do tail insertion among entries with the same - * ire_addr_v6. - */ - while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, - &ire1->ire_addr_v6)) { + if (ire->ire_type & IRE_IF_CLONE) { + while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; - ire1 = *irep; - if (ire1 == NULL) - break; - } - } - - if (ire->ire_type == IRE_DEFAULT) { - /* - * We keep a count of default gateways which is used when - * assigning them as routes. 
- */ - ipst->ips_ipv6_ire_default_count++; - ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */ } /* Insert at *irep */ ire1 = *irep; @@ -852,62 +476,22 @@ failed: * in the list for the first time and no one else can bump * up the reference count on this yet. */ - IRE_REFHOLD_LOCKED(ire); + ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); irb_ptr->irb_ire_cnt++; - if (ire->ire_marks & IRE_MARK_TEMPORARY) - irb_ptr->irb_tmp_ire_cnt++; - if (ire->ire_ipif != NULL) { - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, + if (ire->ire_ill != NULL) { + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, (char *), "ire", (void *), ire); - ire->ire_ipif->ipif_ire_cnt++; - if (ire->ire_stq != NULL) { - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt++; - } - } else { - ASSERT(ire->ire_stq == NULL); + ire->ire_ill->ill_ire_cnt++; + ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } - - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ire_atomic_end(irb_ptr, ire); - if (pire != NULL) { - /* Assert that it is not removed from the list yet */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - - if (ire->ire_type != IRE_CACHE) { - /* - * For ire's with with host mask see if there is an entry - * in the cache. If there is one flush the whole cache as - * there might be multiple entries due to RTF_MULTIRT (CGTP). - * If no entry is found than there is no need to flush the - * cache. 
- */ - - if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) { - ire_t *lire; - lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL, - IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (lire != NULL) { - ire_refrele(lire); - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } else { - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } + /* Make any caching of the IREs be notified or updated */ + ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - *ire_p = ire; - return (0); + return (ire); } /* @@ -931,7 +515,7 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) return; for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { irb = &irb_ptr[i]; - IRB_REFHOLD(irb); + irb_refhold(irb); for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { if (!(ire->ire_flags & RTF_DYNAMIC)) continue; @@ -941,50 +525,11 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) ire_delete(ire); } - IRB_REFRELE(irb); + irb_refrele(irb); } } /* - * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart - * of ip_ire_clookup_and_delete. The difference being this function does not - * return any value. IPv6 processing of a gratuitous ARP, as it stands, is - * different than IPv4 in that, regardless of the presence of a cache entry - * for this address, an ire_walk_v6 is done. Another difference is that unlike - * in the case of IPv4 this does not take an ipif_t argument, since it is only - * called by ip_arp_news and the match is always only on the address. 
- */ -void -ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *cire; - boolean_t found = B_FALSE; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) { - if (cire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) { - - /* This signifies start of a match */ - if (!found) - found = B_TRUE; - if (cire->ire_type == IRE_CACHE) { - if (cire->ire_nce != NULL) - ndp_delete(cire->ire_nce); - ire_delete_v6(cire); - } - /* End of the match */ - } else if (found) - break; - } - IRB_REFRELE(irb); -} - -/* * Delete the specified IRE. * All calls should use ire_delete(). * Sometimes called as writer though not required by this function. @@ -998,11 +543,20 @@ ire_delete_v6(ire_t *ire) in6_addr_t gw_addr_v6; ip_stack_t *ipst = ire->ire_ipst; + /* + * Make sure ire_generation increases from ire_flush_cache happen + * after any lookup/reader has read ire_generation. + * Since the rw_enter makes us wait until any lookup/reader has + * completed we can exit the lock immediately. + */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + rw_exit(&ipst->ips_ip6_ire_head_lock); + ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV6_VERSION); - if (ire->ire_type != IRE_CACHE) - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away @@ -1014,368 +568,284 @@ ire_delete_v6(ire_t *ire) mutex_exit(&ire->ire_lock); ire_delete_host_redirects_v6(&gw_addr_v6, ipst); } -} - -/* - * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect - * entries. 
- */ -/*ARGSUSED1*/ -void -ire_delete_cache_v6(ire_t *ire, char *arg) -{ - char addrstr1[INET6_ADDRSTRLEN]; - char addrstr2[INET6_ADDRSTRLEN]; - - if ((ire->ire_type & IRE_CACHE) || - (ire->ire_flags & RTF_DYNAMIC)) { - ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - addrstr1, sizeof (addrstr1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6, - addrstr2, sizeof (addrstr2)))); - ire_delete(ire); - } - -} -/* - * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries - * that have a given gateway address. - */ -void -ire_delete_cache_gw_v6(ire_t *ire, char *addr) -{ - in6_addr_t *gw_addr = (in6_addr_t *)addr; - char buf1[INET6_ADDRSTRLEN]; - char buf2[INET6_ADDRSTRLEN]; - in6_addr_t ire_gw_addr_v6; - - if (!(ire->ire_type & IRE_CACHE) && - !(ire->ire_flags & RTF_DYNAMIC)) - return; - - mutex_enter(&ire->ire_lock); - ire_gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); + /* + * If we are deleting an IRE_INTERFACE then we make sure we also + * delete any IRE_IF_CLONE that has been created from it. + * Those are always in ire_dep_children. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); - if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) { - ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n", - inet_ntop(AF_INET6, &ire->ire_src_addr_v6, - buf1, sizeof (buf1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire_gw_addr_v6, - buf2, sizeof (buf2)))); - ire_delete(ire); + /* Remove from parent dependencies and child */ + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + if (ire->ire_dep_parent != NULL) { + ire_dep_remove(ire); } + while (ire->ire_dep_children != NULL) + ire_dep_remove(ire->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * Remove all IRE_CACHE entries that match - * the ire specified. (Sometimes called - * as writer though not required by this function.) 
- * - * The flag argument indicates if the - * flush request is due to addition - * of new route (IRE_FLUSH_ADD) or deletion of old - * route (IRE_FLUSH_DELETE). + * When an IRE is added or deleted this routine is called to make sure + * any caching of IRE information is notified or updated. * - * This routine takes only the IREs from the forwarding - * table and flushes the corresponding entries from - * the cache table. - * - * When flushing due to the deletion of an old route, it - * just checks the cache handles (ire_phandle and ire_ihandle) and - * deletes the ones that match. - * - * When flushing due to the creation of a new route, it checks - * if a cache entry's address matches the one in the IRE and - * that the cache entry's parent has a less specific mask than the - * one in IRE. The destination of such a cache entry could be the - * gateway for other cache entries, so we need to flush those as - * well by looking for gateway addresses matching the IRE's address. + * The flag argument indicates if the flush request is due to addition + * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), + * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v6(ire_t *ire, int flag) { - int i; - ire_t *cire; - irb_t *irb; - ip_stack_t *ipst = ire->ire_ipst; + ip_stack_t *ipst = ire->ire_ipst; - if (ire->ire_type & IRE_CACHE) + /* + * IRE_IF_CLONE ire's don't provide any new information + * than the parent from which they are cloned, so don't + * perturb the generation numbers. + */ + if (ire->ire_type & IRE_IF_CLONE) return; /* - * If a default is just created, there is no point - * in going through the cache, as there will not be any - * cached ires. + * Ensure that an ire_add during a lookup serializes the updates of + * the generation numbers under ire_head_lock so that the lookup gets + * either the old ire and old generation number, or a new ire and new + * generation number. 
+ */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + + /* + * If a route was just added, we need to notify everybody that + * has cached an IRE_NOROUTE since there might now be a better + * route for them. */ - if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) - return; if (flag == IRE_FLUSH_ADD) { + ire_increment_generation(ipst->ips_ire_reject_v6); + ire_increment_generation(ipst->ips_ire_blackhole_v6); + } + + /* Adding a default can't otherwise provide a better route */ + if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return; + } + + switch (flag) { + case IRE_FLUSH_DELETE: + case IRE_FLUSH_GWCHANGE: /* - * This selective flush is - * due to the addition of - * new IRE. + * Update ire_generation for all ire_dep_children chains + * starting with this IRE */ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - /* - * If 'cire' belongs to the same subnet - * as the new ire being added, and 'cire' - * is derived from a prefix that is less - * specific than the new ire being added, - * we need to flush 'cire'; for instance, - * when a new interface comes up. - */ - if ((V6_MASK_EQ_2(cire->ire_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6) && - (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <= - ire->ire_masklen))) { - ire_delete(cire); - continue; - } - /* - * This is the case when the ire_gateway_addr - * of 'cire' belongs to the same subnet as - * the new ire being added. - * Flushing such ires is sometimes required to - * avoid misrouting: say we have a machine with - * two interfaces (I1 and I2), a default router - * R on the I1 subnet, and a host route to an - * off-link destination D with a gateway G on - * the I2 subnet. 
- * Under normal operation, we will have an - * on-link cache entry for G and an off-link - * cache entry for D with G as ire_gateway_addr, - * traffic to D will reach its destination - * through gateway G. - * If the administrator does 'ifconfig I2 down', - * the cache entries for D and G will be - * flushed. However, G will now be resolved as - * an off-link destination using R (the default - * router) as gateway. Then D will also be - * resolved as an off-link destination using G - * as gateway - this behavior is due to - * compatibility reasons, see comment in - * ire_ihandle_lookup_offlink(). Traffic to D - * will go to the router R and probably won't - * reach the destination. - * The administrator then does 'ifconfig I2 up'. - * Since G is on the I2 subnet, this routine - * will flush its cache entry. It must also - * flush the cache entry for D, otherwise - * traffic will stay misrouted until the IRE - * times out. - */ - if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6)) { - ire_delete(cire); - continue; - } - } - IRB_REFRELE(irb); - } - } else { + ire_dep_incr_generation(ire); + break; + case IRE_FLUSH_ADD: { + in6_addr_t addr; + in6_addr_t mask; + ip_stack_t *ipst = ire->ire_ipst; + uint_t masklen; + /* - * delete the cache entries based on - * handle in the IRE as this IRE is - * being deleted/changed. + * Find an IRE which is a shorter match than the ire to be added + * For any such IRE (which we repeat) we update the + * ire_generation the same way as in the delete case. 
*/ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - if ((cire->ire_phandle == 0 || - cire->ire_phandle != ire->ire_phandle) && - (cire->ire_ihandle == 0 || - cire->ire_ihandle != ire->ire_ihandle)) - continue; - ire_delete(cire); - } - IRB_REFRELE(irb); + addr = ire->ire_addr_v6; + mask = ire->ire_mask_v6; + masklen = ip_mask_to_plen_v6(&mask); + + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, + ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + while (ire != NULL) { + /* We need to handle all in the same bucket */ + irb_increment_generation(ire->ire_bucket); + + mask = ire->ire_mask_v6; + ASSERT(masklen > ip_mask_to_plen_v6(&mask)); + masklen = ip_mask_to_plen_v6(&mask); + ire_refrele(ire); + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + } } + break; } + rw_exit(&ipst->ips_ip6_ire_head_lock); } /* * Matches the arguments passed with the values in the ire. * - * Note: for match types that match using "ipif" passed in, ipif + * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. 
*/ -static boolean_t +boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid, - uint32_t ihandle, const ts_label_t *tsl, int match_flags) + const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags) { in6_addr_t masked_addr; in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; - ill_t *ipif_ill = NULL; - ipif_t *src_ipif; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); ASSERT((!(match_flags & MATCH_IRE_ILL)) || - (ipif != NULL && ipif->ipif_isv6)); + (ill != NULL && ill->ill_isv6)); /* - * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it - * is in fact hidden, to ensure the caller gets the right one. One - * exception: if the caller passed MATCH_IRE_IHANDLE, then they - * already know the identity of the given IRE_INTERFACE entry and - * there's no point trying to hide it from them. + * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { - if (match_flags & MATCH_IRE_IHANDLE) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + if (ire->ire_testhidden) { + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* - * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is - * valid and does not match that of ire_zoneid, a failure to + * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid + * does not match that of ire_zoneid, a failure to * match is reported at this point. 
Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * - * IRE_CACHE and IRE_LOOPBACK entries should - * never be matched in this situation. + * IRE_LOOPBACK + * entries should never be matched in this situation. + * Each zone has its own IRE_LOOPBACK. * - * IRE entries that have an interface associated with them - * should in general not match unless they are an IRE_LOCAL - * or in the case when MATCH_IRE_DEFAULT has been set in - * the caller. In the case of the former, checking of the - * other fields supplied should take place. + * IRE_LOCAL + * We allow them for any zoneid. ire_route_recursive + * does additional checks when + * ip_restrict_interzone_loopback is set. * - * In the case where MATCH_IRE_DEFAULT has been set, - * all of the ipif's associated with the IRE's ill are - * checked to see if there is a matching zoneid. If any - * one ipif has a matching zoneid, this IRE is a - * potential candidate so checking of the other fields - * takes place. + * If ill_usesrc_ifindex is set + * Then we check if the zone has a valid source address + * on the usesrc ill. * - * In the case where the IRE_INTERFACE has a usable source - * address (indicated by ill_usesrc_ifindex) in the - * correct zone then it's permitted to return this IRE + * If ire_ill is set, then check that the zone has an ipif + * on that ill. + * + * Outside of this function (in ire_round_robin) we check + * that any IRE_OFFLINK has a gateway that reachable from the + * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); - if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK)) + if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); + + if (ire->ire_type & IRE_LOCAL) + goto matchit; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. 
+ * The normal case of IRE_ONLINK has a matching zoneid. + * Here we handle the case when shared-IP zones have been + * configured with IP addresses on vniN. In that case it + * is ok for traffic from a zone to use IRE_ONLINK routes + * if the ill has a usesrc pointing at vniN + * Applies to IRE_INTERFACE. */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - dst_ill = (ill_t *)ire->ire_stq->q_ptr; + dst_ill = ire->ire_ill; + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + + /* + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. + */ + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); + /* * If there is a usable source address in the - * zone, then it's ok to return an - * IRE_INTERFACE + * zone, then it's ok to return this IRE_INTERFACE */ - if ((dst_ill->ill_usesrc_ifindex != 0) && - (src_ipif = ipif_select_source_v6(dst_ill, addr, - B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) - != NULL) { - ip3dbg(("ire_match_args: src_ipif %p" - " dst_ill %p", (void *)src_ipif, - (void *)dst_ill)); - ipif_refrele(src_ipif); - } else { - ip3dbg(("ire_match_args: src_ipif NULL" + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } - if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && - !(ire->ire_type & IRE_INTERFACE)) { + /* + * For exampe, with + * route add 11.0.0.0 gw1 -ifp bge0 + * route add 11.0.0.0 gw2 -ifp bge1 + * this code would differentiate based on + * where the sending zone has addresses. + * Only if the zone has an address on bge0 can it use the first + * route. It isn't clear if this behavior is documented + * anywhere. 
+ */ + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; - if ((match_flags & MATCH_IRE_DEFAULT) == 0) - return (B_FALSE); - mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); - for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { - if (IPIF_CAN_LOOKUP(tipif) && + if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } - mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); + mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) return (B_FALSE); } } +matchit: if (match_flags & MATCH_IRE_GW) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - - /* - * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to - * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means - * of getting a source address -- i.e., ire_src_addr_v6 == - * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. - * - * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. - * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for - * IPMP test traffic), then the ill must match exactly. - */ if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire_to_ill(ire); - ipif_ill = ipif->ipif_ill; - } + ire_ill = ire->ire_ill; + /* + * If asked to match an ill, we *must* match + * on the ire_ill for ipmp test addresses, or + * any of the ill in the group for data addresses. + * If we don't, we may as well fail. + * However, we need an exception for IRE_LOCALs to ensure + * we loopback packets even sent to test addresses on different + * interfaces in the group. 
+ */ + if ((match_flags & MATCH_IRE_TESTHIDDEN) && + !(ire->ire_type & IRE_LOCAL)) { + if (ire->ire_ill != ill) + return (B_FALSE); + } else { + match_flags &= ~MATCH_IRE_TESTHIDDEN; + /* + * We know that ill is not NULL, but ire_ill could be + * NULL + */ + if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) + return (B_FALSE); + } + } /* No ire_addr_v6 bits set past the mask */ ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6)); V6_MASK_COPY(*addr, *mask, masked_addr); - if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && ((!(match_flags & MATCH_IRE_GW)) || IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && - ((!(match_flags & MATCH_IRE_TYPE)) || - (ire->ire_type & type)) && - ((!(match_flags & MATCH_IRE_SRC)) || - IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6src_addr)) && - ((!(match_flags & MATCH_IRE_IPIF)) || - (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || - (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill || - (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && - ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && - ((!(match_flags & MATCH_IRE_IHANDLE)) || - (ire->ire_ihandle == ihandle)) && + ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && + ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && + ((!(match_flags & MATCH_IRE_MASK)) || + (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1386,41 +856,38 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, } /* - * Lookup for a route in all the tables + * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified + * gateway address. If ill is non-NULL we also match on it. + * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
*/ -ire_t * -ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) +boolean_t +ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, + const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { - ire_t *ire = NULL; + ire_t *ire; + uint_t match_flags; - /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. - */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (lock_held) + ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); + else + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); - /* - * might be asking for a cache lookup, - * This is not best way to lookup cache, - * user should call ire_cache_lookup directly. - * - * If MATCH_IRE_TYPE was set, first lookup in the cache table and then - * in the forwarding table, if the applicable type flags were set. - */ - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { - ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid, - tsl, flags, ipst); - if (ire != NULL) - return (ire); - } - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { - ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif, - pire, zoneid, 0, tsl, flags, ipst); + match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; + if (ill != NULL) + match_flags |= MATCH_IRE_ILL; + + ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, + &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, + ipst); + + if (!lock_held) + rw_exit(&ipst->ips_ip6_ire_head_lock); + if (ire != NULL) { + ire_refrele(ire); + return (B_TRUE); + } else { + return (B_FALSE); } - return (ire); } /* @@ -1429,63 +896,121 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * required parameters and indicating the * match required in flag field. 
* - * Looking for default route can be done in three ways - * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field - * along with other matches. - * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags - * field along with other matches. - * 3) if the destination and mask are passed as zeros. - * - * A request to return a default route if no route - * is found, can be specified by setting MATCH_IRE_DEFAULT - * in flags. - * - * It does not support recursion more than one level. It - * will do recursive lookup only when the lookup maps to - * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed. - * - * If the routing table is setup to allow more than one level - * of recursion, the cleaning up cache table will not work resulting - * in invalid routing. - * * Supports link-local addresses by following the ipif/ill when recursing. - * - * NOTE : When this function returns NULL, pire has already been released. - * pire is valid only when this function successfully returns an - * ire. */ ire_t * ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags, - ip_stack_t *ipst) + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { - irb_t *irb_ptr; - ire_t *rire; ire_t *ire = NULL; - ire_t *saved_ire; - nce_t *nce; - int i; - in6_addr_t gw_addr_v6; ASSERT(addr != NULL); ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT(ipif == NULL || ipif->ipif_isv6); + ASSERT(ill == NULL || ill->ill_isv6); + + ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); /* - * When we return NULL from this function, we should make - * sure that *pire is NULL so that the callers will not - * wrongly REFRELE the pire. 
+ * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL + * is set. */ - if (pire != NULL) - *pire = NULL; + if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) + return (NULL); + + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); + ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, + tsl, flags, ipst); + if (ire == NULL) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return (NULL); + } + /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. + * round-robin only if we have more than one route in the bucket. + * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never + * + * Note: if we found an IRE_IF_CLONE we won't look at the bucket with + * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match + * and the IRE_INTERFACESs are likely to be shorter matches. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE_V6(ire))) { + ire_t *next_ire; + ire_ftable_args_t margs; + + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_addr_v6 = *addr; + if (mask != NULL) + margs.ift_mask_v6 = *mask; + if (gateway != NULL) + margs.ift_gateway_v6 = *gateway; + margs.ift_type = type; + margs.ift_ill = ill; + margs.ift_zoneid = zoneid; + margs.ift_tsl = tsl; + margs.ift_flags = flags; + + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + goto done; + } + ire_refrele(ire); + ire = next_ire; + } + } + +done: + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; + + rw_exit(&ipst->ips_ip6_ire_head_lock); + + /* + * For shared-IP zones we need additional checks to what was + * done in ire_match_args to make 
sure IRE_LOCALs are handled. + * + * When ip_restrict_interzone_loopback is set, then + * we ensure that IRE_LOCAL are only used for loopback + * between zones when the logical "Ethernet" would + * have looped them back. That is, if in the absense of + * the IRE_LOCAL we would have sent to packet out the + * same ill. + */ + if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && + ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && + ipst->ips_ip_restrict_interzone_loopback) { + ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); + ASSERT(ire != NULL); + } + + return (ire); +} + +/* + * Look up a single ire. The caller holds either the read or write lock. + */ +ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst) +{ + irb_t *irb_ptr; + ire_t *ire = NULL; + int i; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); /* * If the mask is known, the lookup @@ -1496,28 +1021,41 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, uint_t masklen; masklen = ip_mask_to_plen_v6(mask); - if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) + if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { return (NULL); + } irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ IRE_ADDR_MASK_HASH_V6(*addr, *mask, ipst->ips_ip6_ftable_hash_size)]); rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, mask, gateway, type, - ipif, zoneid, ihandle, tsl, flags)) + ill, zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } else { + uint_t masklen; + /* * In this case we don't know the mask, we need to * search the table assuming different mask sizes. - * we start with 128 bit mask, we don't allow default here. 
*/ - for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) { + if (flags & MATCH_IRE_SHORTERMASK) { + masklen = ip_mask_to_plen_v6(mask); + if (masklen == 0) { + /* Nothing shorter than zero */ + return (NULL); + } + masklen--; + } else { + masklen = IP6_MASK_TABLE_SIZE - 1; + } + + for (i = masklen; i >= 0; i--) { in6_addr_t tmpmask; if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) @@ -1529,1334 +1067,415 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, - &ire->ire_mask_v6, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) + &ire->ire_mask_v6, gateway, type, ill, + zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } } - - /* - * We come here if no route has yet been found. - * - * Handle the case where default route is - * requested by specifying type as one of the possible - * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE). - * - * If MATCH_IRE_MASK is specified, then the appropriate default route - * would have been found above if it exists so it isn't looked up here. - * If MATCH_IRE_DEFAULT was also specified, then a default route will be - * searched for later. 
- */ - if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE && - (type & (IRE_DEFAULT | IRE_INTERFACE))) { - if (ipst->ips_ip_forwarding_table_v6[0] != NULL) { - /* addr & mask is zero for defaults */ - irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][ - IRE_ADDR_HASH_V6(ipv6_all_zeros, - ipst->ips_ip6_ftable_hash_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) - goto found_ire; - } - rw_exit(&irb_ptr->irb_lock); - } - } - /* - * We come here only if no route is found. - * see if the default route can be used which is allowed - * only if the default matching criteria is specified. - * The ipv6_ire_default_count tracks the number of IRE_DEFAULT - * entries. However, the ip_forwarding_table_v6[0] also contains - * interface routes thus the count can be zero. - */ - saved_ire = NULL; - if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) == - MATCH_IRE_DEFAULT) { - ire_t *ire_origin; - uint_t g_index; - uint_t index; - - if (ipst->ips_ip_forwarding_table_v6[0] == NULL) - return (NULL); - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0]; - - /* - * Keep a tab on the bucket while looking the IRE_DEFAULT - * entries. We need to keep track of a particular IRE - * (ire_origin) so this ensures that it will not be unlinked - * from the hash list during the recursive lookup below. - */ - IRB_REFHOLD(irb_ptr); - ire = irb_ptr->irb_ire; - if (ire == NULL) { - IRB_REFRELE(irb_ptr); - return (NULL); - } - - /* - * Get the index first, since it can be changed by other - * threads. Then get to the right default route skipping - * default interface routes if any. As we hold a reference on - * the IRE bucket, ipv6_ire_default_count can only increase so - * we can't reach the end of the hash list unexpectedly. 
- */ - if (ipst->ips_ipv6_ire_default_count != 0) { - g_index = ipst->ips_ipv6_ire_default_index++; - index = g_index % ipst->ips_ipv6_ire_default_count; - while (index != 0) { - if (!(ire->ire_type & IRE_INTERFACE)) - index--; - ire = ire->ire_next; - } - ASSERT(ire != NULL); - } else { - /* - * No default route, so we only have default interface - * routes: don't enter the first loop. - */ - ire = NULL; - } - - /* - * Round-robin the default routers list looking for a neighbor - * that matches the passed in parameters and is reachable. If - * none found, just return a route from the default router list - * if it exists. If we can't find a default route (IRE_DEFAULT), - * look for interface default routes. - * We start with the ire we found above and we walk the hash - * list until we're back where we started, see - * ire_get_next_default_ire(). It doesn't matter if default - * routes are added or deleted by other threads - we know this - * ire will stay in the list because we hold a reference on the - * ire bucket. - * NB: if we only have interface default routes, ire is NULL so - * we don't even enter this loop (see above). - */ - ire_origin = ire; - for (; ire != NULL; - ire = ire_get_next_default_ire(ire, ire_origin)) { - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - int match_flags; - - /* - * We have something to work with. - * If we can find a resolved/reachable - * entry, we will use this. Otherwise - * we'll try to find an entry that has - * a resolved cache entry. We will fallback - * on this if we don't find anything else. 
- */ - if (saved_ire == NULL) - saved_ire = ire; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, - 0, ire->ire_ipif, zoneid, tsl, match_flags, - ipst); - if (rire != NULL) { - nce = rire->ire_nce; - if (nce != NULL && - NCE_ISREACHABLE(nce) && - nce->nce_flags & NCE_F_ISROUTER) { - ire_refrele(rire); - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else if (nce != NULL && - !(nce->nce_flags & - NCE_F_ISROUTER)) { - /* - * Make sure we don't use - * this ire - */ - if (saved_ire == ire) - saved_ire = NULL; - } - ire_refrele(rire); - } else if (ipst-> - ips_ipv6_ire_default_count > 1 && - zoneid != GLOBAL_ZONEID) { - /* - * When we're in a local zone, we're - * only interested in default routers - * that are reachable through ipifs - * within our zone. - * The potentially expensive call to - * ire_route_lookup_v6() is avoided when - * we have only one default route. - */ - int ire_match_flags = MATCH_IRE_TYPE | - MATCH_IRE_SECATTR; - - if (ire->ire_ipif != NULL) { - ire_match_flags |= - MATCH_IRE_ILL; - } - rire = ire_route_lookup_v6(&gw_addr_v6, - NULL, NULL, IRE_INTERFACE, - ire->ire_ipif, NULL, - zoneid, tsl, ire_match_flags, ipst); - if (rire != NULL) { - ire_refrele(rire); - saved_ire = ire; - } else if (saved_ire == ire) { - /* - * Make sure we don't use - * this ire - */ - saved_ire = NULL; - } - } - } - } - if (saved_ire != NULL) { - ire = saved_ire; - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else { - /* - * Look for a interface default route matching the - * args passed in. No round robin here. Just pick - * the right one. 
- */ - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (!(ire->ire_type & IRE_INTERFACE)) - continue; - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } - } - IRB_REFRELE(irb_ptr); - } - } ASSERT(ire == NULL); ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); return (NULL); + found_ire: - ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0); - IRE_REFHOLD(ire); + ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); - -found_ire_held: - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); - } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * or IRE_CACHETABLE type. If this is a recursive lookup and an - * IRE_INTERFACE type was found, return that. If it was some other - * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it - * is necessary to fill in the parent IRE pointed to by pire, and - * then lookup the gateway address of the parent. For backwards - * compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - if (flags & MATCH_IRE_RECURSIVE) { - const ipif_t *gw_ipif; - int match_flags = MATCH_IRE_DSTONLY; - - if (ire->ire_type & IRE_INTERFACE) - return (ire); - if (pire != NULL) - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the saved_ire. 
- */ - saved_ire = ire; - - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0, - ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). - */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. - */ - if (pire == NULL) { - ire_refrele(saved_ire); - } - return (ire); - } - match_flags |= MATCH_IRE_TYPE; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, - (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid, - NULL, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). 
- */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } else if (pire == NULL) { - /* - * If the caller did not ask for pire, release - * it now. - */ - ire_refrele(saved_ire); - } - return (ire); - } - - ASSERT(pire == NULL || *pire == NULL); return (ire); } -/* - * Delete the IRE cache for the gateway and all IRE caches whose - * ire_gateway_addr_v6 points to this gateway, and allow them to - * be created on demand by ip_newroute_v6. - */ -void -ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid, - ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0, - IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) { - ire_delete(ire); - } - } - IRB_REFRELE(irb); - - ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst); -} - -/* - * Looks up cache table for a route. - * specific lookup can be indicated by - * passing the MATCH_* flags and the - * necessary parameters. 
- */ -ire_t * -ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, - int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl, - int flags, ip_stack_t *ipst) -{ - ire_ctable_args_t margs; - - margs.ict_addr = (void *)addr; - margs.ict_gateway = (void *)gateway; - margs.ict_type = type; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = tsl; - margs.ict_flags = flags; - margs.ict_ipst = ipst; - margs.ict_wq = NULL; - - return (ip6_ctable_lookup_impl(&margs)); -} /* - * Lookup cache. + * This function is called by + * ip_input/ire_route_recursive when doing a route lookup on only the + * destination address. * - * In general the zoneid has to match (where ALL_ZONES match all of them). - * But for IRE_LOCAL we also need to handle the case where L2 should - * conceptually loop back the packet. This is necessary since neither - * Ethernet drivers nor Ethernet hardware loops back packets sent to their - * own MAC address. This loopback is needed when the normal - * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill as the ill with which this IRE_LOCAL is associated. + * The optimizations of this function over ire_ftable_lookup are: + * o removing unnecessary flag matching + * o doing longest prefix match instead of overloading it further + * with the unnecessary "best_prefix_match" * - * Earlier versions of this code always matched an IRE_LOCAL independently of - * the zoneid. We preserve that earlier behavior when - * ip_restrict_interzone_loopback is turned off. + * If no route is found we return IRE_NOROUTE. 
*/ ire_t * -ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - irb_t *irb_ptr; - ire_t *ire; - - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { - /* - * Finally, check if the security policy has any - * restriction on using this route for the specified - * message. - */ - if (tsl != NULL && - ire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(ire, tsl) != 0) { - continue; - } - - if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || - ire->ire_zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - - if (ire->ire_type == IRE_LOCAL) { - if (ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, zoneid, - (void *)addr, tsl, ipst)) - continue; - - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - } - rw_exit(&irb_ptr->irb_lock); - return (NULL); -} - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an onlink destn. or - * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER - * case for xresolv interfaces, after the ire has come back from - * an external resolver. - */ -static ire_t * -ire_ihandle_lookup_onlink_v6(ire_t *cire) +ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, + ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; - int match_flags; - int i; - int j; - irb_t *irb_ptr; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. 
- * (When ip_newroute_v6() created 'cire' for an on-link destn. - * it set its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, - NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * In the resolver/noresolver case, ip_newroute_v6() thinks - * it is creating the cache ire for an onlink destination in 'cire'. - * But 'cire' is not actually onlink, because ire_ftable_lookup_v6() - * cheated it, by doing ire_route_lookup_v6() twice and returning an - * interface ire. - * - * Eg. default - gw1 (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * In the above example, ip_newroute_v6() tried to create the cache ire - * 'cire' for gw1, based on the interface route in line 3. The - * ire_ftable_lookup_v6() above fails, because there is - * no interface route to reach gw1. (it is gw2). We fall thru below. - * - * Do a brute force search based on the ihandle in a subset of the - * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise - * things become very complex, since we don't have 'pire' in this - * case. 
(Also note that this method is not possible in the offlink - * case because we don't know the mask) - */ - i = ip_mask_to_plen_v6(&cire->ire_cmask_v6); - if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) - return (NULL); - for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { - irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_ihandle == cire->ire_ihandle)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - rw_exit(&irb_ptr->irb_lock); + ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, + MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); + if (ire == NULL) { + ire = ire_reject(ipst, B_TRUE); + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; } - return (NULL); + /* ftable_lookup did round robin */ + return (ire); } - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an offlink destn based - * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire - * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in - * the IRE_CACHE case. - */ ire_t * -ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) +ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, + uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) { - ire_t *ire; - int match_flags; - in6_addr_t gw_addr; - ipif_t *gw_ipif; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL && pire != NULL); + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. 
- * (When ip_newroute_v6() created 'cire' for an on-link destn. it set - * its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0, - IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * Assume we are trying to ping some offlink destn, and we have the - * routing table below. - * - * Eg. default - gw1 <--- pire (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * If we already have a cache ire for gw1 in 'cire', the - * ire_ftable_lookup_v6 above would have failed, since there is no - * interface ire to reach gw1. We will fallthru below. - * - * Here we duplicate the steps that ire_ftable_lookup_v6() did in - * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. - * The differences are the following - * i. We want the interface ire only, so we call - * ire_ftable_lookup_v6() instead of ire_route_lookup_v6() - * ii. We look for only prefix routes in the 1st call below. - * ii. We want to match on the ihandle in the 2nd call below. - */ - match_flags = MATCH_IRE_TYPE; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&pire->ire_lock); - gw_addr = pire->ire_gateway_addr_v6; - mutex_exit(&pire->ire_lock); - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET, - pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (ire == NULL) - return (NULL); - /* - * At this point 'ire' corresponds to the entry shown in line 2. - * gw_addr is 'gw2' in the example above. 
- */ - mutex_enter(&ire->ire_lock); - gw_addr = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - - match_flags |= MATCH_IRE_IHANDLE; - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE, - gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - return (ire); + return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, + multirtp)); } /* - * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER - * ire associated with the specified ipif. + * Recursively look for a route to the destination. Can also match on + * the zoneid, ill, and label. Used for the data paths. See also + * ire_route_recursive_dstonly. * - * This might occasionally be called when IPIF_UP is not set since - * the IPV6_MULTICAST_IF as well as creating interface routes - * allows specifying a down ipif (ipif_lookup* match ipifs that are down). + * If ill is set this means we will match it by adding MATCH_IRE_ILL. * - * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on - * the ipif this routine might return NULL. - * (Sometimes called as writer though not required by this function.) + * If allocate is not set then we will only inspect the existing IREs; never + * create an IRE_IF_CLONE. This is used on the receive side when we are not + * forwarding. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
*/ ire_t * -ipif_to_ire_v6(const ipif_t *ipif) +ire_route_recursive_impl_v6(ire_t *ire, + const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + int i, j; + in6_addr_t v6nexthop = *nexthop; + ire_t *ires[MAX_IRE_RECURSION]; + uint_t generation; + uint_t generations[MAX_IRE_RECURSION]; + boolean_t need_refrele = B_FALSE; + boolean_t invalidate = B_FALSE; + int prefs[MAX_IRE_RECURSION]; + ill_t *ill = NULL; + + if (setsrcp != NULL) + ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); + if (gwattrp != NULL) + ASSERT(*gwattrp == NULL); + + if (ill_arg != NULL) + match_args |= MATCH_IRE_ILL; /* - * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN - * so that they aren't accidentally returned. However, if the - * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + * We iterate up to three times to resolve a route, even though + * we have four slots in the array. The extra slot is for an + * IRE_IF_CLONE we might need to create. */ - if (IS_UNDER_IPMP(ipif->ipif_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - ASSERT(ipif->ipif_isv6); - if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); - } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { - /* In this case we need to lookup destination address. 
*/ - ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, - &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } else { - ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, - &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } - return (ire); -} - -/* - * Return B_TRUE if a multirt route is resolvable - * (or if no route is resolved yet), B_FALSE otherwise. - * This only works in the global zone. - */ -boolean_t -ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, - ip_stack_t *ipst) -{ - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb; - irb_t *cirb; - int unres_cnt = 0; - boolean_t resolvable = B_FALSE; - - /* Retrieve the first IRE_HOST that matches the destination */ - first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST, - NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No route at all */ - if (first_fire == NULL) { - return (B_TRUE); - } - - firb = first_fire->ire_bucket; - ASSERT(firb); - - /* Retrieve the first IRE_CACHE ire for that destination. */ - first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst); - - /* No resolved route. */ - if (first_cire == NULL) { - ire_refrele(first_fire); - return (B_TRUE); - } - - /* At least one route is resolved. */ - - cirb = first_cire->ire_bucket; - ASSERT(cirb); - - /* Count the number of routes to that dest that are declared. 
*/ - IRB_REFHOLD(firb); - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp)) - continue; - unres_cnt++; - } - IRB_REFRELE(firb); - - - /* Then subtract the number of routes to that dst that are resolved */ - IRB_REFHOLD(cirb); - for (cire = first_cire; cire != NULL; cire = cire->ire_next) { - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) - continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - unres_cnt--; - } - IRB_REFRELE(cirb); - - /* At least one route is unresolved; search for a resolvable route. */ - if (unres_cnt > 0) - resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire, - MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst); - - if (first_fire) - ire_refrele(first_fire); - - if (first_cire) - ire_refrele(first_cire); - - return (resolvable); -} - - -/* - * Return B_TRUE and update *ire_arg and *fire_arg - * if at least one resolvable route is found. - * Return B_FALSE otherwise (all routes are resolved or - * the remaining unresolved routes are all unresolvable). - * This only works in the global zone. - */ -boolean_t -ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - clock_t delta; - ire_t *best_fire = NULL; - ire_t *best_cire = NULL; - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb = NULL; - irb_t *cirb = NULL; - ire_t *gw_ire; - boolean_t already_resolved; - boolean_t res; - in6_addr_t v6dst; - in6_addr_t v6gw; - - ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, " - "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags)); - - ASSERT(ire_arg); - ASSERT(fire_arg); - - /* Not an IRE_HOST ire; give up. 
*/ - if ((*fire_arg == NULL) || - ((*fire_arg)->ire_type != IRE_HOST)) { - return (B_FALSE); - } + i = 0; + while (i < MAX_IRE_RECURSION - 1) { + /* ire_ftable_lookup handles round-robin/ECMP */ + if (ire == NULL) { + ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, + (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + match_args, xmit_hint, ipst, &generation); + } else { + /* Caller passed it; extra hold since we will rele */ + ire_refhold(ire); + if (generationp != NULL) + generation = *generationp; + else + generation = IRE_GENERATION_VERIFY; + } - /* This is the first IRE_HOST ire for that destination. */ - first_fire = *fire_arg; - firb = first_fire->ire_bucket; - ASSERT(firb); + if (ire == NULL) + ire = ire_reject(ipst, B_TRUE); - mutex_enter(&first_fire->ire_lock); - v6dst = first_fire->ire_addr_v6; - mutex_exit(&first_fire->ire_lock); + /* Need to return the ire with RTF_REJECT|BLACKHOLE */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + goto error; - ip2dbg(("ire_multirt_lookup_v6: dst %08x\n", - ntohl(V4_PART_OF_V6(v6dst)))); + ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ - /* - * Retrieve the first IRE_CACHE ire for that destination; - * if we don't find one, no route for that dest is - * resolved yet. - */ - first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst); - if (first_cire) { - cirb = first_cire->ire_bucket; - } - - ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire)); + prefs[i] = ire_pref(ire); + if (i != 0) { + /* + * Don't allow anything unusual past the first + * iteration. 
+ */ + if ((ire->ire_type & + (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || + prefs[i] <= prefs[i-1]) { + ire_refrele(ire); + ire = ire_reject(ipst, B_TRUE); + goto error; + } + } + /* We have a usable IRE */ + ires[i] = ire; + generations[i] = generation; + i++; + + /* The first RTF_SETSRC address is passed back if setsrcp */ + if ((ire->ire_flags & RTF_SETSRC) && + setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *setsrcp = ire->ire_setsrc_addr_v6; + } - /* - * Search for a resolvable route, giving the top priority - * to routes that can be resolved without any call to the resolver. - */ - IRB_REFHOLD(firb); + /* The first ire_gw_secattr is passed back if gwattrp */ + if (ire->ire_gw_secattr != NULL && + gwattrp != NULL && *gwattrp == NULL) + *gwattrp = ire->ire_gw_secattr; - if (!IN6_IS_ADDR_MULTICAST(&v6dst)) { /* - * For all multiroute IRE_HOST ires for that destination, - * check if the route via the IRE_HOST's gateway is - * resolved yet. + * Check if we have a short-cut pointer to an IRE for this + * destination, and that the cached dependency isn't stale. + * In that case we've rejoined an existing tree towards a + * parent, thus we don't need to continue the loop to + * discover the rest of the tree. 
*/ - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - ip2dbg(("ire_multirt_lookup_v6: fire %p, " - "ire_addr %08x, ire_gateway_addr %08x\n", - (void *)fire, - ntohl(V4_PART_OF_V6(fire->ire_addr_v6)), - ntohl(V4_PART_OF_V6(v6gw)))); + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + ire = NULL; + goto done; + } + mutex_exit(&ire->ire_lock); - already_resolved = B_FALSE; + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) { + ire = NULL; + goto done; + } - if (first_cire) { - ASSERT(cirb); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + /* + * For an IRE_INTERFACE we create an IRE_IF_CLONE for this + * particular destination + */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *clone; - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Check if the IRE_CACHE's gateway - * matches the IRE_HOST's gateway. 
- */ - if (IN6_ARE_ADDR_EQUAL( - &cire->ire_gateway_addr_v6, - &v6gw)) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } + ASSERT(ire->ire_masklen != IPV6_ABITS); /* - * This route is already resolved; - * proceed with next one. + * In the case of ip_input and ILLF_FORWARDING not + * being set, and in the case of RTM_GET, + * there is no point in allocating + * an IRE_IF_CLONE. We return the IRE_INTERFACE. + * Note that !allocate can result in a ire_dep_parent + * which is IRE_IF_* without an IRE_IF_CLONE. + * We recover from that when we need to send packets + * by ensuring that the generations become + * IRE_GENERATION_VERIFY in this case. */ - if (already_resolved) { - ip2dbg(("ire_multirt_lookup_v6: found cire %p, " - "already resolved\n", (void *)cire)); - continue; + if (!allocate) { + invalidate = B_TRUE; + ire = NULL; + goto done; } - /* - * The route is unresolved; is it actually - * resolvable, i.e. is there a cache or a resolver - * for the gateway? - */ - gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL, - ALL_ZONES, tsl, MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR, ipst); - - ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n", - (void *)gw_ire)); - - /* - * This route can be resolved without any call to the - * resolver; if the MULTIRT_CACHEGW flag is set, - * give the top priority to this ire and exit the - * loop. - * This occurs when an resolver reply is processed - * through ip_wput_nondata() - */ - if ((flags & MULTIRT_CACHEGW) && - (gw_ire != NULL) && - (gw_ire->ire_type & IRE_CACHETABLE)) { + clone = ire_create_if_clone(ire, &v6nexthop, + &generation); + if (clone == NULL) { /* - * Release the resolver associated to the - * previous candidate best ire, if any. + * Temporary failure - no memory. + * Don't want caller to cache IRE_NOROUTE. 
*/ - if (best_cire) { - ire_refrele(best_cire); - ASSERT(best_fire); - } - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6: found top prio " - "best_fire %p, best_cire %p\n", - (void *)best_fire, (void *)best_cire)); - break; + invalidate = B_TRUE; + ire = ire_blackhole(ipst, B_TRUE); + goto error; } - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take that - * route into account only if this time interval - * exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. + * Make clone next to last entry and the + * IRE_INTERFACE the last in the dependency + * chain since the clone depends on the + * IRE_INTERFACE. */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); + ASSERT(i >= 1); + ASSERT(i < MAX_IRE_RECURSION); - ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, " - "res %d\n", - (void *)fire, delta, res)); - - if (res) { - /* - * A resolver exists for the gateway: save - * the current IRE_HOST ire as a candidate - * best ire. If we later discover that a - * top priority ire exists (i.e. no need to - * call the resolver), then this new ire - * will be preferred to the current one. - */ - if (gw_ire != NULL) { - if (best_fire == NULL) { - ASSERT(best_cire == NULL); - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6:" - "found candidate " - "best_fire %p, " - "best_cire %p\n", - (void *)best_fire, - (void *)best_cire)); - - /* - * If MULTIRT_CACHEGW is not - * set, we ignore the top - * priority ires that can - * be resolved without any - * call to the resolver; - * In that case, there is - * actually no need - * to continue the loop. 
- */ - if (!(flags & - MULTIRT_CACHEGW)) { - break; - } - continue; - } - } else { - /* - * No resolver for the gateway: the - * route is not resolvable. - * If the MULTIRT_SETSTAMP flag is - * set, we stamp the IRE_HOST ire, - * so we will not select it again - * during this resolution interval. - */ - if (flags & MULTIRT_SETSTAMP) - fire->ire_last_used_time = - lbolt; - } - } + ires[i] = ires[i-1]; + generations[i] = generations[i-1]; + ires[i-1] = clone; + generations[i-1] = generation; + i++; - if (gw_ire != NULL) - ire_refrele(gw_ire); + ire = NULL; + goto done; } - } else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */ - for (fire = first_fire; - fire != NULL; - fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - already_resolved = B_FALSE; - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No resolver for the gateway; we skip this ire. */ - if (gw_ire == NULL) { - continue; - } + /* + * We only match on the type and optionally ILL when + * recursing. The type match is used by some callers + * to exclude certain types (such as IRE_IF_CLONE or + * IRE_LOCAL|IRE_LOOPBACK). + */ + match_args &= MATCH_IRE_TYPE; + v6nexthop = ire->ire_gateway_addr_v6; + if (ill == NULL && ire->ire_ill != NULL) { + ill = ire->ire_ill; + need_refrele = B_TRUE; + ill_refhold(ill); + match_args |= MATCH_IRE_ILL; + } - if (first_cire) { + ire = NULL; + } + ASSERT(ire == NULL); + ire = ire_reject(ipst, B_TRUE); - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. 
- */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - IRE_MARK_CONDEMNED) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Cache entries are linked to the - * parent routes using the parent handle - * (ire_phandle). If no cache entry has - * the same handle as fire, fire is - * still unresolved. - */ - ASSERT(cire->ire_phandle != 0); - if (cire->ire_phandle == - fire->ire_phandle) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } +error: + ASSERT(ire != NULL); + if (need_refrele) + ill_refrele(ill); - /* - * This route is already resolved; proceed with - * next one. - */ - if (already_resolved) { - ire_refrele(gw_ire); - continue; - } + /* + * In the case of MULTIRT we want to try a different IRE the next + * time. We let the next packet retry in that case. + */ + if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) + (void) ire_no_good(ires[0]); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take - * that route into account only if this time - * interval exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. - */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, " - "flags %04x, res %d\n", - (void *)fire, delta, flags, res)); - - if (res) { - if (best_cire) { - /* - * Release the resolver associated - * to the preceding candidate best - * ire, if any. 
- */ - ire_refrele(best_cire); - ASSERT(best_fire); - } - best_fire = fire; - best_cire = gw_ire; - continue; - } +cleanup: + /* cleanup ires[i] */ + ire_dep_unbuild(ires, i); + for (j = 0; j < i; j++) + ire_refrele(ires[j]); - ire_refrele(gw_ire); - } - } + ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); + /* + * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the + * ip_select_route since the reject or lack of memory might be gone. + */ + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; + return (ire); - if (best_fire) { - IRE_REFHOLD(best_fire); +done: + ASSERT(ire == NULL); + if (need_refrele) + ill_refrele(ill); + + /* Build dependencies */ + if (!ire_dep_build(ires, generations, i)) { + /* Something in chain was condemned; tear it apart */ + ire = ire_blackhole(ipst, B_TRUE); + goto cleanup; } - IRB_REFRELE(firb); - /* Release the first IRE_CACHE we initially looked up, if any. */ - if (first_cire) - ire_refrele(first_cire); - - /* Found a resolvable route. */ - if (best_fire) { - ASSERT(best_cire); - - if (*fire_arg) - ire_refrele(*fire_arg); - if (*ire_arg) - ire_refrele(*ire_arg); + /* + * Release all refholds except the one for ires[0] that we + * will return to the caller. + */ + for (j = 1; j < i; j++) + ire_refrele(ires[j]); + if (invalidate) { /* - * Update the passed arguments with the - * resolvable multirt route we found + * Since we needed to allocate but couldn't we need to make + * sure that the dependency chain is rebuilt the next time. */ - *fire_arg = best_fire; - *ire_arg = best_cire; - - ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, " - "*fire_arg %p, *ire_arg %p\n", - (void *)best_fire, (void *)best_cire)); - - return (B_TRUE); + ire_dep_invalidate_generations(ires[0]); + generation = IRE_GENERATION_VERIFY; + } else { + /* + * IREs can have been added or deleted while we did the + * recursive lookup and we can't catch those until we've built + * the dependencies. 
We verify the stored + * ire_dep_parent_generation to catch any such changes and + * return IRE_GENERATION_VERIFY (which will cause + * ip_select_route to be called again so we can redo the + * recursive lookup next time we send a packet. + */ + generation = ire_dep_validate_generations(ires[0]); + if (generations[0] != ires[0]->ire_generation) { + /* Something changed at the top */ + generation = IRE_GENERATION_VERIFY; + } } + if (generationp != NULL) + *generationp = generation; - ASSERT(best_cire == NULL); - - ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, " - "*ire_arg %p\n", - (void *)*fire_arg, (void *)*ire_arg)); - - /* No resolvable route. */ - return (B_FALSE); + return (ires[0]); } - -/* - * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp' - * that goes through 'ipif'. As a fallback, a route that goes through - * ipif->ipif_ill can be returned. - */ ire_t * -ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp) +ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ire_t *save_ire = NULL; - ire_t *gw_ire; - irb_t *irb; - in6_addr_t v6gw; - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_DEFAULT, ipst); - - if (ire == NULL) - return (NULL); - - irb = ire->ire_bucket; - ASSERT(irb); - - IRB_REFHOLD(irb); - ire_refrele(ire); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) || - (ipif->ipif_zoneid != ire->ire_zoneid && - ire->ire_zoneid != ALL_ZONES)) { - continue; - } - - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - 
mutex_enter(&ire->ire_lock); - v6gw = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, match_flags, ipst); - - if (gw_ire != NULL) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - if (gw_ire->ire_ipif == ipif) { - ire_refrele(gw_ire); - - IRB_REFRELE(irb); - return (ire); - } - ire_refrele(gw_ire); - save_ire = ire; - } - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - if (ire->ire_ipif == ipif) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - - IRB_REFRELE(irb); - return (ire); - } - break; - } - } - IRB_REFRELE(irb); - - return (save_ire); + return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, + zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, + gwattrp, generationp)); } /* - * This is the implementation of the IPv6 IRE cache lookup procedure. - * Separating the interface from the implementation allows additional - * flexibility when specifying search criteria. + * Recursively look for a route to the destination. + * We only handle a destination match here, yet we have the same arguments + * as the full match to allow function pointers to select between the two. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
*/ -static ire_t * -ip6_ctable_lookup_impl(ire_ctable_args_t *margs) +ire_t * +ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate, + uint32_t xmit_hint, ip_stack_t *ipst) { - irb_t *irb_ptr; - ire_t *ire; - ip_stack_t *ipst = margs->ict_ipst; + ire_t *ire; + ire_t *ire1; + uint_t generation; - if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && - (margs->ict_ipif == NULL)) { - return (NULL); - } + /* ire_ftable_lookup handles round-robin/ECMP */ + ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, + &generation); + ASSERT(ire != NULL); - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - *((in6_addr_t *)(margs->ict_addr)), - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, (in6_addr_t *)margs->ict_addr, - &ire->ire_mask_v6, (in6_addr_t *)margs->ict_gateway, - margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, - margs->ict_tsl, margs->ict_flags)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) + return (ire); + + /* + * If the IRE has a current cached parent we know that the whole + * parent chain is current, hence we don't need to discover and + * build any dependencies by doing a recursive lookup. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + return (ire); } + mutex_exit(&ire->ire_lock); - rw_exit(&irb_ptr->irb_lock); - return (NULL); + /* + * Fallback to loop in the normal code starting with the ire + * we found. 
Normally this would return the same ire. + */ + ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, + &generation); + ire_refrele(ire); + return (ire1); } diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c new file mode 100644 index 0000000000..3e06050781 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -0,0 +1,1315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +int +ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ip6_t *ip6h; + in6_addr_t firsthop; /* In IP header */ + in6_addr_t dst; /* End of source route, or ip6_dst if none */ + ire_t *ire; + in6_addr_t setsrc; + int error; + ill_t *ill = NULL; + dce_t *dce = NULL; + nce_t *nce; + 
iaflags_t ixaflags = ixa->ixa_flags; + ip_stack_t *ipst = ixa->ixa_ipst; + uint8_t *nexthdrp; + boolean_t repeat = B_FALSE; + boolean_t multirt = B_FALSE; + uint_t ifindex; + + ip6h = (ip6_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + + ASSERT(ixa->ixa_nce == NULL); + + ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + ASSERT(ixa->ixa_pktlen == msgdsize(mp)); + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length, + &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); + return (EINVAL); + } + ixa->ixa_protocol = *nexthdrp; + + /* + * Assumes that source routed packets have already been massaged by + * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next + * hop in the source route. The final destination is used for IPsec + * policy and DCE lookup. + */ + firsthop = ip6h->ip6_dst; + dst = ip_get_dst_v6(ip6h, mp, NULL); + +repeat_ire: + error = 0; + setsrc = ipv6_all_zeros; + ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); + goto done; + } + + if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { + /* ire_ill might be NULL hence need to skip some code */ + if (ixaflags & IXAF_SET_SOURCE) + ip6h->ip6_src = ipv6_loopback; + ixa->ixa_fragsize = IP_MAXPACKET; + ire->ire_ob_pkt_count++; + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + /* No dce yet; use default one */ + error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, + &ipst->ips_dce_default->dce_ident); + goto done; + } + + /* Note that ip6_dst is only used for IRE_MULTICAST */ + nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); + 
if (nce == NULL) { + /* Allocation failure? */ + ip_drop_output("ire_to_nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE); + nce_refrele(nce); + if (nce1 == NULL) { + if (!repeat) { + /* Try finding a better IRE */ + repeat = B_TRUE; + ire_refrele(ire); + goto repeat_ire; + } + /* Tried twice - drop packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + nce = nce1; + } + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v6 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + ASSERT(ixa->ixa_nce == NULL); + ixa->ixa_nce = nce; + + /* + * Check for a dce_t with a path mtu. + */ + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&dst)) + ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex; + + dce = dce_lookup_v6(&dst, ifindex, ipst, NULL); + ASSERT(dce != NULL); + + if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { + ixa->ixa_fragsize = IPV6_MIN_MTU; + } else if (dce->dce_flags & DCEF_PMTU) { + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { + /* + * Older than 20 minutes. Drop the path MTU information. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else { + uint_t fragsize; + + fragsize = ip_get_base_mtu(nce->nce_ill, ire); + if (fragsize > dce->dce_pmtu) + fragsize = dce->dce_pmtu; + ixa->ixa_fragsize = fragsize; + } + } else { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } + + /* + * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp + * interface for source address selection. + */ + ill = ire_nexthop_ill(ire); + + if (ixaflags & IXAF_SET_SOURCE) { + in6_addr_t src; + + /* + * We use the final destination to get + * correct selection for source routed packets + */ + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src = ipv6_loopback; + error = 0; + } else { + error = ip_select_source_v6(ill, &setsrc, &dst, + ixa->ixa_zoneid, ipst, B_FALSE, + ixa->ixa_src_preferences, &src, NULL, NULL); + } + if (error != 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no source", + mp, ill); + freemsg(mp); + goto done; + } + ip6h->ip6_src = src; + } else if (ixaflags & IXAF_VERIFY_SOURCE) { + /* Check if the IP source is assigned to the host. */ + if (!ip_verify_src(mp, ixa, NULL)) { + /* Don't send a packet with a source that isn't ours */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - invalid source", + mp, ill); + freemsg(mp); + error = EADDRNOTAVAIL; + goto done; + } + } + + /* + * Check against global IPsec policy to set the AH/ESP attributes. + * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 
+ */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa); + if (mp == NULL) { + /* MIB and ip_drop_packet already done */ + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } + + if (ill != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + } + + /* + * We update the statistics on the most specific IRE i.e., the first + * one we found. + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. + */ + ire->ire_ob_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v6 - if RTF_MULTIRT + * ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v6 - for IRE_MULTICAST + * ire_send_wire_v6 - for the rest. + */ + error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident); +done: + ire_refrele(ire); + if (dce != NULL) + dce_refrele(dce); + if (ill != NULL) + ill_refrele(ill); + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = NULL; + return (error); +} + +/* + * ire_sendfn() functions. + * These functions use the following xmit_attr: + * - ixa_fragsize - read to determine whether or not to fragment + * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec + * - ixa_ipsec_* are used inside IPsec + * - IXAF_LOOPBACK_COPY - for multicast + */ + + +/* + * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK + * + * The checks for restrict_interzone_loopback are done in ire_route_recursive. 
+ */ +/* ARGSUSED4 */ +int +ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + uint_t pktlen = ixa->ixa_pktlen; + + /* + * No fragmentation, no nce, and no application of IPsec. + * + * + * Note different order between IP provider and FW_HOOKS than in + * send_wire case. + */ + + /* + * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the + * send probe, but not the receive probe. + */ + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 1); + + DTRACE_PROBE4(ip6__loopback__out__start, + ill_t *, NULL, ill_t *, ill, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip6_loopback_out_event, + ipst->ips_ipv6firewall_loopback_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ip6h = (ip6_t *)mp->b_rptr; + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); + } + + /* Handle lo0 stats */ + ipst->ips_loopback_packets++; + + /* + * Update output mib stats. Note that we can't move into the icmp + * sender (icmp_output etc) since they don't know the ill and the + * stats are per ill. + */ + if (ixa->ixa_protocol == IPPROTO_ICMPV6) { + icmp6_t *icmp6; + + icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); + icmp_update_out_mib_v6(ill, icmp6); + } + + DTRACE_PROBE4(ip6__loopback__in__start, + ill_t *, ill, ill_t *, NULL, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip6_loopback_in_event, + ipst->ips_ipv6firewall_loopback_in, + ill, NULL, ip6h, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. 
+ */ + /* Length could be different */ + ip6h = (ip6_t *)mp->b_rptr; + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 1); + + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + iras.ira_pktlen = pktlen; + + ire->ire_ib_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); + + /* Destined to ire_zoneid - use that for fanout */ + iras.ira_zoneid = ire->ire_zoneid; + + if (is_system_labeled()) { + iras.ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. We don't expect this to ever fail for + * loopback packets, so we silently drop the packet should it + * fail. + */ + if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("tsol_get_pkt_label", mp, ill); + freemsg(mp); + return (0); + } + ASSERT(iras.ira_tsl != NULL); + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ip6h = (ip6_t *)mp->b_rptr; + } + + ip_fanout_v6(mp, ip6h, &iras); + + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (0); +} + +static void +multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* Limit the TTL on multirt packets. Do this even if IPV6_HOPLIMIT */ + if (ire->ire_type & IRE_MULTICAST) { + if (ip6h->ip6_hops > 1) { + ip2dbg(("ire_send_multirt_v6: forcing multicast " + "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops)); + ip6h->ip6_hops = 1; + } + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } else if ((ipst->ips_ip_multirt_ttl > 0) && + (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl)) { + ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; + /* + * Need to ensure we don't increase the ttl should we go through + * ire_send_multicast. 
+ */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } + + /* For IPv6 this also needs to insert a fragment header */ + ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR; +} + +/* + * ire_sendfn for IRE_MULTICAST + * + * Note that we do path MTU discovery by default for IPv6 multicast. But + * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY + * only connected sockets get this by default. + */ +int +ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + iaflags_t ixaflags = ixa->ixa_flags; + + /* + * The IRE_MULTICAST is the same whether or not multirt is in use. + * Hence we need special-case code. + */ + if (ixaflags & IXAF_MULTIRT_MULTICAST) + multirt_check_v6(ire, ip6h, ixa); + + /* + * Check if anything in ip_input_v6 wants a copy of the transmitted + * packet (after IPsec and fragmentation) + * + * 1. Multicast routers always need a copy unless SO_DONTROUTE is set + * RSVP and the rsvp daemon is an example of a + * protocol and user level process that + * handles it's own routing. Hence, it uses the + * SO_DONTROUTE option to accomplish this. + * 2. If the sender has set IP_MULTICAST_LOOP, then we just + * check whether there are any receivers for the group on the ill + * (ignoring the zoneid). + * 3. If IP_MULTICAST_LOOP is not set, then we check if there are + * any members in other shared-IP zones. + * If such members exist, then we indicate that the sending zone + * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP + * behavior. + * + * When we loopback we skip hardware checksum to make sure loopback + * copy is checksumed. + * + * Note that ire_ill is the upper in the case of IPMP. 
+ */ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); + if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && + !(ixaflags & IXAF_DONTROUTE)) { + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ixaflags & IXAF_MULTICAST_LOOP) { + /* + * If this zone or any other zone has members then loopback + * a copy. + */ + if (ill_hasmembers_v6(ill, &ip6h->ip6_dst)) + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ipst->ips_netstack->netstack_numzones > 1) { + /* + * This zone should not have a copy. But there are some other + * zones which might have members. + */ + if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst, + ixa->ixa_zoneid)) { + ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; + ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } + } + + /* + * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl, + * force the ttl to the IP_MULTICAST_TTL value + */ + if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { + ip6h->ip6_hops = ixa->ixa_multicast_ttl; + } + + return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_MULTIRT + */ +int +ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + + multirt_check_v6(ire, ip6h, ixa); + + if (ire->ire_type & IRE_MULTICAST) + return (ire_send_multicast_v6(ire, mp, ip6h, ixa, identp)); + else + return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE + */ +/* ARGSUSED4 */ +int +ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill; + ip_recv_attr_t iras; + boolean_t dummy; + + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); + + if (ire->ire_type & IRE_NOROUTE) { + /* A lack of a route as 
opposed to RTF_REJECT|BLACKHOLE */ + ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, + RTA_DST, ipst); + } + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); + freemsg(mp); + /* No error even for local senders - silent blackhole */ + return (0); + } + ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); + + /* + * We need an ill_t for the ip_recv_attr_t even though this packet + * was never received and icmp_unreachable doesn't currently use + * ira_ill. + */ + ill = ill_lookup_on_name("lo0", B_FALSE, + !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); + if (ill == NULL) { + freemsg(mp); + return (EHOSTUNREACH); + } + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + + ill_refrele(ill); + return (EHOSTUNREACH); +} + +/* + * Calculate a checksum ignoring any hardware capabilities + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. 
+ */ +static boolean_t +ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + +#define iphs ((uint16_t *)ip6h) + + /* Just in case it contained garbage */ + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + + /* + * Calculate ULP checksum + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + return (B_TRUE); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* + * icmp has placed length and routing + * header adjustment in the checksum field. + */ + cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + + ixa->ixa_raw_cksum_offset); + cksum = htons(protocol); + } else if (protocol == IPPROTO_ICMPV6) { + cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + } else { + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. 
+ */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + + iphs[8] + iphs[9] + iphs[10] + iphs[11] + + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + + /* + * For UDP/IPv6 a zero UDP checksum is not allowed. + * Change to 0xffff + */ + if (protocol == IPPROTO_UDP && cksum == 0) + *cksump = ~cksum; + else + *cksump = cksum; + + IP6_STAT(ipst, ip6_out_sw_cksum); + IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen); + + /* No IP header checksum for IPv6 */ + + return (B_TRUE); +#undef iphs +} + +/* There are drivers that can't do partial checksum for ICMPv6 */ +int nxge_cksum_workaround = 1; + +/* + * Calculate the ULP checksum - try to use hardware. + * In the case of MULTIRT or multicast the + * IXAF_NO_HW_CKSUM is set in which case we use software. + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. + */ +static boolean_t +ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, + ip_xmit_attr_t *ixa, ill_t *ill) +{ + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint16_t hck_flags; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + +#define iphs ((uint16_t *)ip6h) + + if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); + } + + /* + * Calculate ULP checksum. Note that we don't use cksump and cksum + * if the ill has FULL support. 
+ */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* + * icmp has placed length and routing + * header adjustment in the checksum field. + */ + cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + + ixa->ixa_raw_cksum_offset); + cksum = htons(protocol); + } else if (protocol == IPPROTO_ICMPV6) { + cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + } else { + ip_hdr_cksum: + /* No IP header checksum for IPv6 */ + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * Underlying interface supports hardware checksum offload for + * the payload; leave the payload checksum for the hardware to + * calculate. N.B: We only need to set up checksum info on the + * first mblk. + */ + hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; + + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + if (hck_flags & HCKSUM_INET_FULL_V6) { + /* + * Hardware calculates pseudo-header, header and the + * payload checksums, so clear the checksum field in + * the protocol header. 
+ */ + *cksump = 0; + DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; + return (B_TRUE); + } + if (((hck_flags) & HCKSUM_INET_PARTIAL) && + (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) { + /* + * Partial checksum offload has been enabled. Fill + * the checksum field in the protocol header with the + * pseudo-header checksum value. + * + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + + iphs[8] + iphs[9] + iphs[10] + iphs[11] + + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum += *(cksump); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Offsets are relative to beginning of IP header. + */ + DB_CKSUMSTART(mp) = ip_hdr_length; + DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h; + DB_CKSUMEND(mp) = pktlen; + DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; + return (B_TRUE); + } + /* Hardware capabilities include neither full nor partial IPv6 */ + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); +#undef iphs +} + +/* + * ire_sendfn for offlink and onlink destinations. + * Also called from the multicast, and multirt send functions. + * + * Assumes that the caller has a hold on the ire. + * + * This function doesn't care if the IRE just became condemned since that + * can happen at any time. + */ +/* ARGSUSED */ +int +ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ip6_t *ip6h = (ip6_t *)iph_arg; + iaflags_t ixaflags = ixa->ixa_flags; + ill_t *ill; + uint32_t pktlen = ixa->ixa_pktlen; + + ASSERT(ixa->ixa_nce != NULL); + ill = ixa->ixa_nce->nce_ill; + + /* + * Update output mib stats. 
Note that we can't move into the icmp + * sender (icmp_output etc) since they don't know the ill and the + * stats are per ill. + * + * With IPMP we record the stats on the upper ill. + */ + if (ixa->ixa_protocol == IPPROTO_ICMPV6) { + icmp6_t *icmp6; + + icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); + icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill, + icmp6); + } + + if (ixaflags & IXAF_DONTROUTE) + ip6h->ip6_hops = 1; + + /* + * This might set b_band, thus the IPsec and fragmentation + * code in IP ensures that b_band is updated in the first mblk. + */ + if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return (0); /* Might just be delayed */ + } + } + + /* + * To handle IPsec/iptun's labeling needs we need to tag packets + * while we still have ixa_tsl + */ + if (is_system_labeled() && ixa->ixa_tsl != NULL && + (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || + ill->ill_mactype == DL_IPV6)) { + cred_t *newcr; + + newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, + KM_NOSLEEP); + if (newcr == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - newcr", + mp, ill); + freemsg(mp); + return (ENOBUFS); + } + mblk_setcred(mp, newcr, NOPID); + crfree(newcr); /* mblk_setcred did its own crhold */ + } + + /* + * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a + * fragment header without fragmenting. CGTP on the receiver will + * filter duplicates on the ident field. 
+ */ + if (pktlen > ixa->ixa_fragsize || + (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) { + uint32_t ident; + + if (ixaflags & IXAF_IPSEC_SECURE) + pktlen += ipsec_out_extra_length(ixa); + + if (pktlen > IP_MAXPACKET) + return (EMSGSIZE); + + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* + * Compute ULP checksum using software + */ + if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + /* Avoid checksum again below if we only add fraghdr */ + ixaflags &= ~IXAF_SET_ULP_CKSUM; + } + + /* + * If we need a fragment header, pick the ident and insert + * the header before IPsec to we have a place to store + * the ident value. + */ + if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) || + pktlen > ixa->ixa_fragsize) { + /* + * If this packet would generate a icmp_frag_needed + * message, we need to handle it before we do the IPsec + * processing. Otherwise, we need to strip the IPsec + * headers before we send up the message to the ULPs + * which becomes messy and difficult. + */ + if ((pktlen > ixa->ixa_fragsize) && + (ixaflags & IXAF_DONTFRAG)) { + /* Generate ICMP and return error */ + ip_recv_attr_t iras; + + DTRACE_PROBE4(ip6__fragsize__fail, + uint_t, pktlen, uint_t, ixa->ixa_fragsize, + uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill); + icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE, + &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (EMSGSIZE); + } + DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + /* + * Assign an ident value for this packet. 
There could + * be other threads targeting the same destination, so + * we have to arrange for a atomic increment. + * Normally ixa_extra_ident is 0, but in the case of + * LSO it will be the number of TCP segments that the + * driver/hardware will extraly construct. + * + * Note that cl_inet_ipident has only been used for + * IPv4. We don't use it here. + */ + ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident + + 1); +#ifndef _BIG_ENDIAN + ident = htonl(ident); +#endif + ixa->ixa_ident = ident; /* In case we do IPsec */ + } + if (ixaflags & IXAF_IPSEC_SECURE) { + /* + * Pass in sufficient information so that + * IPsec can determine whether to fragment, and + * which function to call after fragmentation. + */ + return (ipsec_out_process(mp, ixa)); + } + + mp = ip_fraghdr_add_v6(mp, ident, ixa); + if (mp == NULL) { + /* MIB and ip_drop_output already done */ + return (ENOMEM); + } + ASSERT(pktlen == ixa->ixa_pktlen); + pktlen += sizeof (ip6_frag_t); + + if (pktlen > ixa->ixa_fragsize) { + return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags, + pktlen, ixa->ixa_fragsize, + ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn, + &ixa->ixa_cookie)); + } + } + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* Compute ULP checksum and IP header checksum */ + /* An IS_UNDER_IPMP ill is ok here */ + if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } + return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, + pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); +} + +/* + * Post fragmentation function for RTF_MULTIRT routes. + * Since IRE_MULTICASTs might have RTF_MULTIRT, this function + * checks IXAF_LOOPBACK_COPY. + * + * If no packet is sent due to failures then we return an errno, but if at + * least one succeeded we return zero. 
+ */ +int +ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + irb_t *irb; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ire_t *ire; + ire_t *ire1; + mblk_t *mp1; + nce_t *nce1; + ill_t *ill = nce->nce_ill; + ill_t *ill1; + ip_stack_t *ipst = ill->ill_ipst; + int error = 0; + int num_sent = 0; + int err; + uint_t ire_type; + in6_addr_t nexthop; + + ASSERT(!(ixaflags & IXAF_IS_IPV4)); + + /* Check for IXAF_LOOPBACK_COPY */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send + * a copy to each one. + * Use the nce (nexthop) and ip6_dst to find the ire. + * + * MULTIRT is not designed to work with shared-IP zones thus we don't + * need to pass a zoneid or a label to the IRE lookup. 
+ */ + if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) { + /* Broadcast and multicast case */ + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL, + ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + /* Unicast case */ + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr, + 0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); + } + + if (ire == NULL || + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_flags & RTF_MULTIRT)) { + /* Drop */ + ip_drop_output("ip_postfrag_multirt didn't find route", + mp, nce->nce_ill); + if (ire != NULL) + ire_refrele(ire); + return (ENETUNREACH); + } + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + if (IRE_IS_CONDEMNED(ire1) || + !(ire1->ire_flags & RTF_MULTIRT)) + continue; + + /* Note: When IPv6 uses radix tree we don't need this check */ + if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) + continue; + + /* Do the ire argument one after the loop */ + if (ire1 == ire) + continue; + + ill1 = ire_nexthop_ill(ire1); + if (ill1 == NULL) { + /* + * This ire might not have been picked by + * ire_route_recursive, in which case ire_dep might + * not have been setup yet. + * We kick ire_route_recursive to try to resolve + * starting at ire1. 
+ */ + ire_t *ire2; + + ire2 = ire_route_recursive_impl_v6(ire1, + &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill, + ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + B_TRUE, 0, ipst, NULL, NULL, NULL); + if (ire2 != NULL) + ire_refrele(ire2); + ill1 = ire_nexthop_ill(ire1); + } + if (ill1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no ill", + mp, ill); + error = ENETUNREACH; + continue; + } + /* Pick the addr and type to use for ndp_nce_init */ + if (nce->nce_common->ncec_flags & NCE_F_MCAST) { + ire_type = IRE_MULTICAST; + nexthop = ip6h->ip6_dst; + } else { + ire_type = ire1->ire_type; /* Doesn't matter */ + nexthop = ire1->ire_gateway_addr_v6; + } + + /* If IPMP meta or under, then we just drop */ + if (ill1->ill_grp != NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - IPMP", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + + nce1 = ndp_nce_init(ill1, &nexthop, ire_type); + if (nce1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, ill1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill1); + nce_refrele(nce1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + /* Preserve HW checksum for this copy */ + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + + ire1->ire_ob_pkt_count++; + err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, + 0, ixacookie); + if (err == 0) + num_sent++; + else + error = err; + nce_refrele(nce1); + ill_refrele(ill1); + } + irb_refrele(irb); + ire_refrele(ire); + /* Finally, the main one */ + err = ip_xmit(mp, nce, ixaflags, 
pkt_len, xmit_hint, szone, 0, + ixacookie); + if (err == 0) + num_sent++; + else + error = err; + if (num_sent > 0) + return (0); + else + return (error); +} diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c index dcf429c8ba..38b43cdf60 100644 --- a/usr/src/uts/common/inet/ip/ip6_rts.c +++ b/usr/src/uts/common/inet/ip/ip6_rts.c @@ -80,8 +80,8 @@ void rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, const in6_addr_t *mask, const in6_addr_t *gateway, const in6_addr_t *src_addr, const in6_addr_t *brd_addr, - const in6_addr_t *author, const ipif_t *ipif, mblk_t *mp, - uint_t sacnt, const tsol_gc_t *gc) + const in6_addr_t *author, const in6_addr_t *ifaddr, const ill_t *ill, + mblk_t *mp, const tsol_gc_t *gc) { rt_msghdr_t *rtm; sin6_t *sin6; @@ -90,7 +90,6 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, int i; ASSERT(mp != NULL); - ASSERT(sacnt == 0 || gc != NULL); /* * First find the type of the message * and its length. @@ -100,7 +99,7 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, * Now find the size of the data * that follows the message header. */ - data_size = rts_data_msg_size(rtm_addrs, AF_INET6, sacnt); + data_size = rts_data_msg_size(rtm_addrs, AF_INET6, gc != NULL ? 
1 : 0); rtm = (rt_msghdr_t *)mp->b_rptr; mp->b_wptr = &mp->b_rptr[header_size]; @@ -125,13 +124,17 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, cp += sizeof (sin6_t); break; case RTA_IFA: + sin6->sin6_addr = *ifaddr; + sin6->sin6_family = AF_INET6; + cp += sizeof (sin6_t); + break; case RTA_SRC: sin6->sin6_addr = *src_addr; sin6->sin6_family = AF_INET6; cp += sizeof (sin6_t); break; case RTA_IFP: - cp += ill_dls_info((struct sockaddr_dl *)cp, ipif); + cp += ill_dls_info((struct sockaddr_dl *)cp, ill); break; case RTA_AUTHOR: sin6->sin6_addr = *author; @@ -154,24 +157,20 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, rtm_ext_t *rtm_ext; struct rtsa_s *rp_dst; tsol_rtsecattr_t *rsap; - int i; ASSERT(gc->gc_grp != NULL); ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock)); - ASSERT(sacnt > 0); rtm_ext = (rtm_ext_t *)cp; rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR; - rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt); + rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1); rsap = (tsol_rtsecattr_t *)(rtm_ext + 1); - rsap->rtsa_cnt = sacnt; + rsap->rtsa_cnt = 1; rp_dst = rsap->rtsa_attr; - for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) { - ASSERT(gc->gc_db != NULL); - bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); - } + ASSERT(gc->gc_db != NULL); + bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); cp = (uchar_t *)rp_dst; } @@ -208,7 +207,7 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr, if (mp == NULL) return; rts_fill_msg_v6(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, - &ipv6_all_zeros, author, NULL, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, author, NULL, mp, NULL); rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_flags = flags; rtm->rtm_errno = error; diff --git a/usr/src/uts/common/inet/ip/ip_arp.c b/usr/src/uts/common/inet/ip/ip_arp.c new file mode 100644 index 0000000000..489d59dbf6 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_arp.c @@ -0,0 +1,2468 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/ip_arp.h> +#include <inet/ip_ndp.h> +#include <net/if_arp.h> +#include <netinet/if_ether.h> +#include <sys/strsubr.h> +#include <inet/ip6.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <sys/dlpi.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/sdt.h> +#include <inet/mi.h> +#include <inet/arp.h> +#include <inet/ipdrop.h> +#include <sys/sockio.h> +#include <inet/ip_impl.h> +#include <sys/policy.h> + +#define ARL_LL_ADDR_OFFSET(arl) (((arl)->arl_sap_length) < 0 ? \ + (sizeof (dl_unitdata_req_t)) : \ + ((sizeof (dl_unitdata_req_t)) + (ABS((arl)->arl_sap_length)))) + +/* + * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK + * doesn't quite do it for us. 
+ */ +typedef struct arp_m_s { + t_uscalar_t arp_mac_type; + uint32_t arp_mac_arp_hw_type; + t_scalar_t arp_mac_sap_length; + uint32_t arp_mac_hw_addr_length; +} arp_m_t; + +static int arp_close(queue_t *, int); +static void arp_rput(queue_t *, mblk_t *); +static void arp_wput(queue_t *, mblk_t *); +static arp_m_t *arp_m_lookup(t_uscalar_t mac_type); +static void arp_notify(ipaddr_t, mblk_t *, uint32_t, ip_recv_attr_t *, + ncec_t *); +static int arp_output(ill_t *, uint32_t, const uchar_t *, const uchar_t *, + const uchar_t *, const uchar_t *, uchar_t *); +static int arp_modclose(arl_t *); +static void arp_mod_close_tail(arl_t *); +static mblk_t *arl_unbind(arl_t *); +static void arp_process_packet(ill_t *, mblk_t *); +static void arp_excl(ipsq_t *, queue_t *, mblk_t *, void *); +static void arp_drop_packet(const char *str, mblk_t *, ill_t *); +static int arp_open(queue_t *, dev_t *, int, int, cred_t *); +static int ip_sioctl_ifunitsel_arp(queue_t *, int *); +static int ip_sioctl_slifname_arp(queue_t *, void *); +static void arp_dlpi_send(arl_t *, mblk_t *); +static void arl_defaults_common(arl_t *, mblk_t *); +static int arp_modopen(queue_t *, dev_t *, int, int, cred_t *); +static void arp_ifname_notify(arl_t *); +static void arp_rput_dlpi_writer(ipsq_t *, queue_t *, mblk_t *, void *); +static arl_t *ill_to_arl(ill_t *); + +#define DL_PRIM(mp) (((union DL_primitives *)(mp)->b_rptr)->dl_primitive) +#define IS_DLPI_DATA(mp) \ + ((DB_TYPE(mp) == M_PROTO) && \ + MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && \ + (DL_PRIM(mp) == DL_UNITDATA_IND)) + +#define AR_NOTFOUND 1 /* No matching ace found in cache */ +#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ +#define AR_LOOPBACK 3 /* Our own arp packet was received */ +#define AR_BOGON 4 /* Another host has our IP addr. 
*/ +#define AR_FAILED 5 /* Duplicate Address Detection has failed */ +#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ + +boolean_t arp_no_defense; + +struct module_info arp_mod_info = { + IP_MOD_ID, "arpip", 1, INFPSZ, 65536, 1024 +}; +static struct qinit rinit_arp = { + (pfi_t)arp_rput, NULL, arp_open, arp_close, NULL, &arp_mod_info +}; +static struct qinit winit_arp = { + (pfi_t)arp_wput, NULL, arp_open, arp_close, NULL, + &arp_mod_info +}; +struct streamtab arpinfo = { + &rinit_arp, &winit_arp +}; +#define ARH_FIXED_LEN 8 +#define AR_LL_HDR_SLACK 32 + +/* + * pfhooks for ARP. + */ +#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = _ilp; \ + info.hpe_ofp = 0; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = 0; \ + info.hpe_ofp = _olp; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +static arp_m_t arp_m_tbl[] = { + { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ + { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ + { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ + { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ + { DL_ETHER, ARPHRD_ETHER, -2, 6}, 
/* Ethernet */ + { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ + { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ + { DL_OTHER, ARPHRD_ETHER, -2, 6} /* unknown */ +}; + +static void +arl_refhold_locked(arl_t *arl) +{ + ASSERT(MUTEX_HELD(&arl->arl_lock)); + arl->arl_refcnt++; + ASSERT(arl->arl_refcnt != 0); +} + +static void +arl_refrele(arl_t *arl) +{ + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_refcnt != 0); + arl->arl_refcnt--; + if (arl->arl_refcnt > 1) { + mutex_exit(&arl->arl_lock); + return; + } + + /* ill_close or arp_unbind_complete may be waiting */ + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); +} + +/* + * wake up any pending ip ioctls. + */ +static void +arp_cmd_done(ill_t *ill, int err, t_uscalar_t lastprim) +{ + if (lastprim == DL_UNBIND_REQ && ill->ill_replumbing) + arp_replumb_done(ill, 0); + else + arp_bringup_done(ill, err); +} + +static int +ip_nce_resolve_all(ill_t *ill, uchar_t *src_haddr, uint32_t hlen, + const in_addr_t *src_paddr, ncec_t **sncec, int op) +{ + int retv; + ncec_t *ncec; + boolean_t ll_changed; + uchar_t *lladdr = NULL; + int new_state; + + ASSERT(ill != NULL); + + ncec = ncec_lookup_illgrp_v4(ill, src_paddr); + *sncec = ncec; + + if (ncec == NULL) { + retv = AR_NOTFOUND; + goto done; + } + + mutex_enter(&ncec->ncec_lock); + /* + * IP addr and hardware address match what we already + * have, then this is a broadcast packet emitted by one of our + * interfaces, reflected by the switch and received on another + * interface. We return AR_LOOPBACK. + */ + lladdr = ncec->ncec_lladdr; + if (NCE_MYADDR(ncec) && hlen == ncec->ncec_ill->ill_phys_addr_length && + bcmp(lladdr, src_haddr, hlen) == 0) { + mutex_exit(&ncec->ncec_lock); + retv = AR_LOOPBACK; + goto done; + } + /* + * If the entry is unverified, then we've just verified that + * someone else already owns this address, because this is a + * message with the same protocol address but different + * hardware address. 
+ */ + if (ncec->ncec_flags & NCE_F_UNVERIFIED) { + mutex_exit(&ncec->ncec_lock); + ncec_delete(ncec); + ncec_refrele(ncec); + *sncec = NULL; + retv = AR_FAILED; + goto done; + } + + /* + * If the IP address matches ours and we're authoritative for + * this entry, then some other node is using our IP addr, so + * return AR_BOGON. Also reset the transmit count to zero so + * that, if we're currently in initial announcement mode, we + * switch back to the lazier defense mode. Knowing that + * there's at least one duplicate out there, we ought not + * blindly announce. + * + * NCE_F_AUTHORITY is set in one of two ways: + * 1. /sbin/arp told us so, via the "permanent" flag. + * 2. This is one of my addresses. + */ + if (ncec->ncec_flags & NCE_F_AUTHORITY) { + ncec->ncec_unsolicit_count = 0; + mutex_exit(&ncec->ncec_lock); + retv = AR_BOGON; + goto done; + } + + /* + * No address conflict was detected, and we are getting + * ready to update the ncec's hwaddr. The nce MUST NOT be on an + * under interface, because all dynamic nce's are created on the + * native interface (in the non-IPMP case) or on the IPMP + * meta-interface (in the IPMP case) + */ + ASSERT(!IS_UNDER_IPMP(ncec->ncec_ill)); + + /* + * update ncec with src_haddr, hlen. + * + * We are trying to resolve this ncec_addr/src_paddr and we + * got a REQUEST/RESPONSE from the ncec_addr/src_paddr. + * So the new_state is at least "STALE". If, in addition, + * this a solicited, unicast ARP_RESPONSE, we can transition + * to REACHABLE. 
+ */ + new_state = ND_STALE; + ip1dbg(("got info for ncec %p from addr %x\n", + (void *)ncec, *src_paddr)); + retv = AR_MERGED; + if (ncec->ncec_state == ND_INCOMPLETE || + ncec->ncec_state == ND_INITIAL) { + ll_changed = B_TRUE; + } else { + ll_changed = nce_cmp_ll_addr(ncec, src_haddr, hlen); + if (!ll_changed) + new_state = ND_UNCHANGED; + else + retv = AR_CHANGED; + } + /* + * We don't have the equivalent of the IPv6 'S' flag indicating + * a solicited response, so we assume that if we are in + * INCOMPLETE, or got back an unchanged lladdr in PROBE state, + * and this is an ARP_RESPONSE, it must be a + * solicited response allowing us to transtion to REACHABLE. + */ + if (op == ARP_RESPONSE) { + switch (ncec->ncec_state) { + case ND_PROBE: + new_state = (ll_changed ? ND_STALE : ND_REACHABLE); + break; + case ND_INCOMPLETE: + new_state = ND_REACHABLE; + break; + } + } + /* + * Call nce_update() to refresh fastpath information on any + * dependent nce_t entries. + */ + nce_update(ncec, new_state, (ll_changed ? src_haddr : NULL)); + mutex_exit(&ncec->ncec_lock); + nce_resolv_ok(ncec); +done: + return (retv); +} + +/* Find an entry for a particular MAC type in the arp_m_tbl. */ +static arp_m_t * +arp_m_lookup(t_uscalar_t mac_type) +{ + arp_m_t *arm; + + for (arm = arp_m_tbl; arm < A_END(arp_m_tbl); arm++) { + if (arm->arp_mac_type == mac_type) + return (arm); + } + return (NULL); +} + +static uint32_t +arp_hw_type(t_uscalar_t mactype) +{ + arp_m_t *arm; + + if ((arm = arp_m_lookup(mactype)) == NULL) + arm = arp_m_lookup(DL_OTHER); + return (arm->arp_mac_arp_hw_type); +} + +/* + * Called when an DLPI control message has been acked; send down the next + * queued message (if any). + * The DLPI messages of interest being bind, attach and unbind since + * these are the only ones sent by ARP via arp_dlpi_send. 
+ */ +static void +arp_dlpi_done(arl_t *arl, ill_t *ill) +{ + mblk_t *mp; + int err; + t_uscalar_t prim; + + mutex_enter(&arl->arl_lock); + prim = arl->arl_dlpi_pending; + + if ((mp = arl->arl_dlpi_deferred) == NULL) { + arl->arl_dlpi_pending = DL_PRIM_INVAL; + if (arl->arl_state_flags & ARL_LL_DOWN) + err = ENETDOWN; + else + err = 0; + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + arp_cmd_done(ill, err, prim); + return; + } + + arl->arl_dlpi_deferred = mp->b_next; + mp->b_next = NULL; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + + arl->arl_dlpi_pending = DL_PRIM(mp); + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + + putnext(arl->arl_wq, mp); +} + +/* + * This routine is called during module initialization when the DL_INFO_ACK + * comes back from the device. We set up defaults for all the device dependent + * doo-dads we are going to need. This will leave us ready to roll if we are + * attempting auto-configuration. Alternatively, these defaults can be + * overridden by initialization procedures possessing higher intelligence. + * + * Caller will free the mp. + */ +static void +arp_ll_set_defaults(arl_t *arl, mblk_t *mp) +{ + arp_m_t *arm; + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + + if ((arm = arp_m_lookup(dlia->dl_mac_type)) == NULL) + arm = arp_m_lookup(DL_OTHER); + ASSERT(arm != NULL); + + /* + * We initialize based on parameters in the (currently) not too + * exhaustive arp_m_tbl. 
+ */ + if (dlia->dl_version == DL_VERSION_2) { + arl->arl_sap_length = dlia->dl_sap_length; + arl->arl_phys_addr_length = dlia->dl_brdcst_addr_length; + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + } else { + arl->arl_sap_length = arm->arp_mac_sap_length; + arl->arl_phys_addr_length = arm->arp_mac_hw_addr_length; + } + /* + * Note: the arp_hw_type in the arp header may be derived from + * the ill_mac_type and arp_m_lookup(). + */ + arl->arl_sap = ETHERTYPE_ARP; + arl_defaults_common(arl, mp); +} + +static void +arp_wput(queue_t *q, mblk_t *mp) +{ + int err = EINVAL; + struct iocblk *ioc; + mblk_t *mp1; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + ASSERT(q->q_next != NULL); + ioc = (struct iocblk *)mp->b_rptr; + if (ioc->ioc_cmd != SIOCSLIFNAME && + ioc->ioc_cmd != IF_UNITSEL) { + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput", + char *, "<some ioctl>", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } + if ((mp1 = mp->b_cont) == 0) + err = EINVAL; + else if (ioc->ioc_cmd == SIOCSLIFNAME) + err = ip_sioctl_slifname_arp(q, mp1->b_rptr); + else if (ioc->ioc_cmd == IF_UNITSEL) + err = ip_sioctl_ifunitsel_arp(q, (int *)mp1->b_rptr); + if (err == 0) + miocack(q, mp, 0, 0); + else + miocnak(q, mp, 0, err); + return; + default: + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput default", + char *, "default mblk", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } +} + +/* + * similar to ill_dlpi_pending(): verify that the received DLPI response + * matches the one that is pending for the arl. 
+ */ +static boolean_t +arl_dlpi_pending(arl_t *arl, t_uscalar_t prim) +{ + t_uscalar_t pending; + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending == prim) { + mutex_exit(&arl->arl_lock); + return (B_TRUE); + } + + if (arl->arl_state_flags & ARL_CONDEMNED) { + mutex_exit(&arl->arl_lock); + return (B_FALSE); + } + pending = arl->arl_dlpi_pending; + mutex_exit(&arl->arl_lock); + + if (pending == DL_PRIM_INVAL) { + ip0dbg(("arl_dlpi_pending unsolicited ack for %s on %s", + dl_primstr(prim), arl->arl_name)); + } else { + ip0dbg(("arl_dlpi_pending ack for %s on %s expect %s", + dl_primstr(prim), arl->arl_name, dl_primstr(pending))); + } + return (B_FALSE); +} + +/* DLPI messages, other than DL_UNITDATA_IND are handled here. */ +static void +arp_rput_dlpi(queue_t *q, mblk_t *mp) +{ + arl_t *arl = (arl_t *)q->q_ptr; + union DL_primitives *dlp; + t_uscalar_t prim; + t_uscalar_t reqprim = DL_PRIM_INVAL; + ill_t *ill; + + if ((mp->b_wptr - mp->b_rptr) < sizeof (dlp->dl_primitive)) { + putnext(q, mp); + return; + } + dlp = (union DL_primitives *)mp->b_rptr; + prim = dlp->dl_primitive; + + /* + * If we received an ACK but didn't send a request for it, then it + * can't be part of any pending operation; discard up-front. + */ + switch (prim) { + case DL_ERROR_ACK: + /* + * ce is confused about how DLPI works, so we have to interpret + * an "error" on DL_NOTIFY_ACK (which we never could have sent) + * as really meaning an error on DL_NOTIFY_REQ. + * + * Note that supporting DL_NOTIFY_REQ is optional, so printing + * out an error message on the console isn't warranted except + * for debug. 
+ */ + if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || + dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { + reqprim = DL_NOTIFY_REQ; + } else { + reqprim = dlp->error_ack.dl_error_primitive; + } + break; + case DL_INFO_ACK: + reqprim = DL_INFO_REQ; + break; + case DL_OK_ACK: + reqprim = dlp->ok_ack.dl_correct_primitive; + break; + case DL_BIND_ACK: + reqprim = DL_BIND_REQ; + break; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + putnext(q, mp); + return; + } + if (reqprim == DL_PRIM_INVAL || !arl_dlpi_pending(arl, reqprim)) { + freemsg(mp); + return; + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi received", + char *, dl_primstr(prim), char *, dl_primstr(reqprim), + arl_t *, arl); + + ASSERT(prim != DL_NOTIFY_IND); + + ill = arl_to_ill(arl); + + switch (reqprim) { + case DL_INFO_REQ: + /* + * ill has not been set up yet for this case. This is the + * DL_INFO_ACK for the first DL_INFO_REQ sent from + * arp_modopen(). There should be no other arl_dlpi_deferred + * messages pending. We initialize the arl here. + */ + ASSERT(!arl->arl_dlpi_style_set); + ASSERT(arl->arl_dlpi_pending == DL_INFO_REQ); + ASSERT(arl->arl_dlpi_deferred == NULL); + arl->arl_dlpi_pending = DL_PRIM_INVAL; + arp_ll_set_defaults(arl, mp); + freemsg(mp); + return; + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + /* + * This is not an error, so we don't set ARL_LL_DOWN + */ + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + if (arl->arl_state_flags & ARL_CONDEMNED) { + /* + * if this is part of the unplumb the arl may + * vaporize any moment after we cv_signal the + * arl_cv so we reset arl_dlpi_pending here. + * All other cases (including replumb) will + * have the arl_dlpi_pending reset in + * arp_dlpi_done. 
+ */ + arl->arl_dlpi_pending = DL_PRIM_INVAL; + } + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + } + if (ill != NULL) { + /* + * ill ref obtained by arl_to_ill() will be released + * by qwriter_ip() + */ + qwriter_ip(ill, ill->ill_wq, mp, arp_rput_dlpi_writer, + CUR_OP, B_TRUE); + return; + } + freemsg(mp); +} + +/* + * Handling of DLPI messages that require exclusive access to the ipsq. + */ +/* ARGSUSED */ +static void +arp_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +{ + union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; + ill_t *ill = (ill_t *)q->q_ptr; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) { + /* + * happens as a result arp_modclose triggering unbind. + * arp_rput_dlpi will cv_signal the arl_cv and the modclose + * will complete, but when it does ipsq_exit, the waiting + * qwriter_ip gets into the ipsq but will find the arl null. + * There should be no deferred messages in this case, so + * just complete and exit. + */ + arp_cmd_done(ill, 0, DL_UNBIND_REQ); + freemsg(mp); + return; + } + switch (dlp->dl_primitive) { + case DL_ERROR_ACK: + switch (dlp->error_ack.dl_error_primitive) { + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + arl->arl_state_flags |= ARL_LL_DOWN; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_DOWN; + arl->arl_state_flags |= ARL_LL_UNBOUND; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_ATTACH_REQ: + break; + default: + /* If it's anything else, we didn't send it. 
*/ + arl_refrele(arl); + putnext(q, mp); + return; + } + break; + case DL_OK_ACK: + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi_writer ok", + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + arl_t *, arl); + mutex_enter(&arl->arl_lock); + switch (dlp->ok_ack.dl_correct_primitive) { + case DL_UNBIND_REQ: + case DL_ATTACH_REQ: + break; + default: + ip0dbg(("Dropping unrecognized DL_OK_ACK for %s", + dl_primstr(dlp->ok_ack.dl_correct_primitive))); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); + freemsg(mp); + return; + } + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_ACK: + DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, + dl_bind_ack_t *, &dlp->bind_ack); + + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_state_flags & ARL_LL_BIND_PENDING); + arl->arl_state_flags &= + ~(ARL_LL_BIND_PENDING|ARL_LL_DOWN|ARL_LL_UNBOUND); + arl->arl_state_flags |= ARL_LL_UP; + mutex_exit(&arl->arl_lock); + break; + case DL_UDERROR_IND: + DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, + dl_uderror_ind_t *, &dlp->uderror_ind); + arl_refrele(arl); + putnext(q, mp); + return; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + arl_refrele(arl); + putnext(q, mp); + return; + } + arp_dlpi_done(arl, ill); + arl_refrele(arl); + freemsg(mp); +} + +void +arp_rput(queue_t *q, mblk_t *mp) +{ + arl_t *arl = q->q_ptr; + boolean_t need_refrele = B_FALSE; + + mutex_enter(&arl->arl_lock); + if (((arl->arl_state_flags & + (ARL_CONDEMNED | ARL_LL_REPLUMBING)) != 0)) { + /* + * Only allow high priority DLPI messages during unplumb or + * replumb, and we don't take an arl_refcnt for that case. 
+ */ + if (DB_TYPE(mp) != M_PCPROTO) { + mutex_exit(&arl->arl_lock); + freemsg(mp); + return; + } + } else { + arl_refhold_locked(arl); + need_refrele = B_TRUE; + } + mutex_exit(&arl->arl_lock); + + switch (DB_TYPE(mp)) { + case M_PCPROTO: + case M_PROTO: { + ill_t *ill; + + /* + * could be one of + * (i) real message from the wire, (DLPI_DATA) + * (ii) DLPI message + * Take a ref on the ill associated with this arl to + * prevent the ill from being unplumbed until this thread + * is done. + */ + if (IS_DLPI_DATA(mp)) { + ill = arl_to_ill(arl); + if (ill == NULL) { + arp_drop_packet("No ill", mp, ill); + break; + } + arp_process_packet(ill, mp); + ill_refrele(ill); + break; + } + /* Miscellaneous DLPI messages get shuffled off. */ + arp_rput_dlpi(q, mp); + break; + } + case M_ERROR: + case M_HANGUP: + if (mp->b_rptr < mp->b_wptr) + arl->arl_error = (int)(*mp->b_rptr & 0xFF); + if (arl->arl_error == 0) + arl->arl_error = ENXIO; + freemsg(mp); + break; + default: + ip1dbg(("arp_rput other db type %x\n", DB_TYPE(mp))); + putnext(q, mp); + break; + } + if (need_refrele) + arl_refrele(arl); +} + +static void +arp_process_packet(ill_t *ill, mblk_t *mp) +{ + mblk_t *mp1; + arh_t *arh; + in_addr_t src_paddr, dst_paddr; + uint32_t hlen, plen; + boolean_t is_probe; + int op; + ncec_t *dst_ncec, *src_ncec = NULL; + uchar_t *src_haddr, *arhp, *dst_haddr, *dp, *sp; + int err; + ip_stack_t *ipst; + boolean_t need_ill_refrele = B_FALSE; + nce_t *nce; + uchar_t *src_lladdr; + dl_unitdata_ind_t *dlui; + ip_recv_attr_t iras; + + ASSERT(ill != NULL); + if (ill->ill_flags & ILLF_NOARP) { + arp_drop_packet("Interface does not support ARP", mp, ill); + return; + } + ipst = ill->ill_ipst; + /* + * What we should have at this point is a DL_UNITDATA_IND message + * followed by an ARP packet. We do some initial checks and then + * get to work. + */ + dlui = (dl_unitdata_ind_t *)mp->b_rptr; + if (dlui->dl_group_address == 1) { + /* + * multicast or broadcast packet. 
Only accept on the ipmp + * nominated interface for multicasts ('cast_ill'). + * If we have no cast_ill we are liberal and accept everything. + */ + if (IS_UNDER_IPMP(ill)) { + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + arp_drop_packet("Interface is not nominated " + "for multicast sends and receives", + mp, ill); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + } + } + mp1 = mp->b_cont; + if (mp1 == NULL) { + arp_drop_packet("Missing ARP packet", mp, ill); + return; + } + if (mp1->b_cont != NULL) { + /* No fooling around with funny messages. */ + if (!pullupmsg(mp1, -1)) { + arp_drop_packet("Funny message: pullup failed", + mp, ill); + return; + } + } + arh = (arh_t *)mp1->b_rptr; + hlen = arh->arh_hlen; + plen = arh->arh_plen; + if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { + arp_drop_packet("mblk len too small", mp, ill); + return; + } + /* + * hlen 0 is used for RFC 1868 UnARP. + * + * Note that the rest of the code checks that hlen is what we expect + * for this hardware address type, so might as well discard packets + * here that don't match. + */ + if ((hlen > 0 && hlen != ill->ill_phys_addr_length) || plen == 0) { + DTRACE_PROBE2(rput_bogus, ill_t *, ill, mblk_t *, mp1); + arp_drop_packet("Bogus hlen or plen", mp, ill); + return; + } + /* + * Historically, Solaris has been lenient about hardware type numbers. + * We should check here, but don't. + */ + DTRACE_PROBE3(arp__physical__in__start, ill_t *, ill, arh_t *, arh, + mblk_t *, mp); + /* + * If ill is in an ipmp group, it will be the under ill. If we want + * to report the packet as coming up the IPMP interface, we should + * convert it to the ipmp ill. 
+ */ + ARP_HOOK_IN(ipst->ips_arp_physical_in_event, ipst->ips_arp_physical_in, + ill->ill_phyint->phyint_ifindex, arh, mp, mp1, ipst); + DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp); + if (mp == NULL) + return; + arhp = (uchar_t *)arh + ARH_FIXED_LEN; + src_haddr = arhp; /* ar$sha */ + arhp += hlen; + bcopy(arhp, &src_paddr, IP_ADDR_LEN); /* ar$spa */ + sp = arhp; + arhp += IP_ADDR_LEN; + dst_haddr = arhp; /* ar$dha */ + arhp += hlen; + bcopy(arhp, &dst_paddr, IP_ADDR_LEN); /* ar$tpa */ + dp = arhp; + op = BE16_TO_U16(arh->arh_operation); + + DTRACE_PROBE2(ip__arp__input, (in_addr_t), src_paddr, + (in_addr_t), dst_paddr); + + /* Determine if this is just a probe */ + is_probe = (src_paddr == INADDR_ANY); + + /* + * ira_ill is the only field used down the arp_notify path. + */ + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + /* + * RFC 826: first check if the <protocol, sender protocol address> is + * in the cache, if there is a sender protocol address. Note that this + * step also handles resolutions based on source. + */ + /* Note: after here we need to freeb(mp) and freemsg(mp1) separately */ + mp->b_cont = NULL; + if (is_probe) { + err = AR_NOTFOUND; + } else { + if (plen != 4) { + arp_drop_packet("bad protocol len", mp, ill); + return; + } + err = ip_nce_resolve_all(ill, src_haddr, hlen, &src_paddr, + &src_ncec, op); + switch (err) { + case AR_BOGON: + ASSERT(src_ncec != NULL); + arp_notify(src_paddr, mp1, AR_CN_BOGON, + &iras, src_ncec); + break; + case AR_FAILED: + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + src_ncec); + break; + case AR_LOOPBACK: + DTRACE_PROBE2(rput_loopback, ill_t *, ill, arh_t *, + arh); + freemsg(mp1); + break; + default: + goto update; + } + freemsg(mp); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } +update: + /* + * Now look up the destination address. 
By RFC 826, we ignore the + * packet at this step if the target isn't one of our addresses (i.e., + * one we have been asked to PUBLISH). This is true even if the + * target is something we're trying to resolve and the packet + * is a response. + */ + dst_ncec = ncec_lookup_illgrp_v4(ill, &dst_paddr); + if (dst_ncec == NULL || !NCE_PUBLISH(dst_ncec)) { + /* + * Let the client know if the source mapping has changed, even + * if the destination provides no useful information for the + * client. + */ + if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + freemsg(mp); + } else { + freemsg(mp); + arp_drop_packet("Target is not interesting", mp1, ill); + } + if (dst_ncec != NULL) + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + + if (dst_ncec->ncec_flags & NCE_F_UNVERIFIED) { + /* + * Check for a reflection. Some misbehaving bridges will + * reflect our own transmitted packets back to us. + */ + ASSERT(NCE_PUBLISH(dst_ncec)); + if (hlen != dst_ncec->ncec_ill->ill_phys_addr_length) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("bad arh_len", mp1, ill); + return; + } + if (!nce_cmp_ll_addr(dst_ncec, src_haddr, hlen)) { + DTRACE_PROBE3(rput_probe_reflected, ill_t *, ill, + arh_t *, arh, ncec_t *, dst_ncec); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("Reflected probe", mp1, ill); + return; + } + /* + * Responses targeting our HW address that are not responses to + * our DAD probe must be ignored as they are related to requests + * sent before DAD was restarted. 
+ */ + if (op == ARP_RESPONSE && + (nce_cmp_ll_addr(dst_ncec, dst_haddr, hlen) == 0)) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet( + "Response to request that was sent before DAD", + mp1, ill); + return; + } + /* + * Responses targeted to HW addresses which are not ours but + * sent to our unverified proto address are also conflicts. + * These may be reported by a proxy rather than the interface + * with the conflicting address, dst_paddr is in conflict + * rather than src_paddr. To ensure IP can locate the correct + * ipif to take down, it is necessary to copy dst_paddr to + * the src_paddr field before sending it to IP. The same is + * required for probes, where src_paddr will be INADDR_ANY. + */ + if (is_probe || op == ARP_RESPONSE) { + bcopy(dp, sp, plen); + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + NULL); + ncec_delete(dst_ncec); + } else if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + } else { + DTRACE_PROBE3(rput_request_unverified, + ill_t *, ill, arh_t *, arh, ncec_t *, dst_ncec); + arp_drop_packet("Unverified request", mp1, ill); + } + freemsg(mp); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + /* + * If it's a request, then we reply to this, and if we think the + * sender's unknown, then we create an entry to avoid unnecessary ARPs. + * The design assumption is that someone ARPing us is likely to send us + * a packet soon, and that we'll want to reply to it. + */ + if (op == ARP_REQUEST) { + const uchar_t *nce_hwaddr; + struct in_addr nce_paddr; + clock_t now; + ill_t *under_ill = ill; + boolean_t send_unicast = B_TRUE; + + ASSERT(NCE_PUBLISH(dst_ncec)); + + if ((dst_ncec->ncec_flags & (NCE_F_BCAST|NCE_F_MCAST)) != 0) { + /* + * Ignore senders who are deliberately or accidentally + * confused. 
+ */ + goto bail; + } + + if (!is_probe && err == AR_NOTFOUND) { + ASSERT(src_ncec == NULL); + + if (IS_UNDER_IPMP(under_ill)) { + /* + * create the ncec for the sender on ipmp_ill. + * We pass in the ipmp_ill itself to avoid + * creating an nce_t on the under_ill. + */ + ill = ipmp_ill_hold_ipmp_ill(under_ill); + if (ill == NULL) + ill = under_ill; + else + need_ill_refrele = B_TRUE; + } + + err = nce_lookup_then_add_v4(ill, src_haddr, hlen, + &src_paddr, 0, ND_STALE, &nce); + + switch (err) { + case 0: + case EEXIST: + ip1dbg(("added ncec %p in state %d ill %s\n", + (void *)src_ncec, src_ncec->ncec_state, + ill->ill_name)); + src_ncec = nce->nce_common; + break; + default: + /* + * Either no memory, or the outgoing interface + * is in the process of down/unplumb. In the + * latter case, we will fail the send anyway, + * and in the former case, we should try to send + * the ARP response. + */ + src_lladdr = src_haddr; + goto send_response; + } + ncec_refhold(src_ncec); + nce_refrele(nce); + /* set up cleanup interval on ncec */ + } + + /* + * This implements periodic address defense based on a modified + * version of the RFC 3927 requirements. Instead of sending a + * broadcasted reply every time, as demanded by the RFC, we + * send at most one broadcast reply per arp_broadcast_interval. + */ + now = ddi_get_lbolt(); + if ((now - dst_ncec->ncec_last_time_defended) > + MSEC_TO_TICK(ipst->ips_ipv4_dad_announce_interval)) { + dst_ncec->ncec_last_time_defended = now; + /* + * If this is one of the long-suffering entries, + * pull it out now. It no longer needs separate + * defense, because we're now doing that with this + * broadcasted reply. 
+ */ + dst_ncec->ncec_flags &= ~NCE_F_DELAYED; + send_unicast = B_FALSE; + } + if (src_ncec != NULL && send_unicast) { + src_lladdr = src_ncec->ncec_lladdr; + } else { + src_lladdr = under_ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(under_ill); + } +send_response: + nce_hwaddr = dst_ncec->ncec_lladdr; + IN6_V4MAPPED_TO_INADDR(&dst_ncec->ncec_addr, &nce_paddr); + + (void) arp_output(under_ill, ARP_RESPONSE, + nce_hwaddr, (uchar_t *)&nce_paddr, src_haddr, + (uchar_t *)&src_paddr, src_lladdr); + } +bail: + if (dst_ncec != NULL) { + ncec_refrele(dst_ncec); + } + if (src_ncec != NULL) { + ncec_refrele(src_ncec); + } + if (err == AR_CHANGED) { + mp->b_cont = NULL; + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, NULL); + mp1 = NULL; + } + if (need_ill_refrele) + ill_refrele(ill); +done: + freemsg(mp); + freemsg(mp1); +} + +/* + * Basic initialization of the arl_t and the arl_common structure shared with + * the ill_t that is done after SLIFNAME/IF_UNITSEL. + */ +static int +arl_ill_init(arl_t *arl, char *ill_name) +{ + ill_t *ill; + arl_ill_common_t *ai; + + ill = ill_lookup_on_name(ill_name, B_FALSE, B_FALSE, B_FALSE, + arl->arl_ipst); + + if (ill == NULL) + return (ENXIO); + + /* + * By the time we set up the arl, we expect the ETHERTYPE_IP + * stream to be fully bound and attached. So we copy/verify + * relevant information as possible from/against the ill. + * + * The following should have been set up in arp_ll_set_defaults() + * after the first DL_INFO_ACK was received. + */ + ASSERT(arl->arl_phys_addr_length == ill->ill_phys_addr_length); + ASSERT(arl->arl_sap == ETHERTYPE_ARP); + ASSERT(arl->arl_mactype == ill->ill_mactype); + ASSERT(arl->arl_sap_length == ill->ill_sap_length); + + ai = kmem_zalloc(sizeof (*ai), KM_SLEEP); + mutex_enter(&ill->ill_lock); + /* First ensure that the ill is not CONDEMNED. 
*/ + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (ENXIO); + } + if (ill->ill_common != NULL || arl->arl_common != NULL) { + mutex_exit(&ill->ill_lock); + ip0dbg(("%s: PPA already exists", ill->ill_name)); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (EEXIST); + } + mutex_init(&ai->ai_lock, NULL, MUTEX_DEFAULT, NULL); + ai->ai_arl = arl; + ai->ai_ill = ill; + ill->ill_common = ai; + arl->arl_common = ai; + mutex_exit(&ill->ill_lock); + (void) strlcpy(arl->arl_name, ill->ill_name, LIFNAMSIZ); + arl->arl_name_length = ill->ill_name_length; + ill_refrele(ill); + arp_ifname_notify(arl); + return (0); +} + +/* Allocate and do common initializations for DLPI messages. */ +static mblk_t * +ip_ar_dlpi_comm(t_uscalar_t prim, size_t size) +{ + mblk_t *mp; + + if ((mp = allocb(size, BPRI_HI)) == NULL) + return (NULL); + + /* + * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter + * of which we don't seem to use) are sent with M_PCPROTO, and + * that other DLPI are M_PROTO. + */ + DB_TYPE(mp) = (prim == DL_INFO_REQ) ? 
M_PCPROTO : M_PROTO; + + mp->b_wptr = mp->b_rptr + size; + bzero(mp->b_rptr, size); + DL_PRIM(mp) = prim; + return (mp); +} + + +int +ip_sioctl_ifunitsel_arp(queue_t *q, int *ppa) +{ + arl_t *arl; + char *cp, ill_name[LIFNAMSIZ]; + + if (q->q_next == NULL) + return (EINVAL); + + do { + q = q->q_next; + } while (q->q_next != NULL); + cp = q->q_qinfo->qi_minfo->mi_idname; + + arl = (arl_t *)q->q_ptr; + (void) snprintf(ill_name, sizeof (ill_name), "%s%d", cp, *ppa); + arl->arl_ppa = *ppa; + return (arl_ill_init(arl, ill_name)); +} + +int +ip_sioctl_slifname_arp(queue_t *q, void *lifreq) +{ + arl_t *arl; + struct lifreq *lifr = lifreq; + + /* ioctl not valid when IP opened as a device */ + if (q->q_next == NULL) + return (EINVAL); + + arl = (arl_t *)q->q_ptr; + arl->arl_ppa = lifr->lifr_ppa; + return (arl_ill_init(arl, lifr->lifr_name)); +} + +arl_t * +ill_to_arl(ill_t *ill) +{ + arl_ill_common_t *ai = ill->ill_common; + arl_t *arl = NULL; + + if (ai == NULL) + return (NULL); + /* + * Find the arl_t that corresponds to this ill_t from the shared + * ill_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. + */ + mutex_enter(&ai->ai_lock); + if ((arl = ai->ai_arl) != NULL) { + mutex_enter(&arl->arl_lock); + if (!(arl->arl_state_flags & ARL_CONDEMNED)) { + arl_refhold_locked(arl); + mutex_exit(&arl->arl_lock); + } else { + mutex_exit(&arl->arl_lock); + arl = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (arl); +} + +ill_t * +arl_to_ill(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill = NULL; + + if (ai == NULL) { + /* + * happens when the arp stream is just being opened, and + * arl_ill_init has not been executed yet. + */ + return (NULL); + } + /* + * Find the ill_t that corresponds to this arl_t from the shared + * arl_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. 
+ */ + mutex_enter(&ai->ai_lock); + if ((ill = ai->ai_ill) != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + } else { + mutex_exit(&ill->ill_lock); + ill = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (ill); +} + +int +arp_ll_up(ill_t *ill) +{ + mblk_t *attach_mp = NULL; + mblk_t *bind_mp = NULL; + mblk_t *unbind_mp = NULL; + arl_t *arl; + + ASSERT(IAM_WRITER_ILL(ill)); + arl = ill_to_arl(ill); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_up", ill_t *, ill); + if (arl == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_up", arl_t *, arl); + if ((arl->arl_state_flags & ARL_LL_UP) != 0) { + arl_refrele(arl); + return (0); + } + if (arl->arl_needs_attach) { /* DL_STYLE2 */ + attach_mp = + ip_ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t)); + if (attach_mp == NULL) + goto bad; + ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = arl->arl_ppa; + } + + /* Allocate and initialize a bind message. 
*/ + bind_mp = ip_ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t)); + if (bind_mp == NULL) + goto bad; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; + + unbind_mp = ip_ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t)); + if (unbind_mp == NULL) + goto bad; + if (arl->arl_needs_attach) { + arp_dlpi_send(arl, attach_mp); + } + arl->arl_unbind_mp = unbind_mp; + + arl->arl_state_flags |= ARL_LL_BIND_PENDING; + arp_dlpi_send(arl, bind_mp); + arl_refrele(arl); + return (EINPROGRESS); + +bad: + freemsg(attach_mp); + freemsg(bind_mp); + freemsg(unbind_mp); + arl_refrele(arl); + return (ENOMEM); +} + +/* + * consumes/frees mp + */ +static void +arp_notify(in_addr_t src, mblk_t *mp, uint32_t arcn_code, + ip_recv_attr_t *ira, ncec_t *ncec) +{ + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + arh_t *arh = (arh_t *)mp->b_rptr; + + switch (arcn_code) { + case AR_CN_BOGON: + /* + * Someone is sending ARP packets with a source protocol + * address that we have published and for which we believe our + * entry is authoritative and verified to be unique on + * the network. + * + * arp_process_packet() sends AR_CN_FAILED for the case when + * a DAD probe is received and the hardware address of a + * non-authoritative entry has changed. Thus, AR_CN_BOGON + * indicates a real conflict, and we have to do resolution. + * + * We back away quickly from the address if it's from DHCP or + * otherwise temporary and hasn't been used recently (or at + * all). We'd like to include "deprecated" addresses here as + * well (as there's no real reason to defend something we're + * discarding), but IPMP "reuses" this flag to mean something + * other than the standard meaning. 
+ */ + if (ip_nce_conflict(mp, ira, ncec)) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + cmn_err(CE_WARN, + "proxy ARP problem? Node '%s' is using %s on %s", + hbuf, sbuf, ill->ill_name); + if (!arp_no_defense) + (void) arp_announce(ncec); + /* + * ncec_last_time_defended has been adjusted in + * ip_nce_conflict. + */ + } else { + ncec_delete(ncec); + } + freemsg(mp); + break; + case AR_CN_ANNOUNCE: { + nce_hw_map_t hwm; + /* + * ARP gives us a copy of any packet where it thinks + * the address has changed, so that we can update our + * caches. We're responsible for caching known answers + * in the current design. We check whether the + * hardware address really has changed in all of our + * entries that have cached this mapping, and if so, we + * blow them away. This way we will immediately pick + * up the rare case of a host changing hardware + * address. + */ + if (src == 0) { + freemsg(mp); + break; + } + hwm.hwm_addr = src; + hwm.hwm_hwlen = arh->arh_hlen; + hwm.hwm_hwaddr = (uchar_t *)(arh + 1); + hwm.hwm_flags = 0; + ncec_walk_common(ipst->ips_ndp4, NULL, + (pfi_t)nce_update_hw_changed, &hwm, B_TRUE); + freemsg(mp); + break; + } + case AR_CN_FAILED: + if (arp_no_defense) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + + cmn_err(CE_WARN, + "node %s is using our IP address %s on %s", + hbuf, sbuf, ill->ill_name); + freemsg(mp); + break; + } + /* + * mp will be freed by arp_excl. + */ + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + return; + default: + ASSERT(0); + freemsg(mp); + break; + } +} + +/* + * arp_output is called to transmit an ARP Request or Response. The mapping + * to RFC 826 variables is: + * haddr1 == ar$sha + * paddr1 == ar$spa + * haddr2 == ar$tha + * paddr2 == ar$tpa + * The ARP frame is sent to the ether_dst in dst_lladdr. 
+ */ +static int +arp_output(ill_t *ill, uint32_t operation, + const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, + const uchar_t *paddr2, uchar_t *dst_lladdr) +{ + arh_t *arh; + uint8_t *cp; + uint_t hlen; + uint32_t plen = IPV4_ADDR_LEN; /* ar$pln from RFC 826 */ + uint32_t proto = IP_ARP_PROTO_TYPE; + mblk_t *mp; + arl_t *arl; + + ASSERT(dst_lladdr != NULL); + hlen = ill->ill_phys_addr_length; /* ar$hln from RFC 826 */ + mp = ill_dlur_gen(dst_lladdr, hlen, ETHERTYPE_ARP, ill->ill_sap_length); + + if (mp == NULL) + return (ENOMEM); + + /* IFF_NOARP flag is set or link down: do not send arp messages */ + if ((ill->ill_flags & ILLF_NOARP) || !ill->ill_dl_up) { + freemsg(mp); + return (ENXIO); + } + + mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + + plen + plen, BPRI_MED); + if (mp->b_cont == NULL) { + freeb(mp); + return (ENOMEM); + } + + /* Fill in the ARP header. */ + cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); + mp->b_cont->b_rptr = cp; + arh = (arh_t *)cp; + U16_TO_BE16(arp_hw_type(ill->ill_mactype), arh->arh_hardware); + U16_TO_BE16(proto, arh->arh_proto); + arh->arh_hlen = (uint8_t)hlen; + arh->arh_plen = (uint8_t)plen; + U16_TO_BE16(operation, arh->arh_operation); + cp += ARH_FIXED_LEN; + bcopy(haddr1, cp, hlen); + cp += hlen; + if (paddr1 == NULL) + bzero(cp, plen); + else + bcopy(paddr1, cp, plen); + cp += plen; + if (haddr2 == NULL) + bzero(cp, hlen); + else + bcopy(haddr2, cp, hlen); + cp += hlen; + bcopy(paddr2, cp, plen); + cp += plen; + mp->b_cont->b_wptr = cp; + + DTRACE_PROBE3(arp__physical__out__start, + ill_t *, ill, arh_t *, arh, mblk_t *, mp); + ARP_HOOK_OUT(ill->ill_ipst->ips_arp_physical_out_event, + ill->ill_ipst->ips_arp_physical_out, + ill->ill_phyint->phyint_ifindex, arh, mp, mp->b_cont, + ill->ill_ipst); + DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp); + if (mp == NULL) + return (0); + + /* Ship it out. 
*/ + arl = ill_to_arl(ill); + if (arl == NULL) { + freemsg(mp); + return (0); + } + if (canputnext(arl->arl_wq)) + putnext(arl->arl_wq, mp); + else + freemsg(mp); + arl_refrele(arl); + return (0); +} + +/* + * Process resolve requests. + * If we are not yet reachable then we check and decrease ncec_rcnt; otherwise + * we leave it alone (the caller will check and manage ncec_pcnt in those + * cases.) + */ +int +arp_request(ncec_t *ncec, in_addr_t sender, ill_t *ill) +{ + int err; + const uchar_t *target_hwaddr; + struct in_addr nce_paddr; + uchar_t *dst_lladdr; + boolean_t use_rcnt = !NCE_ISREACHABLE(ncec); + + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + ASSERT(!IS_IPMP(ill)); + + if (use_rcnt && ncec->ncec_rcnt == 0) { + /* not allowed any more retransmits. */ + return (0); + } + + if ((ill->ill_flags & ILLF_NOARP) != 0) + return (0); + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nce_paddr); + + target_hwaddr = + ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + if (NCE_ISREACHABLE(ncec)) { + dst_lladdr = ncec->ncec_lladdr; + } else { + dst_lladdr = ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill); + } + + mutex_exit(&ncec->ncec_lock); + err = arp_output(ill, ARP_REQUEST, + ill->ill_phys_addr, (uchar_t *)&sender, target_hwaddr, + (uchar_t *)&nce_paddr, dst_lladdr); + mutex_enter(&ncec->ncec_lock); + + if (err != 0) { + /* + * Some transient error such as ENOMEM or a down link was + * encountered. If the link has been taken down permanently, + * the ncec will eventually be cleaned up (ipif_down_tail() + * will call ipif_nce_down() and flush the ncec), to terminate + * recurring attempts to send ARP requests. In all other cases, + * allow the caller another chance at success next time. 
+ */ + return (ncec->ncec_ill->ill_reachable_retrans_time); + } + + if (use_rcnt) + ncec->ncec_rcnt--; + + return (ncec->ncec_ill->ill_reachable_retrans_time); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_announce(ncec_t *ncec) +{ + ill_t *ill; + int err; + uchar_t *sphys_addr, *bcast_addr; + struct in_addr ncec_addr; + boolean_t need_refrele = B_FALSE; + + ASSERT((ncec->ncec_flags & NCE_F_BCAST) == 0); + ASSERT((ncec->ncec_flags & NCE_F_MCAST) == 0); + + if (IS_IPMP(ncec->ncec_ill)) { + /* sent on the cast_ill */ + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + need_refrele = B_TRUE; + } else { + ill = ncec->ncec_ill; + } + + /* + * broadcast an announce to ill_bcast address. + */ + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + bcast_addr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + err = arp_output(ill, ARP_REQUEST, + sphys_addr, (uchar_t *)&ncec_addr, bcast_addr, + (uchar_t *)&ncec_addr, bcast_addr); + + if (need_refrele) + ill_refrele(ill); + return (err != 0); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_probe(ncec_t *ncec) +{ + ill_t *ill; + int err; + struct in_addr ncec_addr; + uchar_t *sphys_addr, *dst_lladdr; + + if (IS_IPMP(ncec->ncec_ill)) { + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + } else { + ill = ncec->ncec_ill; + } + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + dst_lladdr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + err = arp_output(ill, ARP_REQUEST, + sphys_addr, NULL, NULL, (uchar_t *)&ncec_addr, dst_lladdr); + + if (IS_IPMP(ncec->ncec_ill)) + ill_refrele(ill); + return (err != 0); +} + +static mblk_t * +arl_unbind(arl_t *arl) +{ + mblk_t *mp; + + if ((mp = arl->arl_unbind_mp) != NULL) { + arl->arl_unbind_mp = NULL; + arl->arl_state_flags |= ARL_DL_UNBIND_IN_PROGRESS; + } + return (mp); +} + +int 
+arp_ll_down(ill_t *ill) +{ + arl_t *arl; + mblk_t *unbind_mp; + int err = 0; + boolean_t replumb = (ill->ill_replumbing == 1); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_down", ill_t *, ill); + if ((arl = ill_to_arl(ill)) == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_down", arl_t *, arl); + mutex_enter(&arl->arl_lock); + unbind_mp = arl_unbind(arl); + if (unbind_mp != NULL) { + ASSERT(arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS); + DTRACE_PROBE2(arp__unbinding, mblk_t *, unbind_mp, + arl_t *, arl); + err = EINPROGRESS; + if (replumb) + arl->arl_state_flags |= ARL_LL_REPLUMBING; + } + mutex_exit(&arl->arl_lock); + if (unbind_mp != NULL) + arp_dlpi_send(arl, unbind_mp); + arl_refrele(arl); + return (err); +} + +/* ARGSUSED */ +int +arp_close(queue_t *q, int flags) +{ + if (WR(q)->q_next != NULL) { + /* This is a module close */ + return (arp_modclose(q->q_ptr)); + } + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +static int +arp_modclose(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill; + queue_t *q = arl->arl_rq; + mblk_t *mp, *nextmp; + ipsq_t *ipsq = NULL; + + ill = arl_to_ill(arl); + if (ill != NULL) { + if (!ill_waiter_inc(ill)) { + ill_refrele(ill); + } else { + ill_refrele(ill); + if (ipsq_enter(ill, B_FALSE, NEW_OP)) + ipsq = ill->ill_phyint->phyint_ipsq; + ill_waiter_dcr(ill); + } + if (ipsq == NULL) { + /* + * could not enter the ipsq because ill is already + * marked CONDEMNED. + */ + ill = NULL; + } + } + if (ai != NULL && ipsq == NULL) { + /* + * Either we did not get an ill because it was marked CONDEMNED + * or we could not enter the ipsq because it was unplumbing. + * In both cases, wait for the ill to complete ip_modclose(). + * + * If the arp_modclose happened even before SLIFNAME, the ai + * itself would be NULL, in which case we can complete the close + * without waiting. 
+ */ + mutex_enter(&ai->ai_lock); + while (ai->ai_ill != NULL) + cv_wait(&ai->ai_ill_unplumb_done, &ai->ai_lock); + mutex_exit(&ai->ai_lock); + } + ASSERT(ill == NULL || IAM_WRITER_ILL(ill)); + + mutex_enter(&arl->arl_lock); + /* + * If the ill had completed unplumbing before arp_modclose(), there + * would be no ill (and therefore, no ipsq) to serialize arp_modclose() + * so that we need to explicitly check for ARL_CONDEMNED and back off + * if it is set. + */ + if ((arl->arl_state_flags & ARL_CONDEMNED) != 0) { + mutex_exit(&arl->arl_lock); + ASSERT(ipsq == NULL); + return (0); + } + arl->arl_state_flags |= ARL_CONDEMNED; + + /* + * send out all pending dlpi messages, don't wait for the ack (which + * will be ignored in arp_rput when CONDEMNED is set) + * + * We have to check for pending DL_UNBIND_REQ because, in the case + * that ip_modclose() executed before arp_modclose(), the call to + * ill_delete_tail->ipif_arp_down() would have triggered a + * DL_UNBIND_REQ. When arp_modclose() executes ipsq_enter() will fail + * (since ip_modclose() is in the ipsq) but the DL_UNBIND_ACK may not + * have been processed yet. In this scenario, we cannot reset + * arl_dlpi_pending, because the setting/clearing of arl_state_flags + * related to unbind, and the associated cv_waits must be allowed to + * continue. + */ + if (arl->arl_dlpi_pending != DL_UNBIND_REQ) + arl->arl_dlpi_pending = DL_PRIM_INVAL; + mp = arl->arl_dlpi_deferred; + arl->arl_dlpi_deferred = NULL; + mutex_exit(&arl->arl_lock); + + for (; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + mp->b_next = NULL; + putnext(arl->arl_wq, mp); + } + + /* Wait for data paths to quiesce */ + mutex_enter(&arl->arl_lock); + while (arl->arl_refcnt != 0) + cv_wait(&arl->arl_cv, &arl->arl_lock); + + /* + * unbind, so that nothing else can come up from driver. 
+ */ + mp = arl_unbind(arl); + mutex_exit(&arl->arl_lock); + if (mp != NULL) + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + + /* wait for unbind ack */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + + qprocsoff(q); + + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + } + + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + ai->ai_arl = NULL; + if (ai->ai_ill == NULL) { + mutex_destroy(&ai->ai_lock); + kmem_free(ai, sizeof (*ai)); + } else { + mutex_exit(&ai->ai_lock); + } + } + + /* free up the rest */ + arp_mod_close_tail(arl); + + q->q_ptr = WR(q)->q_ptr = NULL; + + if (ipsq != NULL) + ipsq_exit(ipsq); + + return (0); +} + +static void +arp_mod_close_tail(arl_t *arl) +{ + ip_stack_t *ipst = arl->arl_ipst; + mblk_t **mpp; + + netstack_hold(ipst->ips_netstack); + + mutex_enter(&ipst->ips_ip_mi_lock); + mi_close_unlink(&ipst->ips_arp_g_head, (IDP)arl); + mutex_exit(&ipst->ips_ip_mi_lock); + + /* + * credp could be null if the open didn't succeed and ip_modopen + * itself calls ip_close. + */ + if (arl->arl_credp != NULL) + crfree(arl->arl_credp); + + /* Free all retained control messages. */ + mpp = &arl->arl_first_mp_to_free; + do { + while (mpp[0]) { + mblk_t *mp; + mblk_t *mp1; + + mp = mpp[0]; + mpp[0] = mp->b_next; + for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { + mp1->b_next = NULL; + mp1->b_prev = NULL; + } + freemsg(mp); + } + } while (mpp++ != &arl->arl_last_mp_to_free); + + netstack_rele(ipst->ips_netstack); + mi_free(arl->arl_name); + mi_close_free((IDP)arl); +} + +/* + * DAD failed. Tear down ipifs with the specified srce address. Note that + * tearing down the ipif also meas deleting the ncec through ipif_down, + * so it is not possible to use nce_timer for recovery. Instead we start + * a timer on the ipif. Caller has to free the mp. 
+ */ +void +arp_failure(mblk_t *mp, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + + if ((mp = copymsg(mp)) != NULL) { + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + } +} + +/* + * This is for exclusive changes due to ARP. Tear down an interface due + * to AR_CN_FAILED and AR_CN_BOGON. + */ +/* ARGSUSED */ +static void +arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + arh_t *arh; + ipaddr_t src; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + uchar_t *haddr; + uint_t haddrlen; + + /* first try src = ar$spa */ + arh = (arh_t *)mp->b_rptr; + bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); + + haddrlen = arh->arh_hlen; + haddr = (uint8_t *)(arh + 1); + + if (haddrlen == ill->ill_phys_addr_length) { + /* + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. For IPMP, we may + * see reflections across any ill in the illgrp. + */ + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && ill->ill_grp != NULL && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, + haddrlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto ignore_conflict; + } + rw_exit(&ipst->ips_ill_g_lock); + } + + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr(src, ill, ALL_ZONES, ipst); + if (ipif == NULL) + goto ignore_conflict; + + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; + + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } + + /* + * If we failed on a recovery probe, then restart the timer to + * try again later. 
+ */ + if (!ipif->ipif_was_dup) { + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + char ibuf[LIFNAMSIZ]; + + (void) mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, sbuf, hbuf); + } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + (void) ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); + +ignore_conflict: + freemsg(mp); +} + +/* + * This is a place for a dtrace hook. + * Note that mp can be either the DL_UNITDATA_IND with a b_cont payload, + * or just the ARP packet payload as an M_DATA. + */ +/* ARGSUSED */ +static void +arp_drop_packet(const char *str, mblk_t *mp, ill_t *ill) +{ + freemsg(mp); +} + +static boolean_t +arp_over_driver(queue_t *q) +{ + queue_t *qnext = STREAM(q)->sd_wrq->q_next; + + /* + * check if first module below stream head is IP or UDP. + */ + ASSERT(qnext != NULL); + if (strcmp(Q2NAME(qnext), "ip") != 0 && + strcmp(Q2NAME(qnext), "udp") != 0) { + /* + * module below is not ip or udp, so arp has been pushed + * on the driver. 
+ */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +arp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + + ASSERT(sflag & MODOPEN); + if (!arp_over_driver(q)) { + q->q_qinfo = dummymodinfo.st_rdinit; + WR(q)->q_qinfo = dummymodinfo.st_wrinit; + return ((*dummymodinfo.st_rdinit->qi_qopen)(q, devp, flag, + sflag, credp)); + } + err = arp_modopen(q, devp, flag, sflag, credp); + return (err); +} + +/* + * In most cases we must be a writer on the IP stream before coming to + * arp_dlpi_send(), to serialize DLPI sends to the driver. The exceptions + * when we are not a writer are very early duing initialization (in + * arl_init, before the arl has done a SLIFNAME, so that we don't yet know + * the associated ill) or during arp_mod_close, when we could not enter the + * ipsq because the ill has already unplumbed. + */ +static void +arp_dlpi_send(arl_t *arl, mblk_t *mp) +{ + mblk_t **mpp; + t_uscalar_t prim; + arl_ill_common_t *ai; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + +#ifdef DEBUG + ai = arl->arl_common; + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + if (ai->ai_ill != NULL) + ASSERT(IAM_WRITER_ILL(ai->ai_ill)); + mutex_exit(&ai->ai_lock); + } +#endif /* DEBUG */ + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending != DL_PRIM_INVAL) { + /* Must queue message. Tail insertion */ + mpp = &arl->arl_dlpi_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&arl->arl_lock); + return; + } + mutex_exit(&arl->arl_lock); + if ((prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive) + == DL_BIND_REQ) { + ASSERT((arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) == 0); + } + /* + * No need to take the arl_lock to examine ARL_CONDEMNED at this point + * because the only thread that can see ARL_CONDEMNED here is the + * closing arp_modclose() thread which sets the flag after becoming a + * writer on the ipsq. 
Threads from IP must have finished and + * cannot be active now. + */ + if (!(arl->arl_state_flags & ARL_CONDEMNED) || + (prim == DL_UNBIND_REQ)) { + if (prim != DL_NOTIFY_CONF) { + ill_t *ill = arl_to_ill(arl); + + arl->arl_dlpi_pending = prim; + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + } + } + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_dlpi_send", + char *, dl_primstr(prim), char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp); +} + +static void +arl_defaults_common(arl_t *arl, mblk_t *mp) +{ + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + /* + * Till the ill is fully up the ill is not globally visible. + * So no need for a lock. + */ + arl->arl_mactype = dlia->dl_mac_type; + arl->arl_sap_length = dlia->dl_sap_length; + + if (!arl->arl_dlpi_style_set) { + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_dlpi_style_set == 0); + arl->arl_dlpi_style_set = 1; + arl->arl_state_flags &= ~ARL_LL_SUBNET_PENDING; + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + } +} + +int +arl_init(queue_t *q, arl_t *arl) +{ + mblk_t *info_mp; + dl_info_req_t *dlir; + + /* subset of ill_init */ + mutex_init(&arl->arl_lock, NULL, MUTEX_DEFAULT, 0); + + arl->arl_rq = q; + arl->arl_wq = WR(q); + + info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (info_mp == NULL) + return (ENOMEM); + /* + * allocate sufficient space to contain device name. + */ + arl->arl_name = (char *)(mi_zalloc(2 * LIFNAMSIZ)); + arl->arl_ppa = UINT_MAX; + arl->arl_state_flags |= (ARL_LL_SUBNET_PENDING | ARL_LL_UNBOUND); + + /* Send down the Info Request to the driver. 
*/ + info_mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)info_mp->b_rptr; + info_mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + arl->arl_dlpi_pending = DL_PRIM_INVAL; + qprocson(q); + + arp_dlpi_send(arl, info_mp); + return (0); +} + +int +arl_wait_for_info_ack(arl_t *arl) +{ + int err; + + mutex_enter(&arl->arl_lock); + while (arl->arl_state_flags & ARL_LL_SUBNET_PENDING) { + /* + * Return value of 0 indicates a pending signal. + */ + err = cv_wait_sig(&arl->arl_cv, &arl->arl_lock); + if (err == 0) { + mutex_exit(&arl->arl_lock); + return (EINTR); + } + } + mutex_exit(&arl->arl_lock); + /* + * ip_rput_other could have set an error in ill_error on + * receipt of M_ERROR. + */ + return (arl->arl_error); +} + +void +arl_set_muxid(ill_t *ill, int muxid) +{ + arl_t *arl; + + arl = ill_to_arl(ill); + if (arl != NULL) { + arl->arl_muxid = muxid; + arl_refrele(arl); + } +} + +int +arl_get_muxid(ill_t *ill) +{ + arl_t *arl; + int muxid = 0; + + arl = ill_to_arl(ill); + if (arl != NULL) { + muxid = arl->arl_muxid; + arl_refrele(arl); + } + return (muxid); +} + +static int +arp_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + zoneid_t zoneid; + netstack_t *ns; + ip_stack_t *ipst; + arl_t *arl = NULL; + + /* + * Prevent unprivileged processes from pushing IP so that + * they can't send raw IP. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + ipst = ns->netstack_ip; + ASSERT(ipst != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make IP operate as if in the global zone. 
+ */ + if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + + arl = (arl_t *)mi_open_alloc_sleep(sizeof (arl_t)); + q->q_ptr = WR(q)->q_ptr = arl; + arl->arl_ipst = ipst; + arl->arl_zoneid = zoneid; + err = arl_init(q, arl); + + if (err != 0) { + mi_free(arl->arl_name); + mi_free(arl); + netstack_rele(ipst->ips_netstack); + q->q_ptr = NULL; + WR(q)->q_ptr = NULL; + return (err); + } + + /* + * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent. + */ + err = arl_wait_for_info_ack(arl); + if (err == 0) + arl->arl_credp = credp; + else + goto fail; + + crhold(credp); + + mutex_enter(&ipst->ips_ip_mi_lock); + err = mi_open_link(&ipst->ips_arp_g_head, (IDP)q->q_ptr, devp, flag, + sflag, credp); + mutex_exit(&ipst->ips_ip_mi_lock); +fail: + if (err) { + (void) arp_close(q, 0); + return (err); + } + return (0); +} + +/* + * Notify any downstream modules (esp softmac and hitbox) of the name + * of this interface using an M_CTL. + */ +static void +arp_ifname_notify(arl_t *arl) +{ + mblk_t *mp1, *mp2; + struct iocblk *iocp; + struct lifreq *lifr; + + if ((mp1 = mkiocb(SIOCSLIFNAME)) == NULL) + return; + if ((mp2 = allocb(sizeof (struct lifreq), BPRI_HI)) == NULL) { + freemsg(mp1); + return; + } + + lifr = (struct lifreq *)mp2->b_rptr; + mp2->b_wptr += sizeof (struct lifreq); + bzero(lifr, sizeof (struct lifreq)); + + (void) strncpy(lifr->lifr_name, arl->arl_name, LIFNAMSIZ); + lifr->lifr_ppa = arl->arl_ppa; + lifr->lifr_flags = ILLF_IPV4; + + /* Use M_CTL to avoid confusing anyone else who might be listening. 
*/ + DB_TYPE(mp1) = M_CTL; + mp1->b_cont = mp2; + iocp = (struct iocblk *)mp1->b_rptr; + iocp->ioc_count = msgsize(mp1->b_cont); + DTRACE_PROBE4(arl__dlpi, char *, "arp_ifname_notify", + char *, "SIOCSLIFNAME", char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp1); +} + +void +arp_send_replumb_conf(ill_t *ill) +{ + mblk_t *mp; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + /* + * arl_got_replumb and arl_got_unbind to be cleared after we complete + * arp_cmd_done. + */ + mp = mexchange(NULL, NULL, sizeof (dl_notify_conf_t), M_PROTO, + DL_NOTIFY_CONF); + ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification = + DL_NOTE_REPLUMB_DONE; + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_REPLUMBING; + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} + +/* + * The unplumb code paths call arp_unbind_complete() to make sure that it is + * safe to tear down the ill. We wait for DL_UNBIND_ACK to complete, and also + * for the arl_refcnt to fall to one so that, when we return from + * arp_unbind_complete(), we know for certain that there are no threads in + * arp_rput() that might access the arl_ill. + */ +void +arp_unbind_complete(ill_t *ill) +{ + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + mutex_enter(&arl->arl_lock); + /* + * wait for unbind ack and arl_refcnt to drop to 1. Note that the + * quiescent arl_refcnt for this function is 1 (and not 0) because + * ill_to_arl() will itself return after taking a ref on the arl_t. 
+ */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + while (arl->arl_refcnt != 1) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c new file mode 100644 index 0000000000..a46a82c85f --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -0,0 +1,1338 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +/* + * Release a reference on ip_xmit_attr. 
+ * The reference is acquired by conn_get_ixa() + */ +#define IXA_REFRELE(ixa) \ +{ \ + if (atomic_add_32_nv(&(ixa)->ixa_refcnt, -1) == 0) \ + ixa_inactive(ixa); \ +} + +#define IXA_REFHOLD(ixa) \ +{ \ + ASSERT((ixa)->ixa_refcnt != 0); \ + atomic_add_32(&(ixa)->ixa_refcnt, 1); \ +} + +/* + * When we need to handle a transmit side asynchronous operation, then we need + * to save sufficient information so that we can call the fragment and postfrag + * functions. That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. + */ +typedef struct ixamblk_s { + boolean_t ixm_inbound; /* B_FALSE */ + iaflags_t ixm_flags; /* ixa_flags */ + netstackid_t ixm_stackid; /* Verify it didn't go away */ + uint_t ixm_ifindex; /* Used to find the nce */ + in6_addr_t ixm_nceaddr_v6; /* Used to find nce */ +#define ixm_nceaddr_v4 V4_PART_OF_V6(ixm_nceaddr_v6) + uint32_t ixm_fragsize; + uint_t ixm_pktlen; + uint16_t ixm_ip_hdr_length; /* Points to ULP header */ + uint8_t ixm_protocol; /* Protocol number for ULP cksum */ + pfirepostfrag_t ixm_postfragfn; + + zoneid_t ixm_zoneid; /* Needed for ipobs */ + zoneid_t ixm_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ + + uint_t ixm_scopeid; /* For IPv6 link-locals */ + + uint32_t ixm_ident; /* For IPv6 fragment header */ + uint32_t ixm_xmit_hint; + + cred_t *ixm_cred; /* For getpeerucred - refhold if set */ + pid_t ixm_cpid; /* For getpeerucred */ + + ts_label_t *ixm_tsl; /* Refhold if set. */ + + /* + * When the pointers below are set they have a refhold on the struct. + */ + ipsec_latch_t *ixm_ipsec_latch; + struct ipsa_s *ixm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *ixm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_policy_s *ixm_ipsec_policy; /* why are we here? 
*/ + struct ipsec_action_s *ixm_ipsec_action; /* For reflected packets */ + + ipsa_ref_t ixm_ipsec_ref[2]; /* Soft reference to SA */ + + /* Need these while waiting for SA */ + uint16_t ixm_ipsec_src_port; /* Source port number of d-gram. */ + uint16_t ixm_ipsec_dst_port; /* Destination port number of d-gram. */ + uint8_t ixm_ipsec_icmp_type; /* ICMP type of d-gram */ + uint8_t ixm_ipsec_icmp_code; /* ICMP code of d-gram */ + + sa_family_t ixm_ipsec_inaf; /* Inner address family */ + uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ + uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ + uint8_t ixm_ipsec_insrcpfx; /* Inner source prefix */ + uint8_t ixm_ipsec_indstpfx; /* Inner destination prefix */ + + uint8_t ixm_ipsec_proto; /* IP protocol number for d-gram. */ +} ixamblk_t; + + +/* + * When we need to handle a receive side asynchronous operation, then we need + * to save sufficient information so that we can call ip_fanout. + * That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. 
+ */ +typedef struct iramblk_s { + boolean_t irm_inbound; /* B_TRUE */ + iaflags_t irm_flags; /* ira_flags */ + netstackid_t irm_stackid; /* Verify it didn't go away */ + uint_t irm_ifindex; /* To find ira_ill */ + + uint_t irm_rifindex; /* ira_rifindex */ + uint_t irm_ruifindex; /* ira_ruifindex */ + uint_t irm_pktlen; + uint16_t irm_ip_hdr_length; /* Points to ULP header */ + uint8_t irm_protocol; /* Protocol number for ULP cksum */ + zoneid_t irm_zoneid; /* ALL_ZONES unless local delivery */ + + squeue_t *irm_sqp; + ill_rx_ring_t *irm_ring; + + ipaddr_t irm_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ + zoneid_t irm_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ + uint32_t irm_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ + + char irm_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ + + cred_t *irm_cred; /* For getpeerucred - refhold if set */ + pid_t irm_cpid; /* For getpeerucred */ + + ts_label_t *irm_tsl; /* Refhold if set. */ + + /* + * When set these correspond to a refhold on the object. + */ + struct ipsa_s *irm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *irm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_action_s *irm_ipsec_action; /* For reflected packets */ +} iramblk_t; + + +/* + * Take the information in ip_xmit_attr_t and stick it in an mblk + * that can later be passed to ip_xmit_attr_from_mblk to recreate the + * ip_xmit_attr_t. + * + * Returns NULL on memory allocation failure. 
+ */ +mblk_t * +ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa) +{ + mblk_t *ixamp; + ixamblk_t *ixm; + nce_t *nce = ixa->ixa_nce; + + ASSERT(nce != NULL); + ixamp = allocb(sizeof (*ixm), BPRI_MED); + if (ixamp == NULL) + return (NULL); + + ixamp->b_datap->db_type = M_BREAK; + ixamp->b_wptr += sizeof (*ixm); + ixm = (ixamblk_t *)ixamp->b_rptr; + + bzero(ixm, sizeof (*ixm)); + ixm->ixm_inbound = B_FALSE; + ixm->ixm_flags = ixa->ixa_flags; + ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid; + ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex; + ixm->ixm_nceaddr_v6 = nce->nce_addr; + ixm->ixm_fragsize = ixa->ixa_fragsize; + ixm->ixm_pktlen = ixa->ixa_pktlen; + ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length; + ixm->ixm_protocol = ixa->ixa_protocol; + ixm->ixm_postfragfn = ixa->ixa_postfragfn; + ixm->ixm_zoneid = ixa->ixa_zoneid; + ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid; + ixm->ixm_scopeid = ixa->ixa_scopeid; + ixm->ixm_ident = ixa->ixa_ident; + ixm->ixm_xmit_hint = ixa->ixa_xmit_hint; + + if (ixa->ixa_tsl != NULL) { + ixm->ixm_tsl = ixa->ixa_tsl; + label_hold(ixm->ixm_tsl); + } + if (ixa->ixa_cred != NULL) { + ixm->ixm_cred = ixa->ixa_cred; + crhold(ixa->ixa_cred); + } + ixm->ixm_cpid = ixa->ixa_cpid; + + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { + if (ixa->ixa_ipsec_ah_sa != NULL) { + ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); + } + if (ixa->ixa_ipsec_esp_sa != NULL) { + ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); + } + if (ixa->ixa_ipsec_policy != NULL) { + ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy; + IPPOL_REFHOLD(ixa->ixa_ipsec_policy); + } + if (ixa->ixa_ipsec_action != NULL) { + ixm->ixm_ipsec_action = ixa->ixa_ipsec_action; + IPACT_REFHOLD(ixa->ixa_ipsec_action); + } + if (ixa->ixa_ipsec_latch != NULL) { + ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch; + IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); + } + ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0]; + 
ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1]; + ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port; + ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port; + ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type; + ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code; + ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf; + ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0]; + ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1]; + ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2]; + ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3]; + ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0]; + ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1]; + ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2]; + ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3]; + ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx; + ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx; + ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto; + } + return (ixamp); +} + +/* + * Extract the ip_xmit_attr_t from the mblk, checking that the + * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is + * not the case. + * + * Otherwise ixa is updated. + * Caller needs to release references on the ixa by calling ixa_refrele() + * which will imediately call ixa_inactive to release the references. 
+ */ +boolean_t +ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa) +{ + ixamblk_t *ixm; + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + nce_t *nce; + + /* We assume the caller hasn't initialized ixa */ + bzero(ixa, sizeof (*ixa)); + + ASSERT(DB_TYPE(ixamp) == M_BREAK); + ASSERT(ixamp->b_cont == NULL); + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(ixm->ixm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(ixm->ixm_ifindex, + !(ixm->ixm_flags & IXAF_IS_IPV4), ipst); + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + /* + * Find the nce. We don't load-spread (only lookup nce's on the ill) + * because we want to find the same nce as the one we had when + * ip_xmit_attr_to_mblk was called. + */ + if (ixm->ixm_flags & IXAF_IS_IPV4) { + nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4); + } else { + nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6); + } + + /* We have the nce, hence the ill can't go away */ + ill_refrele(ill); + if (nce == NULL) { + /* + * Since this is unusual and we don't know what type of + * nce it was, we drop the packet. 
+ */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + + ixa->ixa_flags = ixm->ixm_flags; + ixa->ixa_refcnt = 1; + ixa->ixa_ipst = ipst; + ixa->ixa_fragsize = ixm->ixm_fragsize; + ixa->ixa_pktlen = ixm->ixm_pktlen; + ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length; + ixa->ixa_protocol = ixm->ixm_protocol; + ixa->ixa_nce = nce; + ixa->ixa_postfragfn = ixm->ixm_postfragfn; + ixa->ixa_zoneid = ixm->ixm_zoneid; + ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid; + ixa->ixa_scopeid = ixm->ixm_scopeid; + ixa->ixa_ident = ixm->ixm_ident; + ixa->ixa_xmit_hint = ixm->ixm_xmit_hint; + + if (ixm->ixm_tsl != NULL) { + ixa->ixa_tsl = ixm->ixm_tsl; + ixa->ixa_free_flags |= IXA_FREE_TSL; + } + if (ixm->ixm_cred != NULL) { + ixa->ixa_cred = ixm->ixm_cred; + ixa->ixa_free_flags |= IXA_FREE_CRED; + } + ixa->ixa_cpid = ixm->ixm_cpid; + + ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa; + ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa; + ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy; + ixa->ixa_ipsec_action = ixm->ixm_ipsec_action; + ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch; + + ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0]; + ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1]; + ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port; + ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port; + ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type; + ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code; + ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf; + ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0]; + ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1]; + ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2]; + ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3]; + ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0]; + ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1]; + ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2]; + ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3]; + ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx; + ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx; + 
ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto; + + freeb(ixamp); + return (B_TRUE); +} + +/* + * Free the ixm mblk and any references it holds + * Returns b_cont. + */ +mblk_t * +ip_xmit_attr_free_mblk(mblk_t *ixamp) +{ + ixamblk_t *ixm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(ixamp) == M_BREAK); + mp = ixamp->b_cont; + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + if (ixm->ixm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_ah_sa); + ixm->ixm_ipsec_ah_sa = NULL; + } + if (ixm->ixm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_esp_sa); + ixm->ixm_ipsec_esp_sa = NULL; + } + if (ixm->ixm_ipsec_policy != NULL) { + IPPOL_REFRELE(ixm->ixm_ipsec_policy); + ixm->ixm_ipsec_policy = NULL; + } + if (ixm->ixm_ipsec_action != NULL) { + IPACT_REFRELE(ixm->ixm_ipsec_action); + ixm->ixm_ipsec_action = NULL; + } + if (ixm->ixm_ipsec_latch) { + IPLATCH_REFRELE(ixm->ixm_ipsec_latch); + ixm->ixm_ipsec_latch = NULL; + } + + if (ixm->ixm_tsl != NULL) { + label_rele(ixm->ixm_tsl); + ixm->ixm_tsl = NULL; + } + if (ixm->ixm_cred != NULL) { + crfree(ixm->ixm_cred); + ixm->ixm_cred = NULL; + } + freeb(ixamp); + return (mp); +} + +/* + * Take the information in ip_recv_attr_t and stick it in an mblk + * that can later be passed to ip_recv_attr_from_mblk to recreate the + * ip_recv_attr_t. + * + * Returns NULL on memory allocation failure. 
+ */ +mblk_t * +ip_recv_attr_to_mblk(ip_recv_attr_t *ira) +{ + mblk_t *iramp; + iramblk_t *irm; + ill_t *ill = ira->ira_ill; + + ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0); + + iramp = allocb(sizeof (*irm), BPRI_MED); + if (iramp == NULL) + return (NULL); + + iramp->b_datap->db_type = M_BREAK; + iramp->b_wptr += sizeof (*irm); + irm = (iramblk_t *)iramp->b_rptr; + + bzero(irm, sizeof (*irm)); + irm->irm_inbound = B_TRUE; + irm->irm_flags = ira->ira_flags; + if (ill != NULL) { + /* Internal to IP - preserve ip_stack_t, ill and rill */ + irm->irm_stackid = + ill->ill_ipst->ips_netstack->netstack_stackid; + irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex; + ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex == + ira->ira_rifindex); + } else { + /* Let ip_recv_attr_from_stackid know there isn't one */ + irm->irm_stackid = -1; + } + irm->irm_rifindex = ira->ira_rifindex; + irm->irm_ruifindex = ira->ira_ruifindex; + irm->irm_pktlen = ira->ira_pktlen; + irm->irm_ip_hdr_length = ira->ira_ip_hdr_length; + irm->irm_protocol = ira->ira_protocol; + + irm->irm_sqp = ira->ira_sqp; + irm->irm_ring = ira->ira_ring; + + irm->irm_zoneid = ira->ira_zoneid; + irm->irm_mroute_tunnel = ira->ira_mroute_tunnel; + irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid; + irm->irm_esp_udp_ports = ira->ira_esp_udp_ports; + + if (ira->ira_tsl != NULL) { + irm->irm_tsl = ira->ira_tsl; + label_hold(irm->irm_tsl); + } + if (ira->ira_cred != NULL) { + irm->irm_cred = ira->ira_cred; + crhold(ira->ira_cred); + } + irm->irm_cpid = ira->ira_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE); + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (ira->ira_ipsec_ah_sa != NULL) { + irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa; + IPSA_REFHOLD(ira->ira_ipsec_ah_sa); + } + if (ira->ira_ipsec_esp_sa != NULL) { + irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa; + IPSA_REFHOLD(ira->ira_ipsec_esp_sa); + } + if (ira->ira_ipsec_action != NULL) { + 
irm->irm_ipsec_action = ira->ira_ipsec_action; + IPACT_REFHOLD(ira->ira_ipsec_action); + } + } + return (iramp); +} + +/* + * Extract the ip_recv_attr_t from the mblk. If we are used inside IP + * then irm_stackid is not -1, in which case we check that the + * ip_stack_t and ill_t still exist. Returns B_FALSE if that is + * not the case. + * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter) + * and we just proceed with ira_ill and ira_rill as NULL. + * + * The caller needs to release any references on the pointers inside the ire + * by calling ira_cleanup. + */ +boolean_t +ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira) +{ + iramblk_t *irm; + netstack_t *ns; + ip_stack_t *ipst = NULL; + ill_t *ill = NULL, *rill = NULL; + + /* We assume the caller hasn't initialized ira */ + bzero(ira, sizeof (*ira)); + + ASSERT(DB_TYPE(iramp) == M_BREAK); + ASSERT(iramp->b_cont == NULL); + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_stackid != -1) { + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(irm->irm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(irm->irm_ifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + + if (irm->irm_ifindex == irm->irm_rifindex) { + rill = ill; + } else { + rill = ill_lookup_on_ifindex(irm->irm_rifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + } + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL || rill == NULL) { + /* Disappeared on us */ + if (ill != NULL) + ill_refrele(ill); + if (rill != NULL && rill != ill) + ill_refrele(rill); + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + } + + ira->ira_flags = irm->irm_flags; + /* Caller must ill_refele(ira_ill) by using ira_cleanup() */ + ira->ira_ill = ill; + ira->ira_rill = 
rill; + + ira->ira_rifindex = irm->irm_rifindex; + ira->ira_ruifindex = irm->irm_ruifindex; + ira->ira_pktlen = irm->irm_pktlen; + ira->ira_ip_hdr_length = irm->irm_ip_hdr_length; + ira->ira_protocol = irm->irm_protocol; + + ira->ira_sqp = irm->irm_sqp; + /* The rest of IP assumes that the rings never go away. */ + ira->ira_ring = irm->irm_ring; + + ira->ira_zoneid = irm->irm_zoneid; + ira->ira_mroute_tunnel = irm->irm_mroute_tunnel; + ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid; + ira->ira_esp_udp_ports = irm->irm_esp_udp_ports; + + if (irm->irm_tsl != NULL) { + ira->ira_tsl = irm->irm_tsl; + ira->ira_free_flags |= IRA_FREE_TSL; + } + if (irm->irm_cred != NULL) { + ira->ira_cred = irm->irm_cred; + ira->ira_free_flags |= IRA_FREE_CRED; + } + ira->ira_cpid = irm->irm_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE); + + ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa; + ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa; + ira->ira_ipsec_action = irm->irm_ipsec_action; + + freeb(iramp); + return (B_TRUE); +} + +/* + * Free the irm mblk and any references it holds + * Returns b_cont. 
+ */ +mblk_t * +ip_recv_attr_free_mblk(mblk_t *iramp) +{ + iramblk_t *irm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(iramp) == M_BREAK); + mp = iramp->b_cont; + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_ah_sa); + irm->irm_ipsec_ah_sa = NULL; + } + if (irm->irm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_esp_sa); + irm->irm_ipsec_esp_sa = NULL; + } + if (irm->irm_ipsec_action != NULL) { + IPACT_REFRELE(irm->irm_ipsec_action); + irm->irm_ipsec_action = NULL; + } + if (irm->irm_tsl != NULL) { + label_rele(irm->irm_tsl); + irm->irm_tsl = NULL; + } + if (irm->irm_cred != NULL) { + crfree(irm->irm_cred); + irm->irm_cred = NULL; + } + + freeb(iramp); + return (mp); +} + +/* + * Returns true if the mblk contains an ip_recv_attr_t + * For now we just check db_type. + */ +boolean_t +ip_recv_attr_is_mblk(mblk_t *mp) +{ + /* + * Need to handle the various forms of tcp_timermp which are tagged + * with b_wptr and might have a NULL b_datap. 
+ */ + if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1) + return (B_FALSE); + +#ifdef DEBUG + iramblk_t *irm; + + if (DB_TYPE(mp) != M_BREAK) + return (B_FALSE); + + irm = (iramblk_t *)mp->b_rptr; + ASSERT(irm->irm_inbound); + return (B_TRUE); +#else + return (DB_TYPE(mp) == M_BREAK); +#endif +} + +static ip_xmit_attr_t * +conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag) +{ + ip_xmit_attr_t *ixa; + ip_xmit_attr_t *oldixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + if (atomic_add_32_nv(&ixa->ixa_refcnt, 1) == 2) { + /* No other thread using conn_ixa */ + mutex_exit(&connp->conn_lock); + return (ixa); + } + ixa = kmem_alloc(sizeof (*ixa), kmflag); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + + /* Make sure we drop conn_lock before any refrele */ + if (replace) { + ixa->ixa_refcnt++; /* No atomic needed - not visible */ + oldixa = connp->conn_ixa; + connp->conn_ixa = ixa; + mutex_exit(&connp->conn_lock); + IXA_REFRELE(oldixa); /* Undo refcnt from conn_t */ + } else { + oldixa = connp->conn_ixa; + mutex_exit(&connp->conn_lock); + } + IXA_REFRELE(oldixa); /* Undo above atomic_add_32_nv */ + + return (ixa); +} + +/* + * Return an ip_xmit_attr_t to use with a conn_t that ensures that only + * the caller can access the ip_xmit_attr_t. + * + * If nobody else is using conn_ixa we return it. + * Otherwise we make a "safe" copy of conn_ixa + * and return it. The "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. + * + * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t. + * That is used when we connect() the ULP. 
+ */ +ip_xmit_attr_t * +conn_get_ixa(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP)); +} + +/* + * Used only when the option is to have the kernel hang due to not + * cleaning up ixa references on ills etc. + */ +ip_xmit_attr_t * +conn_get_ixa_tryhard(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_SLEEP)); +} + +/* + * Replace conn_ixa with the ixa argument. + * + * The caller must hold conn_lock. + * + * We return the old ixa; the caller must ixa_refrele that after conn_lock + * has been dropped. + */ +ip_xmit_attr_t * +conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa) +{ + ip_xmit_attr_t *oldixa; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + oldixa = connp->conn_ixa; + IXA_REFHOLD(ixa); + connp->conn_ixa = ixa; + return (oldixa); +} + +/* + * Return a ip_xmit_attr_t to use with a conn_t that is based on but + * separate from conn_ixa. + * + * This "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. 
+ */ +ip_xmit_attr_t * +conn_get_ixa_exclusive(conn_t *connp) +{ + ip_xmit_attr_t *ixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + + /* Make sure conn_ixa doesn't disappear while we copy it */ + atomic_add_32(&ixa->ixa_refcnt, 1); + + ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + mutex_exit(&connp->conn_lock); + IXA_REFRELE(connp->conn_ixa); + return (ixa); +} + +void +ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) +{ + bcopy(src, ixa, sizeof (*ixa)); + ixa->ixa_refcnt = 1; + /* + * Clear any pointers that have references and might be changed + * by ip_set_destination or the ULP + */ + ixa->ixa_ire = NULL; + ixa->ixa_nce = NULL; + ixa->ixa_dce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; +#ifdef DEBUG + ixa->ixa_curthread = NULL; +#endif + /* Clear all the IPsec pointers and the flag as well. */ + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + + ixa->ixa_ipsec_latch = NULL; + ixa->ixa_ipsec_ah_sa = NULL; + ixa->ixa_ipsec_esp_sa = NULL; + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_ipsec_action = NULL; + + /* + * We leave ixa_tsl unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) + label_hold(ixa->ixa_tsl); + + /* + * We leave ixa_cred unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_CRED) + crhold(ixa->ixa_cred); +} + +/* + * Duplicate an ip_xmit_attr_t. + * Assumes that the caller controls the ixa, hence we do not need to use + * a safe copy. We just have to increase the refcnt on any pointers. 
 */
ip_xmit_attr_t *
ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
{
	ip_xmit_attr_t *ixa;

	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
	if (ixa == NULL)
		return (NULL);	/* caller must handle allocation failure */
	bcopy(src_ixa, ixa, sizeof (*ixa));
	ixa->ixa_refcnt = 1;

	/* Take our own hold on every cached pointer we just copied. */
	if (ixa->ixa_ire != NULL)
		ire_refhold_notr(ixa->ixa_ire);
	if (ixa->ixa_nce != NULL)
		nce_refhold(ixa->ixa_nce);
	if (ixa->ixa_dce != NULL)
		dce_refhold_notr(ixa->ixa_dce);

#ifdef DEBUG
	/* The duplicate is not (yet) exclusively owned by any thread. */
	ixa->ixa_curthread = NULL;
#endif

	/* IPsec state is shared by reference; bump each refcnt. */
	if (ixa->ixa_ipsec_latch != NULL)
		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
	if (ixa->ixa_ipsec_ah_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
	if (ixa->ixa_ipsec_esp_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
	if (ixa->ixa_ipsec_policy != NULL)
		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
	if (ixa->ixa_ipsec_action != NULL)
		IPACT_REFHOLD(ixa->ixa_ipsec_action);

	/*
	 * Unlike ixa_safe_copy(), hold tsl/cred unconditionally (when
	 * present) and mark them for release, since the duplicate may
	 * outlive whatever the source's holds were tied to.
	 */
	if (ixa->ixa_tsl != NULL) {
		label_hold(ixa->ixa_tsl);
		ixa->ixa_free_flags |= IXA_FREE_TSL;
	}
	if (ixa->ixa_cred != NULL) {
		crhold(ixa->ixa_cred);
		ixa->ixa_free_flags |= IXA_FREE_CRED;
	}
	return (ixa);
}

/*
 * Used to replace the ixa_label field.
 * The caller should have a reference on the label, which we transfer to
 * the attributes so that when the attribute is freed/cleaned up
 * we will release that reference.
 */
void
ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
{
	ASSERT(tsl != NULL);

	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
	} else {
		/* No old reference to drop; just note we now own one. */
		ixa->ixa_free_flags |= IXA_FREE_TSL;
	}
	ixa->ixa_tsl = tsl;
}

/*
 * Replace the ip_recv_attr_t's label.
 * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 * This can fail (and return B_FALSE) due to lack of memory.
 */
boolean_t
ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
{
	cred_t *newcr;

	if (ira->ira_free_flags & IRA_FREE_TSL) {
		ASSERT(ira->ira_tsl != NULL);
		label_rele(ira->ira_tsl);
	}
	label_hold(tsl);
	ira->ira_tsl = tsl;
	ira->ira_free_flags |= IRA_FREE_TSL;

	/*
	 * Reset zoneid if we have a shared address. That allows
	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
	 */
	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
		ira->ira_zoneid = ALL_ZONES;

	/* We update ira_cred for RPC */
	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
	if (newcr == NULL)
		return (B_FALSE);
	/*
	 * NOTE(review): on the failure return above, ira_tsl has already
	 * been replaced while ira_cred has not — confirm callers treat a
	 * B_FALSE return as fatal for the packet so the mismatch is benign.
	 */
	if (ira->ira_free_flags & IRA_FREE_CRED)
		crfree(ira->ira_cred);
	ira->ira_cred = newcr;
	ira->ira_free_flags |= IRA_FREE_CRED;
	return (B_TRUE);
}

/*
 * This needs to be called after ip_set_destination/tsol_check_dest might
 * have changed ixa_tsl to be specific for a destination, and we now want to
 * send to a different destination.
 * We have to restart with crgetlabel() since ip_set_destination/
 * tsol_check_dest will start with ixa_tsl.
 */
void
ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
{
	if (!is_system_labeled())
		return;

	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
	}
	/* No hold taken: the label now borrows the credential's reference. */
	ixa->ixa_tsl = crgetlabel(cr);
}

/*
 * Release a reference on an ixa; the IXA_REFRELE macro frees it via
 * ixa_inactive() when the last reference is dropped.
 */
void
ixa_refrele(ip_xmit_attr_t *ixa)
{
	IXA_REFRELE(ixa);
}

/*
 * Called once the refcnt has dropped to zero: release everything the
 * ixa points at and free the structure itself.
 */
void
ixa_inactive(ip_xmit_attr_t *ixa)
{
	ASSERT(ixa->ixa_refcnt == 0);

	ixa_cleanup(ixa);
	kmem_free(ixa, sizeof (*ixa));
}

/*
 * Release any references contained in the ixa.
 * Also clear any fields that are not controlled by ixa_flags.
 */
void
ixa_cleanup(ip_xmit_attr_t *ixa)
{
	/* Drop cached routing state: IRE, DCE, and NCE references. */
	if (ixa->ixa_ire != NULL) {
		ire_refrele_notr(ixa->ixa_ire);
		ixa->ixa_ire = NULL;
	}
	if (ixa->ixa_dce != NULL) {
		dce_refrele_notr(ixa->ixa_dce);
		ixa->ixa_dce = NULL;
	}
	if (ixa->ixa_nce != NULL) {
		nce_refrele(ixa->ixa_nce);
		ixa->ixa_nce = NULL;
	}
	/* Force the next sender to re-verify/re-establish the pointers. */
	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
		ipsec_out_release_refs(ixa);
	}
	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
		ixa->ixa_tsl = NULL;
		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
	}
	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
		ASSERT(ixa->ixa_cred != NULL);
		crfree(ixa->ixa_cred);
		ixa->ixa_cred = NULL;
		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
	}
	/* Reset per-send fields that are not covered by ixa_flags. */
	ixa->ixa_src_preferences = 0;
	ixa->ixa_ifindex = 0;
	ixa->ixa_multicast_ifindex = 0;
	ixa->ixa_multicast_ifaddr = INADDR_ANY;
}

/*
 * Release any references contained in the ira.
 * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
 * argument.
 */
void
ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
{
	if (ira->ira_ill != NULL) {
		if (ira->ira_rill != ira->ira_ill) {
			/* Caused by async processing */
			ill_refrele(ira->ira_rill);
		}
		if (refrele_ill)
			ill_refrele(ira->ira_ill);
	}
	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
		ipsec_in_release_refs(ira);
	}
	if (ira->ira_free_flags & IRA_FREE_TSL) {
		ASSERT(ira->ira_tsl != NULL);
		label_rele(ira->ira_tsl);
		ira->ira_tsl = NULL;
		ira->ira_free_flags &= ~IRA_FREE_TSL;
	}
	if (ira->ira_free_flags & IRA_FREE_CRED) {
		ASSERT(ira->ira_cred != NULL);
		crfree(ira->ira_cred);
		ira->ira_cred = NULL;
		ira->ira_free_flags &= ~IRA_FREE_CRED;
	}
}

/*
 * Function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * The caller is responsible for any serialization which is different
 * for TCP, SCTP, and others.
 */
static void
ixa_cleanup_stale(ip_xmit_attr_t *ixa)
{
	ire_t *ire;
	nce_t *nce;
	dce_t *dce;

	ire = ixa->ixa_ire;
	nce = ixa->ixa_nce;
	dce = ixa->ixa_dce;

	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
		/* Substitute a blackhole IRE so ixa_ire is never NULL. */
		ire_refrele_notr(ire);
		ire = ire_blackhole(ixa->ixa_ipst,
		    !(ixa->ixa_flags & IXAF_IS_IPV4));
		ASSERT(ire != NULL);
#ifdef DEBUG
		/*
		 * Presumably converts the tracked hold from ire_blackhole()
		 * into the untracked kind the ixa keeps — confirm against
		 * the reference-tracing convention.
		 */
		ire_refhold_notr(ire);
		ire_refrele(ire);
#endif
		ixa->ixa_ire = ire;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}
	if (nce != NULL && nce->nce_is_condemned) {
		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}
	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
		/* The default DCE always exists, so ixa_dce stays non-NULL. */
		dce_refrele_notr(dce);
		dce = dce_get_default(ixa->ixa_ipst);
		ASSERT(dce != NULL);
#ifdef DEBUG
		dce_refhold_notr(dce);
		dce_refrele(dce);
#endif
		ixa->ixa_dce = dce;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
	}
}

/*
 * Used to run ixa_cleanup_stale inside the tcp squeue.
 * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
 * and waking up the caller.
 */
/* ARGSUSED2 */
static void
tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy)
{
	conn_t *connp = (conn_t *)arg;
	tcp_stack_t *tcps;

	tcps = connp->conn_netstack->netstack_tcp;

	ixa_cleanup_stale(connp->conn_ixa);

	/* Return the mblk and wake the thread waiting in conn_ixa_cleanup. */
	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
	ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
	tcps->tcps_ixa_cleanup_mp = mp;
	cv_signal(&tcps->tcps_ixa_cleanup_cv);
	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}


/*
 * ipcl_walk() function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * Note that we can't cleanup the pointers since there can be threads
 * in conn_ip_output() sending while we are called.
+ */ +void +conn_ixa_cleanup(conn_t *connp, void *arg) +{ + boolean_t tryhard = (boolean_t)arg; + + if (IPCL_IS_TCP(connp)) { + mblk_t *mp; + tcp_stack_t *tcps; + + tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + if (connp->conn_sqp->sq_run == curthread) { + /* Already on squeue */ + tcp_ixa_cleanup(connp, mp, NULL, NULL); + } else { + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, + connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); + + /* Wait until tcp_ixa_cleanup has run */ + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while (tcps->tcps_ixa_cleanup_mp == NULL) { + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + } + } else if (IPCL_IS_SCTP(connp)) { + sctp_t *sctp; + sctp_faddr_t *fp; + + sctp = CONN2SCTP(connp); + RUN_SCTP(sctp); + ixa_cleanup_stale(connp->conn_ixa); + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) + ixa_cleanup_stale(fp->ixa); + WAKE_SCTP(sctp); + } else { + ip_xmit_attr_t *ixa; + + /* + * If there is a different thread using conn_ixa then we get a + * new copy and cut the old one loose from conn_ixa. Otherwise + * we use conn_ixa and prevent any other thread from + * using/changing it. Anybody using conn_ixa (e.g., a thread in + * conn_ip_output) will do an ixa_refrele which will remove any + * references on the ire etc. + * + * Once we are done other threads can use conn_ixa since the + * refcnt will be back at one. + * + * We are called either because an ill is going away, or + * due to memory reclaim. 
In the former case we wait for + * memory since we must remove the refcnts on the ill. + */ + if (tryhard) { + ixa = conn_get_ixa_tryhard(connp, B_TRUE); + ASSERT(ixa != NULL); + } else { + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) { + /* + * Somebody else was using it and kmem_alloc + * failed! Next memory reclaim will try to + * clean up. + */ + DTRACE_PROBE1(conn__ixa__cleanup__bail, + conn_t *, connp); + return; + } + } + ixa_cleanup_stale(ixa); + ixa_refrele(ixa); + } +} + +/* + * ixa needs to be an exclusive copy so that no one changes the cookie + * or the ixa_nce. + */ +boolean_t +ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa) +{ + uintptr_t cookie = ixa->ixa_cookie; + ill_dld_direct_t *idd; + idl_tx_list_t *idl_txl; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t inserted = B_FALSE; + + idd = &(ill)->ill_dld_capab->idc_direct; + idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + if (cookie == 0) { + /* + * ip_xmit failed the canputnext check + */ + connp->conn_did_putbq = 1; + ASSERT(cookie == 0); + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + return (B_TRUE); + } + ASSERT(ILL_DIRECT_CAPABLE(ill)); + mutex_enter(&idl_txl->txl_lock); + if (connp->conn_direct_blocked || + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0)) { + DTRACE_PROBE1(ill__tx__not__blocked, boolean, + connp->conn_direct_blocked); + } else if (idl_txl->txl_cookie != NULL && + idl_txl->txl_cookie != ixa->ixa_cookie) { + DTRACE_PROBE2(ill__send__tx__collision, uintptr_t, cookie, + uintptr_t, idl_txl->txl_cookie); + /* bump kstat for cookie collision */ + } else { + connp->conn_direct_blocked = B_TRUE; + idl_txl->txl_cookie = cookie; + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + inserted = B_TRUE; + } + mutex_exit(&idl_txl->txl_lock); + return (inserted); +} diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c 
new file mode 100644 index 0000000000..839c5ae0d0 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_dce.c @@ -0,0 +1,873 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/snmpcom.h> + +#include <netinet/ip6.h> +#include <netinet/icmp6.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ipclassifier.h> +#include <inet/ip_listutils.h> + +#include <sys/sunddi.h> + +/* + * Routines for handling destination cache entries. + * There is always one DCEF_DEFAULT for each ip_stack_t created at init time. 
+ * That entry holds both the IP ident value and the dce generation number. + * + * Any time a DCE is changed significantly (different path MTU, but NOT + * different ULP info!), the dce_generation number is increased. + * Also, when a new DCE is created, the dce_generation number in the default + * DCE is bumped. That allows the dce_t information to be cached efficiently + * as long as the entity caching the dce_t also caches the dce_generation, + * and compares the cached generation to detect any changes. + * Furthermore, when a DCE is deleted, if there are any outstanding references + * to the DCE it will be marked as condemned. The condemned mark is + * a designated generation number which is never otherwise used, hence + * the single comparison with the generation number captures that as well. + * + * An example of code which caches is as follows: + * + * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) { + * The DCE has changed + * mystruct->my_dce = dce_lookup_pkt(mp, ixa, + * &mystruct->my_dce_generation); + * Not needed in practice, since we have the default DCE: + * if (DCE_IS_CONDEMNED(mystruct->my_dce)) + * return failure; + * } + * + * Note that for IPv6 link-local addresses we record the ifindex since the + * link-locals are not globally unique. + */ + +/* + * Hash bucket structure for DCEs + */ +typedef struct dcb_s { + krwlock_t dcb_lock; + uint32_t dcb_cnt; + dce_t *dcb_dce; +} dcb_t; + +static void dce_delete_locked(dcb_t *, dce_t *); +static void dce_make_condemned(dce_t *); + +static kmem_cache_t *dce_cache; + + +/* Operates on a uint64_t */ +#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) + +/* + * Reclaim a fraction of dce's in the dcb. + * For now we have a higher probability to delete DCEs without DCE_PMTU. 
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	/* PMTU entries are 4x less likely to be reclaimed (1/(4*fraction)) */
	uint_t fraction_pmtu = fraction*4;
	uint_t hash;
	dce_t *dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			/* Let cached users notice the PMTU went away */
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		/* Pseudo-random selection keyed off the dce's address */
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * Reclaim a fraction of the DCEs in every bucket of one IP stack, then
 * get CONNs to drop any stale cached references; invoked from the
 * dce_cache kmem reclaim callback (ip_dce_reclaim) under memory pressure.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;

	/* Reclaim in every network stack instance on the system. */
	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ip_dce_reclaim_stack(ns->netstack_ip);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

/*
 * Create the global dce kmem cache; ip_dce_reclaim is registered as the
 * cache's reclaim callback for memory pressure.
 */
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}


/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

/*
 * Tear down the per-stack DCE state; all per-address entries must already
 * be gone (only the default DCE's own reference may remain).
 */
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		/* Link-locals are not globally unique: qualify by ifindex */
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
+ */ +dce_t * +dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp) +{ + uint_t hash; + dcb_t *dcb; + dce_t *dce; + + /* Set *generationp before dropping the lock(s) that allow additions */ + if (generationp != NULL) + *generationp = ipst->ips_dce_default->dce_generation; + + hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); + dcb = &ipst->ips_dce_hash_v4[hash]; + rw_enter(&dcb->dcb_lock, RW_READER); + for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { + if (dce->dce_v4addr == dst) { + mutex_enter(&dce->dce_lock); + if (!DCE_IS_CONDEMNED(dce)) { + dce_refhold(dce); + if (generationp != NULL) + *generationp = dce->dce_generation; + mutex_exit(&dce->dce_lock); + rw_exit(&dcb->dcb_lock); + return (dce); + } + mutex_exit(&dce->dce_lock); + } + } + rw_exit(&dcb->dcb_lock); + /* Not found */ + dce = ipst->ips_dce_default; + dce_refhold(dce); + return (dce); +} + +/* + * Used by callers that need to cache e.g., the datapath + * Returns the generation number in the last argument. 
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		/* Match both address and ifindex (0 for non-link-locals) */
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found: fall back to the always-present default DCE */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
+ */ +dce_t * +dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) +{ + uint_t hash; + dcb_t *dcb; + dce_t *dce; + + hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); + dcb = &ipst->ips_dce_hash_v4[hash]; + rw_enter(&dcb->dcb_lock, RW_WRITER); + for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { + if (dce->dce_v4addr == dst) { + mutex_enter(&dce->dce_lock); + if (!DCE_IS_CONDEMNED(dce)) { + dce_refhold(dce); + mutex_exit(&dce->dce_lock); + rw_exit(&dcb->dcb_lock); + return (dce); + } + mutex_exit(&dce->dce_lock); + } + } + dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP); + if (dce == NULL) { + rw_exit(&dcb->dcb_lock); + return (NULL); + } + bzero(dce, sizeof (dce_t)); + dce->dce_ipst = ipst; /* No netstack_hold */ + dce->dce_v4addr = dst; + dce->dce_generation = DCE_GENERATION_INITIAL; + dce->dce_ipversion = IPV4_VERSION; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + dce_refhold(dce); /* For the hash list */ + + /* Link into list */ + if (dcb->dcb_dce != NULL) + dcb->dcb_dce->dce_ptpn = &dce->dce_next; + dce->dce_next = dcb->dcb_dce; + dce->dce_ptpn = &dcb->dcb_dce; + dcb->dcb_dce = dce; + dce->dce_bucket = dcb; + dce_refhold(dce); /* For the caller */ + rw_exit(&dcb->dcb_lock); + + /* Initialize dce_ident to be different than for the last packet */ + dce->dce_ident = ipst->ips_dce_default->dce_ident + 1; + + dce_increment_generation(ipst->ips_dce_default); + return (dce); +} + +/* + * Atomically looks for a non-default DCE, and if not found tries to create one. + * If there is no memory it returns NULL. + * When an entry is created we increase the generation number on + * the default DCE so that conn_ip_output will detect there is a new DCE. + * ifindex should only be used with link-local addresses. 
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/* Writer lock makes the lookup-then-insert atomic */
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/*
	 * Initialize dce_ident to be different than for the last packet.
	 * NOTE(review): this store happens after dcb_lock is dropped, when
	 * the entry is already visible to other threads — confirm no reader
	 * depends on dce_ident being set under the lock.
	 */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		if (dce->dce_uinfo.iulp_rtt != 0) {
			/* Average the new estimate with the cached one. */
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			/*
			 * No cached value yet: initialize conservatively
			 * to 1.5 * the new value.
			 */
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			/* Average the new deviation with the cached one. */
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			/* No cached value: initialize to 1.5 * new value. */
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		/* Only ever shrink an existing path MTU estimate. */
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}


/*
 * Record uinfo for an IPv4 destination, creating the dce if needed.
 * Returns ENOMEM if the entry could not be created.
 */
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/*
 * Record uinfo for an IPv6 destination, creating the dce if needed.
 * Returns ENOMEM if the entry could not be created.
 */
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	/* V4-mapped addresses are stored in the IPv4 table. */
	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

/*
 * Mark a dce as deleted; the condemned generation number makes any
 * cached generation comparison fail, so cached users drop the entry.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		/* Skip over the two reserved generation values on wrap */
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int i;
	dcb_t *dcb;
	dce_t *dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			/* Unlocked check; dce_increment_generation relocks */
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	/* Unlink from the bucket's doubly-linked (ptpn) chain. */
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

/*
 * Last reference is gone: return the dce to the kmem cache.
 * Must never be called on the default DCE or a still-linked entry.
 */
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs.
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr *optp;
	mblk_t *mp2ctl;
	dest_cache_entry_t dest_cache;
	mblk_t *mp_tail = NULL;
	dce_t *dce;
	dcb_t *dcb;
	int i;
	uint64_t current_time;

	current_time = TICK_TO_SEC(lbolt64);

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			/* Report a PMTU of 0 when none has been learned */
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	/* A second copy is handed back for the caller's next table */
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			/* ifindex is only meaningful for link-locals */
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
+ */ +void +dce_cleanup(uint_t ifindex, ip_stack_t *ipst) +{ + uint_t i; + dcb_t *dcb; + dce_t *dce, *nextdce; + + for (i = 0; i < ipst->ips_dce_hashsize; i++) { + dcb = &ipst->ips_dce_hash_v6[i]; + rw_enter(&dcb->dcb_lock, RW_WRITER); + + for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { + nextdce = dce->dce_next; + if (dce->dce_ifindex == ifindex) { + dce_delete_locked(dcb, dce); + dce_refrele(dce); + } + } + rw_exit(&dcb->dcb_lock); + } +} diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 9e228c2925..771dd9f62f 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -42,7 +42,6 @@ #include <sys/param.h> #include <sys/socket.h> #include <sys/strsubr.h> -#include <sys/pattr.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> @@ -50,6 +49,7 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/mib2.h> @@ -65,7 +65,6 @@ #include <inet/nd.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> @@ -78,87 +77,34 @@ (((ire)->ire_type & IRE_DEFAULT) || \ (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) -/* - * structure for passing args between ire_ftable_lookup and ire_find_best_route - */ -typedef struct ire_ftable_args_s { - ipaddr_t ift_addr; - ipaddr_t ift_mask; - ipaddr_t ift_gateway; - int ift_type; - const ipif_t *ift_ipif; - zoneid_t ift_zoneid; - uint32_t ift_ihandle; - const ts_label_t *ift_tsl; - int ift_flags; - ire_t *ift_best_ire; -} ire_ftable_args_t; - static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); -static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *, - ip_stack_t *); -static void ire_del_host_redir(ire_t *, char *); -static boolean_t ire_find_best_route(struct radix_node *, void *); -static int 
ip_send_align_hcksum_flags(mblk_t *, ill_t *); -static ire_t *ire_ftable_lookup_simple(ipaddr_t, - ire_t **, zoneid_t, int, ip_stack_t *); +static void ire_del_host_redir(ire_t *, char *); +static boolean_t ire_find_best_route(struct radix_node *, void *); /* * Lookup a route in forwarding table. A specific lookup is indicated by * passing the required parameters and indicating the match required in the * flag field. * - * Looking for default route can be done in three ways - * 1) pass mask as 0 and set MATCH_IRE_MASK in flags field - * along with other matches. - * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags - * field along with other matches. - * 3) if the destination and mask are passed as zeros. - * - * A request to return a default route if no route - * is found, can be specified by setting MATCH_IRE_DEFAULT - * in flags. - * - * It does not support recursion more than one level. It - * will do recursive lookup only when the lookup maps to - * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed. - * - * If the routing table is setup to allow more than one level - * of recursion, the cleaning up cache table will not work resulting - * in invalid routing. - * * Supports IP_BOUND_IF by following the ipif/ill when recursing. - * - * NOTE : When this function returns NULL, pire has already been released. - * pire is valid only when this function successfully returns an - * ire. 
*/ ire_t * -ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, - uint32_t ihandle, const ts_label_t *tsl, int flags, ip_stack_t *ipst) +ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, + int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, + int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { - ire_t *ire = NULL; - ipaddr_t gw_addr; + ire_t *ire; struct rt_sockaddr rdst, rmask; struct rt_entry *rt; ire_ftable_args_t margs; - boolean_t found_incomplete = B_FALSE; - ASSERT(ipif == NULL || !ipif->ipif_isv6); + ASSERT(ill == NULL || !ill->ill_isv6); /* - * When we return NULL from this function, we should make - * sure that *pire is NULL so that the callers will not - * wrongly REFRELE the pire. - */ - if (pire != NULL) - *pire = NULL; - /* - * ire_match_args() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. + * ire_match_args() will dereference ill if MATCH_IRE_ILL + * is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) + if ((flags & MATCH_IRE_ILL) && (ill == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); @@ -176,9 +122,8 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, margs.ift_mask = mask; margs.ift_gateway = gateway; margs.ift_type = type; - margs.ift_ipif = ipif; + margs.ift_ill = ill; margs.ift_zoneid = zoneid; - margs.ift_ihandle = ihandle; margs.ift_tsl = tsl; margs.ift_flags = flags; @@ -191,232 +136,93 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * each matching leaf in the radix tree. ire_match_args is * invoked by the callback function ire_find_best_route() * We hold the global tree lock in read mode when calling - * rn_match_args.Before dropping the global tree lock, ensure + * rn_match_args. Before dropping the global tree lock, ensure * that the radix node can't be deleted by incrementing ire_refcnt. 
*/ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, ire_find_best_route, &margs); ire = margs.ift_best_ire; - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - if (rt == NULL) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (NULL); - } else { - ASSERT(ire != NULL); } + ASSERT(ire != NULL); DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); - if (!IS_DEFAULT_ROUTE(ire)) - goto found_ire_held; - /* - * If default route is found, see if default matching criteria - * are satisfied. - */ - if (flags & MATCH_IRE_MASK) { - /* - * we were asked to match a 0 mask, and came back with - * a default route. Ok to return it. - */ - goto found_default_ire; - } - if ((flags & MATCH_IRE_TYPE) && - (type & (IRE_DEFAULT | IRE_INTERFACE))) { - /* - * we were asked to match a default ire type. Ok to return it. - */ - goto found_default_ire; - } - if (flags & MATCH_IRE_DEFAULT) { - goto found_default_ire; - } - /* - * we found a default route, but default matching criteria - * are not specified and we are not explicitly looking for - * default. - */ - IRE_REFRELE(ire); - return (NULL); -found_default_ire: /* * round-robin only if we have more than one route in the bucket. 
+ * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never */ - if ((ire->ire_bucket->irb_ire_cnt > 1) && - IS_DEFAULT_ROUTE(ire) && - ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) == - MATCH_IRE_DEFAULT)) { - ire_t *next_ire; - - next_ire = ire_round_robin(ire->ire_bucket, zoneid, &margs, - ipst); - IRE_REFRELE(ire); - if (next_ire != NULL) { + if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE(ire))) { + ire_t *next_ire; + + margs.ift_best_ire = NULL; + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + goto done; + } + ire_refrele(ire); ire = next_ire; - } else { - /* no route */ - return (NULL); } } -found_ire_held: - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); - } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * type. If this is a recursive lookup and an IRE_INTERFACE type was - * found, return that. If it was some other IRE_FORWARDTABLE type of - * IRE (one of the prefix types), then it is necessary to fill in the - * parent IRE pointed to by pire, and then lookup the gateway address of - * the parent. For backwards compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - if (flags & MATCH_IRE_RECURSIVE) { - ipif_t *gw_ipif; - int match_flags = MATCH_IRE_DSTONLY; - ire_t *save_ire; - if (ire->ire_type & IRE_INTERFACE) - return (ire); - if (pire != NULL) - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the save_ire. 
- */ - save_ire = ire; +done: + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - /* - * ire_ftable_lookup may end up with an incomplete IRE_CACHE - * entry for the gateway (i.e., one for which the - * ire_nce->nce_state is not yet ND_REACHABLE). If the caller - * has specified MATCH_IRE_COMPLETE, such entries will not - * be returned; instead, we return the IF_RESOLVER ire. - */ - ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, - ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); - DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL || - ((ire->ire_type & IRE_CACHE) && ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE && - (flags & MATCH_IRE_COMPLETE))) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - found_incomplete = B_TRUE; - } - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the parent - * ire is returned via both ire and - * pire. - */ - IRE_REFHOLD(save_ire); - } - ire = save_ire; - } else { - ire_refrele(save_ire); - if (pire != NULL) - *pire = NULL; - } - if (!found_incomplete) - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. - */ - if (pire == NULL) { - ire_refrele(save_ire); - } - return (ire); - } - match_flags |= MATCH_IRE_TYPE; - gw_addr = ire->ire_gateway_addr; - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - ire = ire_route_lookup(gw_addr, 0, 0, - (found_incomplete? 
IRE_INTERFACE : - (IRE_CACHETABLE | IRE_INTERFACE)), - gw_ipif, NULL, zoneid, tsl, match_flags, ipst); - DTRACE_PROBE2(ftable__route__lookup2, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL || - ((ire->ire_type & IRE_CACHE) && ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE && - (flags & MATCH_IRE_COMPLETE))) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(save_ire); - } - ire = save_ire; - } else { - ire_refrele(save_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } else if (pire == NULL) { - /* - * If the caller did not ask for pire, release - * it now. - */ - ire_refrele(save_ire); - } - return (ire); + /* + * For shared-IP zones we need additional checks to what was + * done in ire_match_args to make sure IRE_LOCALs are handled. + * + * When ip_restrict_interzone_loopback is set, then + * we ensure that IRE_LOCAL are only used for loopback + * between zones when the logical "Ethernet" would + * have looped them back. That is, if in the absense of + * the IRE_LOCAL we would have sent to packet out the + * same ill. + */ + if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && + ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && + ipst->ips_ip_restrict_interzone_loopback) { + ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); + ASSERT(ire != NULL); } - ASSERT(pire == NULL || *pire == NULL); return (ire); } /* * This function is called by - * ip_fast_forward->ire_forward_simple + * ip_input/ire_route_recursive when doing a route lookup on only the + * destination address. 
+ * * The optimizations of this function over ire_ftable_lookup are: * o removing unnecessary flag matching * o doing longest prefix match instead of overloading it further * with the unnecessary "best_prefix_match" - * o Does not do round robin of default route for every packet - * o inlines code of ire_ctable_lookup to look for nexthop cache - * entry before calling ire_route_lookup + * + * If no route is found we return IRE_NOROUTE. */ -static ire_t * -ire_ftable_lookup_simple(ipaddr_t addr, - ire_t **pire, zoneid_t zoneid, int flags, - ip_stack_t *ipst) +ire_t * +ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, + uint_t *generationp) { - ire_t *ire = NULL; - ire_t *tmp_ire = NULL; + ire_t *ire; struct rt_sockaddr rdst; struct rt_entry *rt; - irb_t *irb_ptr; - ire_t *save_ire; - int match_flags; + irb_t *irb; rdst.rt_sin_len = sizeof (rdst); rdst.rt_sin_family = AF_INET; @@ -430,263 +236,125 @@ ire_ftable_lookup_simple(ipaddr_t addr, rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, NULL, NULL); - if (rt == NULL) { - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); - } - irb_ptr = &rt->rt_irb; - if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) { - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); - } + if (rt == NULL) + goto bad; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_zoneid == zoneid) - break; - } + irb = &rt->rt_irb; + if (irb->irb_ire_cnt == 0) + goto bad; - if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) { - rw_exit(&irb_ptr->irb_lock); - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); + rw_enter(&irb->irb_lock, RW_READER); + ire = irb->irb_ire; + if (ire == NULL) { + rw_exit(&irb->irb_lock); + goto bad; } - /* we have a ire that matches */ - if (ire != NULL) - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - 
RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); + while (IRE_IS_CONDEMNED(ire)) { + ire = ire->ire_next; + if (ire == NULL) { + rw_exit(&irb->irb_lock); + goto bad; + } } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * type. If this is a recursive lookup and an IRE_INTERFACE type was - * found, return that. If it was some other IRE_FORWARDTABLE type of - * IRE (one of the prefix types), then it is necessary to fill in the - * parent IRE pointed to by pire, and then lookup the gateway address of - * the parent. For backwards compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - match_flags = MATCH_IRE_DSTONLY; - if (ire->ire_type & IRE_INTERFACE) - return (ire); - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the save_ire. - */ - save_ire = ire; + /* we have a ire that matches */ + ire_refhold(ire); + rw_exit(&irb->irb_lock); /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. + * round-robin only if we have more than one route in the bucket. 
+ * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never * - * ire_ftable_lookup may end up with an incomplete IRE_CACHE - * entry for the gateway (i.e., one for which the - * ire_nce->nce_state is not yet ND_REACHABLE). If the caller - * has specified MATCH_IRE_COMPLETE, such entries will not - * be returned; instead, we return the IF_RESOLVER ire. + * Note: if we found an IRE_IF_CLONE we won't look at the bucket with + * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match + * and the IRE_INTERFACESs are likely to be shorter matches. */ - - if (ire->ire_ipif == NULL) { - tmp_ire = ire; - /* - * Look to see if the nexthop entry is in the cachetable - */ - ire = ire_cache_lookup(ire->ire_gateway_addr, zoneid, NULL, - ipst); - if (ire == NULL) { - /* Try ire_route_lookup */ - ire = tmp_ire; - } else { - goto solved; - } - } - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - ire = ire_route_lookup(ire->ire_gateway_addr, 0, - 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); -solved: - DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - ire_refrele(save_ire); - *pire = NULL; - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. 
- */ - if (pire == NULL) { - ire_refrele(save_ire); + if (ire->ire_bucket->irb_ire_cnt > 1) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE(ire))) { + ire_t *next_ire; + ire_ftable_args_t margs; + + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_addr = addr; + margs.ift_zoneid = ALL_ZONES; + + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + if (generationp != NULL) + *generationp = ire->ire_generation; + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (ire); + } + ire_refrele(ire); + ire = next_ire; } } - return (ire); -} - -/* - * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group' - * that goes through 'ipif'. As a fallback, a route that goes through - * ipif->ipif_ill can be returned. - */ -ire_t * -ipif_lookup_multi_ire(ipif_t *ipif, ipaddr_t group) -{ - ire_t *ire; - ire_t *save_ire = NULL; - ire_t *gw_ire; - irb_t *irb; - ipaddr_t gw_addr; - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(CLASSD(group)); - - ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_DEFAULT, ipst); - - if (ire == NULL) - return (NULL); + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; - irb = ire->ire_bucket; - ASSERT(irb); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - IRB_REFHOLD(irb); - ire_refrele(ire); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != group || - ipif->ipif_zoneid != ire->ire_zoneid && - ire->ire_zoneid != ALL_ZONES) { - continue; - } + /* + * Since we only did ALL_ZONES matches there is no special handling + * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 
+ */ + return (ire); - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - gw_addr = ire->ire_gateway_addr; - gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, - ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - - if (gw_ire != NULL) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - if (gw_ire->ire_ipif == ipif) { - ire_refrele(gw_ire); - - IRB_REFRELE(irb); - return (ire); - } - ire_refrele(gw_ire); - save_ire = ire; - } - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - if (ire->ire_ipif == ipif) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - - IRB_REFRELE(irb); - return (ire); - } - break; - } - } - IRB_REFRELE(irb); +bad: + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; - return (save_ire); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (ire_reject(ipst, B_FALSE)); } /* - * Find an IRE_INTERFACE for the multicast group. + * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to 224.0.0.0 but could be more specific) * which point at different interfaces. This is used when IP_MULTICAST_IF * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't * specify the interface to join on. * - * Supports IP_BOUND_IF by following the ipif/ill when recursing. + * Supports link-local addresses by using ire_route_recursive which follows + * the ill when recursing. + * + * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group + * and the MULTIRT property can be different for different groups, we + * extract RTF_MULTIRT from the special unicast route added for a group + * with CGTP and pass that back in the multirtp argument. + * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. + * We have a setsrcp argument for the same reason. 
*/ -ire_t * -ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, ipaddr_t *setsrcp) { ire_t *ire; - ipif_t *ipif = NULL; - int match_flags = MATCH_IRE_TYPE; - ipaddr_t gw_addr; - - ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, zoneid, - 0, NULL, MATCH_IRE_DEFAULT, ipst); + ill_t *ill; - /* We search a resolvable ire in case of multirouting. */ - if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { - ire_t *cire = NULL; - /* - * If the route is not resolvable, the looked up ire - * may be changed here. In that case, ire_multirt_lookup() - * IRE_REFRELE the original ire and change it. - */ - (void) ire_multirt_lookup(&cire, &ire, MULTIRT_CACHEGW, NULL, - NULL, ipst); - if (cire != NULL) - ire_refrele(cire); - } - if (ire == NULL) - return (NULL); - /* - * Make sure we follow ire_ipif. - * - * We need to determine the interface route through - * which the gateway will be reached. 
- */ - if (ire->ire_ipif != NULL) { - ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL; - } - - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - gw_addr = ire->ire_gateway_addr; - ire_refrele(ire); - ire = ire_ftable_lookup(gw_addr, 0, 0, - IRE_INTERFACE, ipif, NULL, zoneid, 0, - NULL, match_flags, ipst); - return (ire); - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - return (ire); - default: + ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, + MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } + + if (multirtp != NULL) + *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; + + ill = ire_nexthop_ill(ire); + ire_refrele(ire); + return (ill); } /* @@ -701,7 +369,7 @@ ire_del_host_redir(ire_t *ire, char *gateway) } /* - * Search for all HOST REDIRECT routes that are + * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are * pointing at the specified gateway and * delete them. This routine is called only * when a default gateway is going away. @@ -718,732 +386,6 @@ ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } -struct ihandle_arg { - uint32_t ihandle; - ire_t *ire; -}; - -static int -ire_ihandle_onlink_match(struct radix_node *rn, void *arg) -{ - struct rt_entry *rt; - irb_t *irb; - ire_t *ire; - struct ihandle_arg *ih = arg; - - rt = (struct rt_entry *)rn; - ASSERT(rt != NULL); - irb = &rt->rt_irb; - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_ihandle == ih->ihandle)) { - ih->ire = ire; - IRE_REFHOLD(ire); - return (1); - } - } - return (0); -} - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an onlink destn. or - * gateway in 'cire'. 
We are called from ire_add_v4() in the IRE_IF_RESOLVER - * case, after the ire has come back from ARP. - */ -ire_t * -ire_ihandle_lookup_onlink(ire_t *cire) -{ - ire_t *ire; - int match_flags; - struct ihandle_arg ih; - ip_stack_t *ipst; - - ASSERT(cire != NULL); - ipst = cire->ire_ipst; - - /* - * We don't need to specify the zoneid to ire_ftable_lookup() below - * because the ihandle refers to an ipif which can be in only one zone. - */ - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute() created 'cire' for an on-link destn. it set its - * cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * In the resolver/noresolver case, ip_newroute() thinks it is creating - * the cache ire for an onlink destination in 'cire'. But 'cire' is - * not actually onlink, because ire_ftable_lookup() cheated it, by - * doing ire_route_lookup() twice and returning an interface ire. - * - * Eg. default - gw1 (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * In the above example, ip_newroute() tried to create the cache ire - * 'cire' for gw1, based on the interface route in line 3. The - * ire_ftable_lookup() above fails, because there is no interface route - * to reach gw1. (it is gw2). We fall thru below. - * - * Do a brute force search based on the ihandle in a subset of the - * forwarding tables, corresponding to cire->ire_cmask. Otherwise - * things become very complex, since we don't have 'pire' in this - * case. 
(Also note that this method is not possible in the offlink - * case because we don't know the mask) - */ - (void) memset(&ih, 0, sizeof (ih)); - ih.ihandle = cire->ire_ihandle; - (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, - ire_ihandle_onlink_match, &ih, irb_refhold_rn, irb_refrele_rn); - return (ih.ire); -} - -/* - * IRE iterator used by ire_ftable_lookup[_v6]() to process multiple default - * routes. Given a starting point in the hash list (ire_origin), walk the IREs - * in the bucket skipping default interface routes and deleted entries. - * Returns the next IRE (unheld), or NULL when we're back to the starting point. - * Assumes that the caller holds a reference on the IRE bucket. - */ -ire_t * -ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin) -{ - ASSERT(ire_origin->ire_bucket != NULL); - ASSERT(ire != NULL); - - do { - ire = ire->ire_next; - if (ire == NULL) - ire = ire_origin->ire_bucket->irb_ire; - if (ire == ire_origin) - return (NULL); - } while ((ire->ire_type & IRE_INTERFACE) || - (ire->ire_marks & IRE_MARK_CONDEMNED)); - ASSERT(ire != NULL); - return (ire); -} - -static ipif_t * -ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, - int zoneid, ushort_t *marks) -{ - ipif_t *src_ipif; - ill_t *ill = ire->ire_ipif->ipif_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Pick the best source address from ill. - * - * 1) Try to pick the source address from the destination - * route. Clustering assumes that when we have multiple - * prefixes hosted on an interface, the prefix of the - * source address matches the prefix of the destination - * route. We do this only if the address is not - * DEPRECATED. - * - * 2) If the conn is in a different zone than the ire, we - * need to pick a source address from the right zone. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * The RTF_SETSRC flag is set in the parent ire (sire). - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - return (src_ipif); - } - *marks |= IRE_MARK_USESRC_CHECK; - if (IS_IPMP(ill) || - (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (ill->ill_usesrc_ifindex != 0)) { - src_ipif = ipif_select_source(ill, dst, zoneid); - } else { - src_ipif = ire->ire_ipif; - ASSERT(src_ipif != NULL); - /* hold src_ipif for uniformity */ - ipif_refhold(src_ipif); - } - return (src_ipif); -} - -/* - * This function is called by ip_rput_noire() and ip_fast_forward() - * to resolve the route of incoming packet that needs to be forwarded. - * If the ire of the nexthop is not already in the cachetable, this - * routine will insert it to the table, but won't trigger ARP resolution yet. - * Thus unlike ip_newroute, this function adds incomplete ires to - * the cachetable. ARP resolution for these ires are delayed until - * after all of the packet processing is completed and its ready to - * be sent out on the wire, Eventually, the packet transmit routine - * ip_xmit_v4() attempts to send a packet to the driver. If it finds - * that there is no link layer information, it will do the arp - * resolution and queue the packet in ire->ire_nce->nce_qd_mp and - * then send it out once the arp resolution is over - * (see ip_xmit_v4()->ire_arpresolve()). This scheme is similar to - * the model of BSD/SunOS 4 - * - * In future, the insertion of incomplete ires in the cachetable should - * be implemented in hostpath as well, as doing so will greatly reduce - * the existing complexity for code paths that depend on the context of - * the sender (such as IPsec). - * - * Thus this scheme of adding incomplete ires in cachetable in forwarding - * path can be used as a template for simplifying the hostpath. 
- */ - -ire_t * -ire_forward(ipaddr_t dst, enum ire_forward_action *ret_action, - ire_t *supplied_ire, ire_t *supplied_sire, const struct ts_label_s *tsl, - ip_stack_t *ipst) -{ - ipaddr_t gw = 0; - ire_t *ire = NULL; - ire_t *sire = NULL, *save_ire; - ill_t *dst_ill = NULL; - int error; - zoneid_t zoneid; - ipif_t *src_ipif = NULL; - mblk_t *res_mp; - ushort_t ire_marks = 0; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - - zoneid = GLOBAL_ZONEID; - - if (supplied_ire != NULL) { - /* We have arrived here from ipfil_sendpkt */ - ire = supplied_ire; - sire = supplied_sire; - goto create_irecache; - } - - ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0, - tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT|MATCH_IRE_SECATTR, ipst); - - if (ire == NULL) { - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * If we encounter CGTP, we should have the caller use - * ip_newroute to resolve multirt instead of this function. - * CGTP specs explicitly state that it can't be used with routers. - * This essentially prevents insertion of incomplete RTF_MULTIRT - * ires in cachetable. - */ - if (ipst->ips_ip_cgtp_filter && - ((ire->ire_flags & RTF_MULTIRT) || - ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) { - ip3dbg(("ire_forward: packet is to be multirouted- " - "handing it to ip_newroute\n")); - if (sire != NULL) - ire_refrele(sire); - ire_refrele(ire); - /* - * Inform caller about encountering of multirt so that - * ip_newroute() can be called. - */ - *ret_action = Forward_check_multirt; - return (NULL); - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 
- */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", - (void *)ire)); - goto icmp_err_ret; - } - - /* - * If we already have a fully resolved IRE CACHE of the - * nexthop router, just hand over the cache entry - * and we are done. - */ - - if (ire->ire_type & IRE_CACHE) { - - /* - * If we are using this ire cache entry as a - * gateway to forward packets, chances are we - * will be using it again. So turn off - * the temporary flag, thus reducing its - * chances of getting deleted frequently. - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb_t *irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * We need to recheck for IRE_MARK_TEMPORARY after - * acquiring the lock in order to guarantee - * irb_tmp_ire_cnt - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); - } - - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - ire_refrele(sire); - } - *ret_action = Forward_ok; - return (ire); - } -create_irecache: - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). 
- */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* - * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type - */ - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & - (IRE_CACHETABLE | IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } - - dst_ill = ire->ire_ipif->ipif_ill; - if (IS_IPMP(dst_ill)) - dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); - else - ill_refhold(dst_ill); - - if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire)); - goto icmp_err_ret; - } - - ASSERT(src_ipif == NULL); - /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); - if (src_ipif == NULL) - goto icmp_err_ret; - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: - /* create ire_cache for ire_addr endpoint */ - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ire_forward: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - goto icmp_err_ret; - } - /* FALLTHRU */ - case IRE_IF_RESOLVER: - /* - * We have the IRE_IF_RESOLVER of the nexthop gateway - * and now need to build a IRE_CACHE for it. - * In this case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. 
- */ - res_mp = dst_ill->ill_resolver_mp; - if (ire->ire_type == IRE_IF_RESOLVER && - (!OK_RESOLVER_MP(res_mp))) { - goto icmp_err_ret; - } - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. - */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - if (src_ipif == NULL) - goto icmp_err_ret; - } - dst = gw; - gw = INADDR_ANY; - } - /* - * dst has been set to the address of the nexthop. - * - * TSol note: get security attributes of the nexthop; - * Note that the nexthop may either be a gateway, or the - * packet destination itself; Detailed explanation of - * issues involved is provided in the IRE_IF_NORESOLVER - * logic in ip_newroute(). - */ - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr); - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - if (ire->ire_type == IRE_IF_NORESOLVER) - dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ - - save_ire = ire; - /* - * create an incomplete IRE_CACHE. - * An areq_mp will be generated in ire_arpresolve() for - * RESOLVER interfaces. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - (save_ire->ire_type == IRE_IF_RESOLVER ? 
NULL: - &save_ire->ire_max_frag), - NULL, - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - ire->ire_mask, /* Parent mask */ - 0, - ire->ire_ihandle, /* Interface handle */ - 0, - &(ire->ire_uinfo), - NULL, - gcgrp, - ipst); - ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); - if (ire != NULL) { - gcgrp = NULL; /* reference now held by IRE */ - ire->ire_marks |= ire_marks; - /* add the incomplete ire: */ - error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); - if (error == 0 && ire != NULL) { - ire->ire_max_frag = save_ire->ire_max_frag; - ip1dbg(("setting max_frag to %d in ire 0x%p\n", - ire->ire_max_frag, (void *)ire)); - } else { - ire_refrele(save_ire); - goto icmp_err_ret; - } - } else { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - } - - ire_refrele(save_ire); - break; - default: - break; - } - - *ret_action = Forward_ok; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return (ire); -icmp_err_ret: - *ret_action = Forward_ret_icmp_err; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) - *ret_action = Forward_blackhole; - ire_refrele(ire); - } - return (NULL); -} - -/* - * Since caller is ip_fast_forward, there is no CGTP or Tsol test - * Also we dont call ftable lookup with MATCH_IRE_PARENT - */ - -ire_t * -ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, - ip_stack_t *ipst) -{ - ipaddr_t gw = 0; - ire_t *ire = NULL; - ire_t *sire = NULL, *save_ire; - ill_t *dst_ill = NULL; - int error; - zoneid_t zoneid = GLOBAL_ZONEID; - ipif_t *src_ipif = NULL; - mblk_t *res_mp; - ushort_t ire_marks = 0; - - ire = ire_ftable_lookup_simple(dst, &sire, zoneid, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, 
ipst); - if (ire == NULL) { - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - ((ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) { - ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", - (void *)ire)); - goto icmp_err_ret; - } - - /* - * If we already have a fully resolved IRE CACHE of the - * nexthop router, just hand over the cache entry - * and we are done. - */ - if (ire->ire_type & IRE_CACHE) { - /* - * If we are using this ire cache entry as a - * gateway to forward packets, chances are we - * will be using it again. So turn off - * the temporary flag, thus reducing its - * chances of getting deleted frequently. - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb_t *irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - rw_exit(&irb->irb_lock); - } - - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - ire_refrele(sire); - } - *ret_action = Forward_ok; - return (ire); - } - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). 
- */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* - * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type - */ - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & - (IRE_CACHETABLE | IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - } - - dst_ill = ire->ire_ipif->ipif_ill; - if (IS_IPMP(dst_ill)) - dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); - else - ill_refhold(dst_ill); /* for symmetry */ - - if (dst_ill == NULL) { - ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n", - (void *)ire)); - goto icmp_err_ret; - } - - ASSERT(src_ipif == NULL); - /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); - if (src_ipif == NULL) - goto icmp_err_ret; - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: - /* create ire_cache for ire_addr endpoint */ - case IRE_IF_RESOLVER: - /* - * We have the IRE_IF_RESOLVER of the nexthop gateway - * and now need to build a IRE_CACHE for it. - * In this case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. - */ - res_mp = dst_ill->ill_resolver_mp; - if (ire->ire_type == IRE_IF_RESOLVER && - (!OK_RESOLVER_MP(res_mp))) { - ire_refrele(ire); - ire = NULL; - goto out; - } - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. 
- */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - if (src_ipif == NULL) - goto icmp_err_ret; - } - dst = gw; - gw = INADDR_ANY; - } - - if (ire->ire_type == IRE_IF_NORESOLVER) - dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ - - save_ire = ire; - /* - * create an incomplete IRE_CACHE. - * An areq_mp will be generated in ire_arpresolve() for - * RESOLVER interfaces. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - (save_ire->ire_type == IRE_IF_RESOLVER ? NULL: - &save_ire->ire_max_frag), - NULL, - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - ire->ire_mask, /* Parent mask */ - 0, - ire->ire_ihandle, /* Interface handle */ - 0, - &(ire->ire_uinfo), - NULL, - NULL, - ipst); - ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); - if (ire != NULL) { - ire->ire_marks |= ire_marks; - /* add the incomplete ire: */ - error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); - if (error == 0 && ire != NULL) { - ire->ire_max_frag = save_ire->ire_max_frag; - ip1dbg(("setting max_frag to %d in ire 0x%p\n", - ire->ire_max_frag, (void *)ire)); - } else { - ire_refrele(save_ire); - goto icmp_err_ret; - } - } - - ire_refrele(save_ire); - break; - default: - break; - } - -out: - *ret_action = Forward_ok; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return (ire); -icmp_err_ret: - *ret_action = Forward_ret_icmp_err; - if (src_ipif != 
NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (sire != NULL) - ire_refrele(sire); - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) - *ret_action = Forward_blackhole; - ire_refrele(ire); - } - /* caller needs to send icmp error message */ - return (NULL); - -} - /* * Obtain the rt_entry and rt_irb for the route to be added to * the ips_ip_ftable. @@ -1489,7 +431,7 @@ ire_get_bucket(ire_t *ire) rt->rt_nodes->rn_key = (char *)&rt->rt_dst; rt->rt_dst = rdst; irb = &rt->rt_irb; - irb->irb_marks |= IRB_MARK_FTABLE; /* dynamically allocated/freed */ + irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ irb->irb_ipst = ipst; rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); @@ -1510,7 +452,7 @@ ire_get_bucket(ire_t *ire) } if (rt != NULL) { irb = &rt->rt_irb; - IRB_REFHOLD(irb); + irb_refhold(irb); } RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (irb); @@ -1551,10 +493,12 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); - if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { - ill = ire_to_ill(ire); - if (ill != NULL) + if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { + ill = ire_nexthop_ill(ire); + if (ill != NULL) { ifindex = ill->ill_phyint->phyint_ifindex; + ill_refrele(ill); + } ire_refrele(ire); } netstack_rele(ns); @@ -1563,7 +507,7 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) /* * Routine to find the route to a destination. 
If a ifindex is supplied - * it tries to match the the route to the corresponding ipif for the ifindex + * it tries to match the route to the corresponding ipif for the ifindex */ static ire_t * route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -1571,27 +515,33 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) ire_t *ire = NULL; int match_flags; - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE); + match_flags = MATCH_IRE_DSTONLY; /* XXX pass NULL tsl for now */ if (dst_addr->sa_family == AF_INET) { - ire = ire_route_lookup( - ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, - 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst); + ire = ire_route_recursive_v4( + ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, + zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, + NULL); } else { - ire = ire_route_lookup_v6( - &((struct sockaddr_in6 *)dst_addr)->sin6_addr, - 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst); + ire = ire_route_recursive_v6( + &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, + zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, + NULL); + } + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire); + return (NULL); } return (ire); } /* * This routine is called by IP Filter to send a packet out on the wire - * to a specified V4 dst (which may be onlink or offlink). The ifindex may or - * may not be 0. A non-null ifindex indicates IP Filter has stipulated + * to a specified dstination (which may be onlink or offlink). The ifindex may + * or may not be 0. A non-null ifindex indicates IP Filter has stipulated * an outgoing interface and requires the nexthop to be on that interface. * IP WILL NOT DO the following to the data packet before sending it out: * a. 
manipulate ttl @@ -1611,21 +561,18 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) * of the offlink dst's nexthop needs to get * resolved before packet can be sent to dst. * Thus transmission is not guaranteed. - * + * Note: No longer have visibility to the ARP queue + * hence no EINPROGRESS. */ - int ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, zoneid_t zoneid) { - ire_t *ire = NULL, *sire = NULL; - ire_t *ire_cache = NULL; - int value; - int match_flags; - ipaddr_t dst; + ipaddr_t nexthop; netstack_t *ns; ip_stack_t *ipst; - enum ire_forward_action ret_action; + ip_xmit_attr_t ixas; + int error; ASSERT(mp != NULL); @@ -1646,429 +593,57 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, ASSERT(dst_addr->sa_family == AF_INET || dst_addr->sa_family == AF_INET6); - if (dst_addr->sa_family == AF_INET) { - dst = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; - } else { - /* - * We dont have support for V6 yet. It will be provided - * once RFE 6399103 has been delivered. - * Until then, for V6 dsts, IP Filter will not call - * this function. Instead the netinfo framework provides - * its own code path, in ip_inject_impl(), to achieve - * what it needs to do, for the time being. - */ - ip1dbg(("ipfil_sendpkt: no V6 support \n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - - /* - * Lets get the ire. We might get the ire cache entry, - * or the ire,sire pair needed to create the cache entry. - * XXX pass NULL tsl for now. - */ - - if (ifindex == 0) { - /* There is no supplied index. 
So use the FIB info */ - - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE); - ire = ire_route_lookup(dst, - 0, 0, 0, NULL, &sire, zoneid, msg_getlabel(mp), - match_flags, ipst); - } else { - ipif_t *supplied_ipif; - ill_t *ill; - - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE| - MATCH_IRE_SECATTR | MATCH_IRE_ILL); - - /* - * If supplied ifindex is non-null, the only valid - * nexthop is one off of the interface corresponding - * to the specified ifindex. - */ - ill = ill_lookup_on_ifindex(ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip1dbg(("ipfil_sendpkt: Could not find" - " route to dst\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - - supplied_ipif = ipif_get_next_ipif(NULL, ill); - ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif, - &sire, zoneid, msg_getlabel(mp), match_flags, ipst); - if (supplied_ipif != NULL) - ipif_refrele(supplied_ipif); - ill_refrele(ill); - } - + bzero(&ixas, sizeof (ixas)); /* - * Verify that the returned IRE is non-null and does - * not have either the RTF_REJECT or RTF_BLACKHOLE - * flags set and that the IRE is either an IRE_CACHE, - * IRE_IF_NORESOLVER or IRE_IF_RESOLVER. + * No IPsec, no fragmentation, and don't let any hooks see + * the packet. */ - if (ire == NULL || - ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) { - /* - * Either ire could not be found or we got - * an invalid one - */ - ip1dbg(("ipfil_sendpkt: Could not find route to dst\n")); - value = ENONET; - freemsg(mp); - goto discard; - } - - /* IP Filter and CGTP dont mix. 
So bail out if CGTP is on */ - if (ipst->ips_ip_cgtp_filter && - ((ire->ire_flags & RTF_MULTIRT) || - ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) { - ip1dbg(("ipfil_sendpkt: IPFilter does not work with CGTP\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } + ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = ifindex; - ASSERT(ire->ire_type != IRE_CACHE || ire->ire_nce != NULL); - - /* - * If needed, we will create the ire cache entry for the - * nexthop, resolve its link-layer address and then send - * the packet out without ttl or IPSec processing. - */ - switch (ire->ire_type) { - case IRE_CACHE: - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - ire_refrele(sire); - } - ire_cache = ire; - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - /* - * Call ire_forward(). This function - * will, create the ire cache entry of the - * the nexthop and adds this incomplete ire - * to the ire cache table - */ - ire_cache = ire_forward(dst, &ret_action, ire, sire, - msg_getlabel(mp), ipst); - if (ire_cache == NULL) { - ip1dbg(("ipfil_sendpkt: failed to create the" - " ire cache entry \n")); - value = ENONET; - freemsg(mp); - sire = NULL; - ire = NULL; - goto discard; - } - break; - } - - if (DB_CKSUMFLAGS(mp)) { - if (ip_send_align_hcksum_flags(mp, ire_to_ill(ire_cache))) - goto cleanup; - } - - /* - * Now that we have the ire cache entry of the nexthop, call - * ip_xmit_v4() to trigger mac addr resolution - * if necessary and send it once ready. - */ - - value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL); -cleanup: - ire_refrele(ire_cache); - /* - * At this point, the reference for these have already been - * released within ire_forward() and/or ip_xmit_v4(). 
So we set - * them to NULL to make sure we dont drop the references - * again in case ip_xmit_v4() returns with either SEND_FAILED - * or LLHDR_RESLV_FAILED - */ - sire = NULL; - ire = NULL; - - switch (value) { - case SEND_FAILED: - ip1dbg(("ipfil_sendpkt: Send failed\n")); - value = ECOMM; - break; - case LLHDR_RESLV_FAILED: - ip1dbg(("ipfil_sendpkt: Link-layer resolution" - " failed\n")); - value = ECOMM; - break; - case LOOKUP_IN_PROGRESS: - netstack_rele(ns); - return (EINPROGRESS); - case SEND_PASSED: - netstack_rele(ns); - return (0); - } -discard: if (dst_addr->sa_family == AF_INET) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - netstack_rele(ns); - return (value); -} - - -/* - * We don't check for dohwcksum in here because it should be being used - * elsewhere to control what flags are being set on the mblk. That is, - * if DB_CKSUMFLAGS() is non-zero then we assume dohwcksum to be true - * for this packet. - * - * This function assumes that it is *only* being called for TCP or UDP - * packets and nothing else. - */ -static int -ip_send_align_hcksum_flags(mblk_t *mp, ill_t *ill) -{ - int illhckflags; - int mbhckflags; - uint16_t *up; - uint32_t cksum; - ipha_t *ipha; - ip6_t *ip6; - int proto; - int ipversion; - int length; - int start; - ip6_pkt_t ipp; - - mbhckflags = DB_CKSUMFLAGS(mp); - ASSERT(mbhckflags != 0); - ASSERT(mp->b_datap->db_type == M_DATA); - /* - * Since this function only knows how to manage the hardware checksum - * issue, reject and packets that have flags set on the aside from - * checksum related attributes as we cannot necessarily safely map - * that packet onto the new NIC. Packets that can be potentially - * dropped here include those marked for LSO. 
- */ - if ((mbhckflags & - ~(HCK_FULLCKSUM|HCK_PARTIALCKSUM|HCK_IPV4_HDRCKSUM)) != 0) { - DTRACE_PROBE2(pbr__incapable, (mblk_t *), mp, (ill_t *), ill); - freemsg(mp); - return (-1); - } - - ipha = (ipha_t *)mp->b_rptr; - - /* - * Find out what the new NIC is capable of, if anything, and - * only allow it to be used with M_DATA mblks being sent out. - */ - if (ILL_HCKSUM_CAPABLE(ill)) { - illhckflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } else { - /* - * No capabilities, so turn off everything. - */ - illhckflags = 0; - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 0, 0); - mp->b_datap->db_struioflag &= ~STRUIO_IP; - } - - DTRACE_PROBE4(pbr__info__a, (mblk_t *), mp, (ill_t *), ill, - uint32_t, illhckflags, uint32_t, mbhckflags); - /* - * This block of code that looks for the position of the TCP/UDP - * checksum is early in this function because we need to know - * what needs to be blanked out for the hardware checksum case. - * - * That we're in this function implies that the packet is either - * TCP or UDP on Solaris, so checks are made for one protocol and - * if that fails, the other is therefore implied. - */ - ipversion = IPH_HDR_VERSION(ipha); + ipha_t *ipha = (ipha_t *)mp->b_rptr; - if (ipversion == IPV4_VERSION) { - proto = ipha->ipha_protocol; - if (proto == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - } else { - up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); + ixas.ixa_flags |= IXAF_IS_IPV4; + nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; + if (nexthop != ipha->ipha_dst) { + ixas.ixa_flags |= IXAF_NEXTHOP_SET; + ixas.ixa_nexthop_v4 = nexthop; } + ixas.ixa_multicast_ttl = ipha->ipha_ttl; } else { - uint8_t lasthdr; - - /* - * Nothing I've seen indicates that IPv6 checksum'ing - * precludes the presence of extension headers, so we - * can't just look at the next header value in the IPv6 - * packet header to see if it is TCP/UDP. 
- */ - ip6 = (ip6_t *)ipha; - (void) memset(&ipp, 0, sizeof (ipp)); - start = ip_find_hdr_v6(mp, ip6, &ipp, &lasthdr); - proto = lasthdr; - - if (proto == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, start); - } else { - up = IPH_UDPH_CHECKSUMP(ipha, start); - } - } + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + in6_addr_t *nexthop6; - /* - * The first case here is easiest: - * mblk hasn't asked for full checksum, but the card supports it. - * - * In addition, check for IPv4 header capability. Note that only - * the mblk flag is checked and not ipversion. - */ - if ((((illhckflags & HCKSUM_INET_FULL_V4) && (ipversion == 4)) || - (((illhckflags & HCKSUM_INET_FULL_V6) && (ipversion == 6)))) && - ((mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) != 0)) { - int newflags = HCK_FULLCKSUM; - - if ((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) { - if ((illhckflags & HCKSUM_IPHDRCKSUM) != 0) { - newflags |= HCK_IPV4_HDRCKSUM; - } else { - /* - * Rather than call a function, just inline - * the computation of the basic IPv4 header. - */ - cksum = (ipha->ipha_dst >> 16) + - (ipha->ipha_dst & 0xFFFF) + - (ipha->ipha_src >> 16) + - (ipha->ipha_src & 0xFFFF); - IP_HDR_CKSUM(ipha, cksum, - ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } + nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; + if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { + ixas.ixa_flags |= IXAF_NEXTHOP_SET; + ixas.ixa_nexthop_v6 = *nexthop6; } - - *up = 0; - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, - newflags, 0); - return (0); - } - - DTRACE_PROBE2(pbr__info__b, int, ipversion, int, proto); - - /* - * Start calculating the pseudo checksum over the IP packet header. - * Although the final pseudo checksum used by TCP/UDP consists of - * more than just the address fields, we can use the result of - * adding those together a little bit further down for IPv4. 
- */ - if (ipversion == IPV4_VERSION) { - cksum = (ipha->ipha_dst >> 16) + (ipha->ipha_dst & 0xFFFF) + - (ipha->ipha_src >> 16) + (ipha->ipha_src & 0xFFFF); - start = IP_SIMPLE_HDR_LENGTH; - length = ntohs(ipha->ipha_length); - DTRACE_PROBE3(pbr__info__e, uint32_t, ipha->ipha_src, - uint32_t, ipha->ipha_dst, int, cksum); - } else { - uint16_t *pseudo; - - pseudo = (uint16_t *)&ip6->ip6_src; - - /* calculate pseudo-header checksum */ - cksum = pseudo[0] + pseudo[1] + pseudo[2] + pseudo[3] + - pseudo[4] + pseudo[5] + pseudo[6] + pseudo[7] + - pseudo[8] + pseudo[9] + pseudo[10] + pseudo[11] + - pseudo[12] + pseudo[13] + pseudo[14] + pseudo[15]; - - length = ntohs(ip6->ip6_plen) + sizeof (ip6_t); - } - - /* Fold the initial sum */ - cksum = (cksum & 0xffff) + (cksum >> 16); - - /* - * If the packet was asking for an IPv4 header checksum to be - * calculated but the interface doesn't support that, fill it in - * using our pseudo checksum as a starting point. - */ - if (((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) && - ((illhckflags & HCKSUM_IPHDRCKSUM) == 0)) { - /* - * IP_HDR_CKSUM uses the 2rd arg to the macro in a destructive - * way so pass in a copy of the checksum calculated thus far. - */ - uint32_t ipsum = cksum; - - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - - IP_HDR_CKSUM(ipha, ipsum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - - DTRACE_PROBE3(pbr__info__c, int, start, int, length, int, cksum); - - if (proto == IPPROTO_TCP) { - cksum += IP_TCP_CSUM_COMP; - } else { - cksum += IP_UDP_CSUM_COMP; + ixas.ixa_multicast_ttl = ip6h->ip6_hops; } - cksum += htons(length - start); - cksum = (cksum & 0xffff) + (cksum >> 16); - - /* - * For TCP/UDP, we either want to setup the packet for partial - * checksum or we want to do it all ourselves because the NIC - * offers no support for either partial or full checksum. 
- */ - if ((illhckflags & HCKSUM_INET_PARTIAL) != 0) { - /* - * The only case we care about here is if the mblk was - * previously set for full checksum offload. If it was - * marked for partial (and the NIC does partial), then - * we have nothing to do. Similarly if the packet was - * not set for partial or full, we do nothing as this - * is cheaper than more work to set something up. - */ - if ((mbhckflags & HCK_FULLCKSUM) != 0) { - uint32_t offset; - - if (proto == IPPROTO_TCP) { - offset = TCP_CHECKSUM_OFFSET; - } else { - offset = UDP_CHECKSUM_OFFSET; - } - *up = cksum; - - DTRACE_PROBE3(pbr__info__f, int, length - start, int, - cksum, int, offset); + error = ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); - (void) hcksum_assoc(mp, NULL, NULL, start, - start + offset, length, 0, - DB_CKSUMFLAGS(mp) | HCK_PARTIALCKSUM, 0); - } + netstack_rele(ns); + switch (error) { + case 0: + break; - } else if (mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) { - DB_CKSUMFLAGS(mp) &= ~(HCK_PARTIALCKSUM|HCK_FULLCKSUM); + case EHOSTUNREACH: + case ENETUNREACH: + error = ENONET; + break; - *up = 0; - *up = IP_CSUM(mp, start, cksum); + default: + error = ECOMM; + break; } - - DTRACE_PROBE4(pbr__info__d, (mblk_t *), mp, (ipha_t *), ipha, - (uint16_t *), up, int, cksum); - return (0); + return (error); } /* @@ -2094,18 +669,18 @@ ire_find_best_route(struct radix_node *rn, void *arg) rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; - if (margs->ift_flags & MATCH_IRE_MASK) + if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) match_mask = margs->ift_mask; else match_mask = ire->ire_mask; if (ire_match_args(ire, margs->ift_addr, match_mask, - margs->ift_gateway, margs->ift_type, margs->ift_ipif, - margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl, - margs->ift_flags, NULL)) { - IRE_REFHOLD(ire); + margs->ift_gateway, 
margs->ift_type, margs->ift_ill, + margs->ift_zoneid, margs->ift_tsl, + margs->ift_flags)) { + ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); margs->ift_best_ire = ire; return (B_TRUE); @@ -2198,107 +773,182 @@ irb_refrele_ftable(irb_t *irb) } /* - * IRE iterator used by ire_ftable_lookup() to process multiple default - * routes. Given a starting point in the hash list (ire_origin), walk the IREs - * in the bucket skipping default interface routes and deleted entries. - * Returns the next IRE (unheld), or NULL when we're back to the starting point. - * Assumes that the caller holds a reference on the IRE bucket. + * IRE iterator used by ire_ftable_lookup to process multiple equal + * routes. Given a starting point in the hash list (hash), walk the IREs + * in the bucket skipping deleted entries. We treat the bucket as a circular + * list for the purposes of walking it. + * Returns the IRE (held) that corresponds to the hash value. If that IRE is + * not applicable (ire_match_args failed) then it returns a subsequent one. + * If we fail to find an IRE we return NULL. * - * In the absence of good IRE_DEFAULT routes, this function will return - * the first IRE_INTERFACE route found (if any). + * Assumes that the caller holds a reference on the IRE bucket and a read lock + * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). + * + * Applies to IPv4 and IPv6. + * + * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same + * address and bucket, we compare against ire_type for the orig_ire. We also + * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being + * first in the bucket. Thus we compare that ire_flags match the orig_ire. + * + * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is + * reachable from the zone i.e., that the ire_gateway_addr is in a subnet + * in which the zone has an IP address. We check this for the global zone + * even if no shared-IP zones are configured. 
*/ ire_t * -ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs, - ip_stack_t *ipst) +ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, + ire_t *orig_ire, ip_stack_t *ipst) { - ire_t *ire_origin; - ire_t *ire, *maybe_ire = NULL; + ire_t *ire, *maybe_ire = NULL; + uint_t maybe_badcnt; + uint_t maxwalk; - rw_enter(&irb_ptr->irb_lock, RW_WRITER); - ire_origin = irb_ptr->irb_rr_origin; - if (ire_origin != NULL) { - ire_origin = ire_origin->ire_next; - IRE_FIND_NEXT_ORIGIN(ire_origin); - } + /* Fold in more bits from the hint/hash */ + hash = hash ^ (hash >> 8) ^ (hash >> 16); - if (ire_origin == NULL) { - /* - * first time through routine, or we dropped off the end - * of list. - */ - ire_origin = irb_ptr->irb_ire; - IRE_FIND_NEXT_ORIGIN(ire_origin); - } - irb_ptr->irb_rr_origin = ire_origin; - IRB_REFHOLD_LOCKED(irb_ptr); + rw_enter(&irb_ptr->irb_lock, RW_WRITER); + maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ + hash %= maxwalk; + irb_refhold_locked(irb_ptr); rw_exit(&irb_ptr->irb_lock); - DTRACE_PROBE2(ire__rr__origin, (irb_t *), irb_ptr, - (ire_t *), ire_origin); - /* * Round-robin the routers list looking for a route that * matches the passed in parameters. - * We start with the ire we found above and we walk the hash - * list until we're back where we started. It doesn't matter if - * routes are added or deleted by other threads - we know this - * ire will stay in the list because we hold a reference on the - * ire bucket. + * First we skip "hash" number of non-condemned IREs. + * Then we match the IRE. + * If we find an ire which has a non-zero ire_badcnt then we remember + * it and keep on looking for a lower ire_badcnt. + * If we come to the end of the list we continue (treat the + * bucket list as a circular list) but we match less than "max" + * entries. 
*/ - ire = ire_origin; - while (ire != NULL) { - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; - ire_t *rire; + ire = irb_ptr->irb_ire; + while (maxwalk > 0) { + if (IRE_IS_CONDEMNED(ire)) + goto next_ire_skip; + + /* Skip the first "hash" entries to do ECMP */ + if (hash != 0) { + hash--; + goto next_ire_skip; + } - if (ire->ire_marks & IRE_MARK_CONDEMNED) + /* See CGTP comment above */ + if (ire->ire_type != orig_ire->ire_type || + ire->ire_flags != orig_ire->ire_flags) goto next_ire; - if (!ire_match_args(ire, margs->ift_addr, (ipaddr_t)0, - margs->ift_gateway, margs->ift_type, margs->ift_ipif, - margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl, - margs->ift_flags, NULL)) + /* + * Note: Since IPv6 has hash buckets instead of radix + * buckers we need to explicitly compare the addresses. + * That makes this less efficient since we will be called + * even if there is no alternatives just because the + * bucket has multiple IREs for different addresses. + */ + if (ire->ire_ipversion == IPV6_VERSION) { + if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, + &ire->ire_addr_v6)) + goto next_ire; + } + + /* + * For some reason find_best_route uses ire_mask. We do + * the same. + */ + if (ire->ire_ipversion == IPV4_VERSION ? + !ire_match_args(ire, margs->ift_addr, + ire->ire_mask, margs->ift_gateway, + margs->ift_type, margs->ift_ill, margs->ift_zoneid, + margs->ift_tsl, margs->ift_flags) : + !ire_match_args_v6(ire, &margs->ift_addr_v6, + &ire->ire_mask_v6, &margs->ift_gateway_v6, + margs->ift_type, margs->ift_ill, margs->ift_zoneid, + margs->ift_tsl, margs->ift_flags)) goto next_ire; - if (ire->ire_type & IRE_INTERFACE) { + if (margs->ift_zoneid != ALL_ZONES && + (ire->ire_type & IRE_OFFLINK)) { /* - * keep looking to see if there is a non-interface - * default ire, but save this one as a last resort. + * When we're in a zone, we're only + * interested in routers that are + * reachable through ipifs within our zone. 
*/ - if (maybe_ire == NULL) - maybe_ire = ire; - goto next_ire; + if (ire->ire_ipversion == IPV4_VERSION) { + if (!ire_gateway_ok_zone_v4( + ire->ire_gateway_addr, margs->ift_zoneid, + ire->ire_ill, margs->ift_tsl, ipst, + B_TRUE)) + goto next_ire; + } else { + if (!ire_gateway_ok_zone_v6( + &ire->ire_gateway_addr_v6, + margs->ift_zoneid, ire->ire_ill, + margs->ift_tsl, ipst, B_TRUE)) + goto next_ire; + } } - - if (zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); + mutex_enter(&ire->ire_lock); + /* Look for stale ire_badcnt and clear */ + if (ire->ire_badcnt != 0 && + (TICK_TO_SEC(lbolt64) - ire->ire_last_badcnt > + ipst->ips_ip_ire_badcnt_lifetime)) + ire->ire_badcnt = 0; + mutex_exit(&ire->ire_lock); + + if (ire->ire_badcnt == 0) { + /* We found one with a zero badcnt; done */ + ire_refhold(ire); + /* + * Care needed since irb_refrele grabs WLOCK to free + * the irb_t. + */ + if (ire->ire_ipversion == IPV4_VERSION) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + irb_refrele(irb_ptr); + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + } else { + rw_exit(&ipst->ips_ip6_ire_head_lock); + irb_refrele(irb_ptr); + rw_enter(&ipst->ips_ip6_ire_head_lock, + RW_READER); + } return (ire); } /* - * When we're in a non-global zone, we're only - * interested in routers that are - * reachable through ipifs within our zone. + * keep looking to see if there is a better (lower + * badcnt) matching IRE, but save this one as a last resort. + * If we find a lower badcnt pick that one as the last* resort. 
*/ - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, - IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl, - match_flags, ipst); - if (rire != NULL) { - ire_refrele(rire); - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - return (ire); + if (maybe_ire == NULL) { + maybe_ire = ire; + maybe_badcnt = ire->ire_badcnt; + } else if (ire->ire_badcnt < maybe_badcnt) { + maybe_ire = ire; + maybe_badcnt = ire->ire_badcnt; } + next_ire: - ire = (ire->ire_next ? ire->ire_next : irb_ptr->irb_ire); - if (ire == ire_origin) - break; + maxwalk--; +next_ire_skip: + ire = ire->ire_next; + if (ire == NULL) + ire = irb_ptr->irb_ire; } if (maybe_ire != NULL) - IRE_REFHOLD(maybe_ire); - IRB_REFRELE(irb_ptr); + ire_refhold(maybe_ire); + + /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ + if (ire->ire_ipversion == IPV4_VERSION) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + irb_refrele(irb_ptr); + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + } else { + rw_exit(&ipst->ips_ip6_ire_head_lock); + irb_refrele(irb_ptr); + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); + } return (maybe_ire); } @@ -2306,7 +956,7 @@ void irb_refhold_rn(struct radix_node *rn) { if ((rn->rn_flags & RNF_ROOT) == 0) - IRB_REFHOLD(&((rt_t *)(rn))->rt_irb); + irb_refhold(&((rt_t *)(rn))->rt_irb); } void @@ -2315,3 +965,587 @@ irb_refrele_rn(struct radix_node *rn) if ((rn->rn_flags & RNF_ROOT) == 0) irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); } + +/* + * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject + * routes this routine sets up a ire_nce_cache as well. The caller needs to + * lookup an nce for the multicast case. 
+ */ +ire_t * +ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, + uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) +{ + uint_t match_args; + uint_t ire_type; + ill_t *ill; + ire_t *ire; + ip_stack_t *ipst = ixa->ixa_ipst; + ipaddr_t v4dst; + in6_addr_t v6nexthop; + iaflags_t ixaflags = ixa->ixa_flags; + nce_t *nce; + + match_args = MATCH_IRE_SECATTR; + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + if (setsrcp != NULL) + ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); + if (errorp != NULL) + ASSERT(*errorp == 0); + + /* + * The content of the ixa will be different if IP_NEXTHOP, + * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set + */ + + if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) : + IN6_IS_ADDR_MULTICAST(v6dst)) { + /* Pick up the IRE_MULTICAST for the ill */ + if (ixa->ixa_multicast_ifindex != 0) { + ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixaflags & IXAF_SCOPEID_SET) { + /* sin6_scope_id takes precedence over ixa_ifindex */ + ASSERT(ixa->ixa_scopeid != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixa->ixa_ifindex != 0) { + /* + * In the ipmp case, the ixa_ifindex is set to + * point at an under_ill and we would return the + * ire_multicast() corresponding to that under_ill. 
+ */ + ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixaflags & IXAF_IS_IPV4) { + ipaddr_t v4setsrc = INADDR_ANY; + + ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, + multirtp, &v4setsrc); + if (setsrcp != NULL) + IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); + } else { + ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, + multirtp, setsrcp); + } + if (ill != NULL && IS_VNI(ill)) { + ill_refrele(ill); + ill = NULL; + } + if (ill == NULL) { + if (errorp != NULL) + *errorp = ENXIO; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + if (!(ill->ill_flags & ILLF_MULTICAST)) { + ill_refrele(ill); + if (errorp != NULL) + *errorp = EHOSTUNREACH; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + /* Get a refcnt on the single IRE_MULTICAST per ill */ + ire = ire_multicast(ill); + ill_refrele(ill); + if (generationp != NULL) + *generationp = ire->ire_generation; + if (errorp != NULL && + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + *errorp = EHOSTUNREACH; + } + return (ire); + } + + if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { + if (ixaflags & IXAF_SCOPEID_SET) { + /* sin6_scope_id takes precedence over ixa_ifindex */ + ASSERT(ixa->ixa_scopeid != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else { + ASSERT(ixa->ixa_ifindex != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } + if (ill != NULL && IS_VNI(ill)) { + ill_refrele(ill); + ill = NULL; + } + if (ill == NULL) { + if (errorp != NULL) + *errorp = ENXIO; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + /* + * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF + * so for both of them we need to be able look for an under + * interface. 
+ */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; + } else { + ill = NULL; + } + + if (ixaflags & IXAF_NEXTHOP_SET) { + /* IP_NEXTHOP was set */ + v6nexthop = ixa->ixa_nexthop_v6; + } else { + v6nexthop = *v6dst; + } + + ire_type = 0; + /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ + + /* + * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then + * we only look for an onlink IRE. + */ + if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { + match_args |= MATCH_IRE_TYPE; + ire_type = IRE_ONLINK; + } + + if (ixaflags & IXAF_IS_IPV4) { + ipaddr_t v4nexthop; + ipaddr_t v4setsrc = INADDR_ANY; + + IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); + ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, + ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, + ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); + if (setsrcp != NULL) + IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); + } else { + ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, + ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, + ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); + } + +#ifdef DEBUG + if (match_args & MATCH_IRE_TESTHIDDEN) { + ip3dbg(("looking for hidden; dst %x ire %p\n", + v4dst, (void *)ire)); + } +#endif + + if (ill != NULL) + ill_refrele(ill); + + if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + (ire->ire_type & IRE_MULTICAST)) { + /* No ire_nce_cache */ + return (ire); + } + + /* Setup ire_nce_cache if it doesn't exist or is condemned. */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL || nce->nce_is_condemned) { + mutex_exit(&ire->ire_lock); + (void) ire_revalidate_nce(ire); + } else { + mutex_exit(&ire->ire_lock); + } + return (ire); +} + +/* + * Find a route given some xmit attributes and a packet. + * Generic for IPv4 and IPv6 + * + * This never returns NULL. But when it returns the IRE_NOROUTE + * it might set errorp. 
+ */ +ire_t * +ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, + int *errorp, boolean_t *multirtp) +{ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + in6_addr_t v6dst; + + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); + + return (ip_select_route(&v6dst, ixa, generationp, + NULL, errorp, multirtp)); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, + NULL, errorp, multirtp)); + } +} + +ire_t * +ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, + ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) +{ + in6_addr_t v6dst; + ire_t *ire; + in6_addr_t setsrc; + + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); + + IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); + + setsrc = ipv6_all_zeros; + ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, + multirtp); + if (v4setsrcp != NULL) + IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); + return (ire); +} + +/* + * Recursively look for a route to the destination. Can also match on + * the zoneid, ill, and label. Used for the data paths. See also + * ire_route_recursive. + * + * If ill is set this means we will match it by adding MATCH_IRE_ILL. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
+ */ +ire_t * +ire_route_recursive_impl_v4(ire_t *ire, + ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, + tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) +{ + int i, j; + ire_t *ires[MAX_IRE_RECURSION]; + uint_t generation; + uint_t generations[MAX_IRE_RECURSION]; + boolean_t need_refrele = B_FALSE; + boolean_t invalidate = B_FALSE; + int prefs[MAX_IRE_RECURSION]; + ill_t *ill = NULL; + + if (setsrcp != NULL) + ASSERT(*setsrcp == INADDR_ANY); + if (gwattrp != NULL) + ASSERT(*gwattrp == NULL); + + if (ill_arg != NULL) + match_args |= MATCH_IRE_ILL; + + /* + * We iterate up to three times to resolve a route, even though + * we have four slots in the array. The extra slot is for an + * IRE_IF_CLONE we might need to create. + */ + i = 0; + while (i < MAX_IRE_RECURSION - 1) { + /* ire_ftable_lookup handles round-robin/ECMP */ + if (ire == NULL) { + ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, + (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + match_args, xmit_hint, ipst, &generation); + } else { + /* Caller passed it; extra hold since we will rele */ + ire_refhold(ire); + if (generationp != NULL) + generation = *generationp; + else + generation = IRE_GENERATION_VERIFY; + } + if (ire == NULL) + ire = ire_reject(ipst, B_FALSE); + + /* Need to return the ire with RTF_REJECT|BLACKHOLE */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + goto error; + + ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ + + prefs[i] = ire_pref(ire); + if (i != 0) { + /* + * Don't allow anything unusual past the first + * iteration. 
+ */ + if ((ire->ire_type & + (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || + prefs[i] <= prefs[i-1]) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + goto error; + } + } + /* We have a usable IRE */ + ires[i] = ire; + generations[i] = generation; + i++; + + /* The first RTF_SETSRC address is passed back if setsrcp */ + if ((ire->ire_flags & RTF_SETSRC) && + setsrcp != NULL && *setsrcp == INADDR_ANY) { + ASSERT(ire->ire_setsrc_addr != INADDR_ANY); + *setsrcp = ire->ire_setsrc_addr; + } + + /* The first ire_gw_secattr is passed back if gwattrp */ + if (ire->ire_gw_secattr != NULL && + gwattrp != NULL && *gwattrp == NULL) + *gwattrp = ire->ire_gw_secattr; + + /* + * Check if we have a short-cut pointer to an IRE for this + * destination, and that the cached dependency isn't stale. + * In that case we've rejoined an existing tree towards a + * parent, thus we don't need to continue the loop to + * discover the rest of the tree. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + ire = NULL; + goto done; + } + mutex_exit(&ire->ire_lock); + + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 32 bit mask. + */ + if (ire->ire_nce_capable) { + ire = NULL; + goto done; + } + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + /* + * For an IRE_INTERFACE we create an IRE_IF_CLONE for this + * particular destination + */ + if (ire->ire_type & IRE_INTERFACE) { + in6_addr_t v6nexthop; + ire_t *clone; + + ASSERT(ire->ire_masklen != IPV4_ABITS); + + /* + * In the case of ip_input and ILLF_FORWARDING not + * being set, and in the case of RTM_GET, + * there is no point in allocating + * an IRE_IF_CLONE. We return the IRE_INTERFACE. + * Note that !allocate can result in a ire_dep_parent + * which is IRE_IF_* without an IRE_IF_CLONE. 
+ * We recover from that when we need to send packets + * by ensuring that the generations become + * IRE_GENERATION_VERIFY in this case. + */ + if (!allocate) { + invalidate = B_TRUE; + ire = NULL; + goto done; + } + + IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); + + clone = ire_create_if_clone(ire, &v6nexthop, + &generation); + if (clone == NULL) { + /* + * Temporary failure - no memory. + * Don't want caller to cache IRE_NOROUTE. + */ + invalidate = B_TRUE; + ire = ire_blackhole(ipst, B_FALSE); + goto error; + } + /* + * Make clone next to last entry and the + * IRE_INTERFACE the last in the dependency + * chain since the clone depends on the + * IRE_INTERFACE. + */ + ASSERT(i >= 1); + ASSERT(i < MAX_IRE_RECURSION); + + ires[i] = ires[i-1]; + generations[i] = generations[i-1]; + ires[i-1] = clone; + generations[i-1] = generation; + i++; + + ire = NULL; + goto done; + } + + /* + * We only match on the type and optionally ILL when + * recursing. The type match is used by some callers + * to exclude certain types (such as IRE_IF_CLONE or + * IRE_LOCAL|IRE_LOOPBACK). + */ + match_args &= MATCH_IRE_TYPE; + nexthop = ire->ire_gateway_addr; + if (ill == NULL && ire->ire_ill != NULL) { + ill = ire->ire_ill; + need_refrele = B_TRUE; + ill_refhold(ill); + match_args |= MATCH_IRE_ILL; + } + ire = NULL; + } + ASSERT(ire == NULL); + ire = ire_reject(ipst, B_FALSE); + +error: + ASSERT(ire != NULL); + if (need_refrele) + ill_refrele(ill); + + /* + * In the case of MULTIRT we want to try a different IRE the next + * time. We let the next packet retry in that case. + */ + if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) + (void) ire_no_good(ires[0]); + +cleanup: + /* cleanup ires[i] */ + ire_dep_unbuild(ires, i); + for (j = 0; j < i; j++) + ire_refrele(ires[j]); + + ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); + /* + * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the + * ip_select_route since the reject or lack of memory might be gone. 
+ */ + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; + return (ire); + +done: + ASSERT(ire == NULL); + if (need_refrele) { + ill_refrele(ill); + ill = NULL; + } + + /* Build dependencies */ + if (!ire_dep_build(ires, generations, i)) { + /* Something in chain was condemned; tear it apart */ + ire = ire_reject(ipst, B_FALSE); + goto cleanup; + } + + /* + * Release all refholds except the one for ires[0] that we + * will return to the caller. + */ + for (j = 1; j < i; j++) + ire_refrele(ires[j]); + + if (invalidate) { + /* + * Since we needed to allocate but couldn't we need to make + * sure that the dependency chain is rebuilt the next time. + */ + ire_dep_invalidate_generations(ires[0]); + generation = IRE_GENERATION_VERIFY; + } else { + /* + * IREs can have been added or deleted while we did the + * recursive lookup and we can't catch those until we've built + * the dependencies. We verify the stored + * ire_dep_parent_generation to catch any such changes and + * return IRE_GENERATION_VERIFY (which will cause + * ip_select_route to be called again so we can redo the + * recursive lookup next time we send a packet. + */ + generation = ire_dep_validate_generations(ires[0]); + if (generations[0] != ires[0]->ire_generation) { + /* Something changed at the top */ + generation = IRE_GENERATION_VERIFY; + } + } + if (generationp != NULL) + *generationp = generation; + + return (ires[0]); +} + +ire_t * +ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, + tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) +{ + return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, + zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, + gwattrp, generationp)); +} + +/* + * Recursively look for a route to the destination. 
+ * We only handle a destination match here, yet we have the same arguments + * as the full match to allow function pointers to select between the two. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. + */ +ire_t * +ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, + uint32_t xmit_hint, ip_stack_t *ipst) +{ + ire_t *ire; + ire_t *ire1; + uint_t generation; + + /* ire_ftable_lookup handles round-robin/ECMP */ + ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, + &generation); + ASSERT(ire != NULL); + + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 32 bit mask. + */ + if (ire->ire_nce_capable) + return (ire); + + /* + * If the IRE has a current cached parent we know that the whole + * parent chain is current, hence we don't need to discover and + * build any dependencies by doing a recursive lookup. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + return (ire); + } + mutex_exit(&ire->ire_lock); + + /* + * Fallback to loop in the normal code starting with the ire + * we found. Normally this would return the same ire. 
+ */ + ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, + &generation); + ire_refrele(ire); + return (ire1); +} diff --git a/usr/src/uts/common/inet/ip/ip_helper_stream.c b/usr/src/uts/common/inet/ip/ip_helper_stream.c index 6f5608e950..3fa6364417 100644 --- a/usr/src/uts/common/inet/ip/ip_helper_stream.c +++ b/usr/src/uts/common/inet/ip/ip_helper_stream.c @@ -58,14 +58,14 @@ static struct qinit ip_helper_stream_winit = { &ip_helper_stream_info, NULL, NULL, NULL, STRUIOT_NONE }; -#define IP_USE_HELPER_CACHE (ip_helper_stream_cache != NULL) - /* * set the q_ptr of the 'q' to the conn_t pointer passed in */ static void ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp) { + conn_t *connp = *((conn_t **)mp->b_cont->b_rptr); + /* * This operation is allowed only on helper streams with kcred */ @@ -75,24 +75,12 @@ ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp) return; } - if (IP_USE_HELPER_CACHE) { - ip_helper_stream_info_t *ip_helper_info; - - ip_helper_info = *((ip_helper_stream_info_t **) - mp->b_cont->b_rptr); - ip_helper_info->iphs_minfo = q->q_ptr; - ip_helper_info->iphs_rq = RD(q); - ip_helper_info->iphs_wq = WR(q); - } else { - conn_t *connp = *((conn_t **)mp->b_cont->b_rptr); - - connp->conn_helper_info->iphs_minfo = q->q_ptr; - connp->conn_helper_info->iphs_rq = RD(q); - connp->conn_helper_info->iphs_wq = WR(q); - WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp; - connp->conn_rq = RD(q); - connp->conn_wq = WR(q); - } + connp->conn_helper_info->iphs_minfo = q->q_ptr; + connp->conn_helper_info->iphs_rq = RD(q); + connp->conn_helper_info->iphs_wq = WR(q); + WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp; + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); miocack(q, mp, 0, 0); } @@ -104,17 +92,13 @@ ip_helper_wput(queue_t *q, mblk_t *mp) iocp->ioc_cmd == SIOCSQPTR) { ip_helper_share_conn(q, mp, iocp->ioc_cr); } else { - conn_t *connp = (conn_t *)q->q_ptr; - - if 
(connp->conn_af_isv6) { - ip_wput_v6(q, mp); - } else { - ip_wput(q, mp); - } + /* We only handle ioctl related messages here */ + ASSERT(DB_TYPE(mp) != M_DATA); + ip_wput_nondata(q, mp); } } -/* ARGSUSED */ +/* ARGSUSED3 */ int ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6) @@ -126,10 +110,8 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, ASSERT(RD(q) == q); - ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_NOSLEEP); - if (ip_minfop == NULL) { - return (ENOMEM); - } + ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_SLEEP); + ASSERT(ip_minfop != NULL); ip_minfop->ip_minfo_dev = 0; ip_minfop->ip_minfo_arena = NULL; @@ -171,7 +153,7 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int ip_helper_stream_close(queue_t *q, int flag) { @@ -189,305 +171,91 @@ ip_helper_stream_close(queue_t *q, int flag) /* * Public interface for creating an IP stream with shared conn_t + * Handles multiple callers in parallel by using conn_lock. + * Note that we allocate the helper stream without any locks, which means + * we might need to free it if we had two threads doing this concurrently + * for the conn_t. */ -/* ARGSUSED */ int ip_create_helper_stream(conn_t *connp, ldi_ident_t li) { + ip_helper_stream_info_t *helper; int error; int ret; ASSERT(!servicing_interrupt()); - error = 0; - if (IP_USE_HELPER_CACHE) { - connp->conn_helper_info = kmem_cache_alloc( - ip_helper_stream_cache, KM_NOSLEEP); - if (connp->conn_helper_info == NULL) - return (EAGAIN); - connp->conn_rq = connp->conn_helper_info->iphs_rq; - connp->conn_wq = connp->conn_helper_info->iphs_wq; - /* - * Doesn't need to hold the QLOCK for there is no one else - * should have a pointer to this queue. 
- */ - connp->conn_rq->q_flag |= QWANTR; - connp->conn_wq->q_flag |= QWANTR; - - connp->conn_rq->q_ptr = connp; - connp->conn_wq->q_ptr = connp; - } else { - ASSERT(connp->conn_helper_info == NULL); - connp->conn_helper_info = kmem_alloc( - sizeof (ip_helper_stream_info_t), KM_SLEEP); - /* - * open ip device via the layered interface. - * pass in kcred as some threads do not have the - * priviledge to open /dev/ip and the check in - * secpolicy_spec_open() will fail the open - */ - error = ldi_open_by_name(connp->conn_af_isv6 ? - DEV_IP6 : DEV_IP, IP_HELPER_STR, - kcred, &connp->conn_helper_info->iphs_handle, li); - - if (error != 0) { - kmem_free(connp->conn_helper_info, - (sizeof (ip_helper_stream_info_t))); - connp->conn_helper_info = NULL; - return (error); - } - /* - * Share connp with the helper stream - */ - error = ldi_ioctl(connp->conn_helper_info->iphs_handle, - SIOCSQPTR, (intptr_t)connp, FKIOCTL, kcred, &ret); - - if (error != 0) { - /* - * Passing in a zero flag indicates that an error - * occured and stream was not shared - */ - (void) ldi_close(connp->conn_helper_info->iphs_handle, - 0, kcred); - kmem_free(connp->conn_helper_info, - (sizeof (ip_helper_stream_info_t))); - connp->conn_helper_info = NULL; - } + if (connp->conn_helper_info != NULL) { + /* Already allocated */ + return (0); } - return (error); -} - -/* - * Public interface for freeing IP helper stream - */ -/* ARGSUSED */ -void -ip_free_helper_stream(conn_t *connp) -{ - ASSERT(!servicing_interrupt()); - if (IP_USE_HELPER_CACHE) { - - if (connp->conn_helper_info == NULL) - return; - ASSERT(connp->conn_helper_info->iphs_rq != NULL); - ASSERT(connp->conn_helper_info->iphs_wq != NULL); - - /* Prevent service procedures from being called */ - disable_svc(connp->conn_helper_info->iphs_rq); - - /* Wait until service procedure of each queue is run */ - wait_svc(connp->conn_helper_info->iphs_rq); - - /* Cleanup any pending ioctls */ - conn_ioctl_cleanup(connp); - - /* Allow service procedures to 
be called again */ - enable_svc(connp->conn_helper_info->iphs_rq); - - /* Flush the queues */ - flushq(connp->conn_helper_info->iphs_rq, FLUSHALL); - flushq(connp->conn_helper_info->iphs_wq, FLUSHALL); - - connp->conn_helper_info->iphs_rq->q_ptr = NULL; - connp->conn_helper_info->iphs_wq->q_ptr = NULL; - - kmem_cache_free(ip_helper_stream_cache, - connp->conn_helper_info); - } else { - ASSERT( - connp->conn_helper_info->iphs_handle != NULL); - - connp->conn_helper_info->iphs_rq->q_ptr = - connp->conn_helper_info->iphs_wq->q_ptr = - connp->conn_helper_info->iphs_minfo; - (void) ldi_close(connp->conn_helper_info->iphs_handle, - IP_HELPER_STR, kcred); - kmem_free(connp->conn_helper_info, - sizeof (ip_helper_stream_info_t)); - } - connp->conn_helper_info = NULL; -} - -/* - * create a T_SVR4_OPTMGMT_REQ TPI message and send down the IP stream - */ -static int -ip_send_option_request(conn_t *connp, uint_t optset_context, int level, - int option_name, const void *optval, t_uscalar_t optlen, cred_t *cr) -{ - struct T_optmgmt_req *optmgmt_reqp; - struct opthdr *ohp; - ssize_t size; - mblk_t *mp; - - size = sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + optlen; - /* Not used to generate UCRED, thus don't need correct pid */ - mp = allocb_cred(size, cr, NOPID); - if (mp == NULL) - return (ENOMEM); - - mp->b_datap->db_type = M_PROTO; - optmgmt_reqp = (struct T_optmgmt_req *)mp->b_wptr; - - optmgmt_reqp->PRIM_type = T_SVR4_OPTMGMT_REQ; - optmgmt_reqp->MGMT_flags = optset_context; - optmgmt_reqp->OPT_length = (t_scalar_t)sizeof (struct opthdr) + optlen; - optmgmt_reqp->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_req); - - mp->b_wptr += sizeof (struct T_optmgmt_req); - - ohp = (struct opthdr *)mp->b_wptr; - ohp->level = level; - ohp->name = option_name; - ohp->len = optlen; - - mp->b_wptr += sizeof (struct opthdr); - - if (optval != NULL) { - bcopy(optval, mp->b_wptr, optlen); - } else { - bzero(mp->b_wptr, optlen); - } - mp->b_wptr += optlen; + error = 0; + 
helper = kmem_alloc(sizeof (ip_helper_stream_info_t), KM_SLEEP); /* - * Send down the primitive + * open ip device via the layered interface. + * pass in kcred as some threads do not have the + * priviledge to open /dev/ip and the check in + * secpolicy_spec_open() will fail the open */ - return (ldi_putmsg(connp->conn_helper_info->iphs_handle, mp)); -} + error = ldi_open_by_name((connp->conn_family == AF_INET6 ? DEV_IP6 : + DEV_IP), IP_HELPER_STR, kcred, &helper->iphs_handle, li); -/* - * wait/process the response to T_SVR4_OPTMGMT_REQ TPI message - */ -static int -ip_get_option_response(conn_t *connp, uint_t optset_context, void *optval, - t_uscalar_t *optlenp) -{ - union T_primitives *tpr; - int error; - mblk_t *mp; - - mp = NULL; - - ASSERT(optset_context == T_CHECK || optset_context == T_NEGOTIATE); - error = ldi_getmsg(connp->conn_helper_info->iphs_handle, &mp, NULL); if (error != 0) { + kmem_free(helper, sizeof (ip_helper_stream_info_t)); return (error); } - - if (DB_TYPE(mp) != M_PCPROTO || MBLKL(mp) < sizeof (tpr->type)) { - error = EPROTO; - goto done; - } - - tpr = (union T_primitives *)mp->b_rptr; - - switch (tpr->type) { - case T_OPTMGMT_ACK: - if (MBLKL(mp) < TOPTMGMTACKSZ) - error = EPROTO; - break; - case T_ERROR_ACK: - if (MBLKL(mp) < TERRORACKSZ) { - error = EPROTO; - break; - } - - if (tpr->error_ack.TLI_error == TSYSERR) - error = tpr->error_ack.UNIX_error; - else - error = proto_tlitosyserr(tpr->error_ack.TLI_error); - break; - default: - error = EPROTO; - break; + /* Make sure we are the only one */ + mutex_enter(&connp->conn_lock); + if (connp->conn_helper_info != NULL) { + /* Some other thread won - discard this stream */ + mutex_exit(&connp->conn_lock); + (void) ldi_close(helper->iphs_handle, 0, kcred); + kmem_free(helper, sizeof (ip_helper_stream_info_t)); + return (0); } + connp->conn_helper_info = helper; + /* + * Share connp with the helper stream. We hold conn_lock across this + * operation. 
 + */ + error = ldi_ioctl(helper->iphs_handle, SIOCSQPTR, (intptr_t)connp, + FKIOCTL, kcred, &ret); - if ((optset_context == T_CHECK) && (error == 0)) { - struct opthdr *opt_res; - t_uscalar_t len; - t_uscalar_t size; - t_uscalar_t maxlen = *optlenp; - void *option; - struct T_optmgmt_ack *optmgmt_ack; - - optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr; - opt_res = (struct opthdr *) - ((uintptr_t)mp->b_rptr + optmgmt_ack->OPT_offset); - /* - * Check mblk boundary - */ - if (!MBLKIN(mp, optmgmt_ack->OPT_offset, - optmgmt_ack->OPT_length)) { - error = EPROTO; - goto done; - } - - /* - * Check alignment - */ - if ((((uintptr_t)opt_res) & (__TPI_ALIGN_SIZE - 1)) != 0) { - error = EPROTO; - goto done; - } - - option = &opt_res[1]; - - /* check to ensure that the option is within bounds */ - if ((((uintptr_t)option + opt_res->len) < (uintptr_t)option) || - !MBLKIN(mp, sizeof (struct opthdr), opt_res->len)) { - error = EPROTO; - goto done; - } - - len = opt_res->len; - size = MIN(len, maxlen); - + if (error != 0) { /* - * Copy data + * Passing in a zero flag indicates that an error + * occurred and stream was not shared */ - bcopy(option, optval, size); - bcopy(&size, optlenp, sizeof (size)); + (void) ldi_close(helper->iphs_handle, 0, kcred); + kmem_free(helper, sizeof (ip_helper_stream_info_t)); + connp->conn_helper_info = NULL; } - -done: - freemsg(mp); + mutex_exit(&connp->conn_lock); return (error); } /* - * Public interface to get socketoptions via the ip helper stream. - */ -int -ip_get_options(conn_t *connp, int level, int option_name, void *optval, - t_uscalar_t *optlenp, cred_t *cr) -{ - int error; - - error = ip_send_option_request(connp, T_CHECK, level, option_name, NULL, - *optlenp, cr); - if (error) - return (error); - - return (ip_get_option_response(connp, T_CHECK, optval, optlenp)); -} - -/* - * Public interface to set socket options via the ip helper stream. 
+ * Public interface for freeing IP helper stream + * Caller must ensure no concurrent use of the conn_t, which is normally + * done by calling this from the close routine when the conn_t is quiesced. */ -int -ip_set_options(conn_t *connp, int level, int option_name, const void *optval, - t_uscalar_t optlen, cred_t *cr) +void +ip_free_helper_stream(conn_t *connp) { + ASSERT(!servicing_interrupt()); - int error; + if (connp->conn_helper_info == NULL) + return; - error = ip_send_option_request(connp, T_NEGOTIATE, level, option_name, - optval, optlen, cr); - if (error) - return (error); + ASSERT(connp->conn_helper_info->iphs_handle != NULL); - return (ip_get_option_response(connp, T_NEGOTIATE, (void *)optval, - &optlen)); + connp->conn_helper_info->iphs_rq->q_ptr = + connp->conn_helper_info->iphs_wq->q_ptr = + connp->conn_helper_info->iphs_minfo; + (void) ldi_close(connp->conn_helper_info->iphs_handle, + IP_HELPER_STR, kcred); + kmem_free(connp->conn_helper_info, sizeof (ip_helper_stream_info_t)); + connp->conn_helper_info = NULL; } diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index b175f4530f..6066da35b4 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -72,6 +72,7 @@ #include <inet/mi.h> #include <inet/nd.h> #include <inet/arp.h> +#include <inet/ip_arp.h> #include <inet/mib2.h> #include <inet/ip.h> #include <inet/ip6.h> @@ -88,12 +89,6 @@ #include <inet/ip_netinfo.h> #include <inet/ilb_ip.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> -#include <inet/ipsec_impl.h> -#include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -119,15 +114,6 @@ typedef struct ipft_s { #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ -typedef struct ip_sock_ar_s { - union { - area_t ip_sock_area; - ared_t ip_sock_ared; - 
areq_t ip_sock_areq; - } ip_sock_ar_u; - queue_t *ip_sock_ar_q; -} ip_sock_ar_t; - static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int nd_ill_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr); @@ -148,7 +134,7 @@ static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, queue_t *q, mblk_t *mp, boolean_t need_up); static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, - int ioccmd, struct linkblk *li, boolean_t doconsist); + int ioccmd, struct linkblk *li); static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); static void ip_wput_ioctl(queue_t *q, mblk_t *mp); static void ipsq_flush(ill_t *ill); @@ -159,17 +145,14 @@ static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, boolean_t insert); -static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); +static void ipif_delete_bcast_ires(ipif_t *ipif); +static int ipif_add_ires_v4(ipif_t *, boolean_t); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6); -static void ipif_down_delete_ire(ire_t *ire, char *ipif); -static void ipif_delete_cache_ire(ire_t *, char *); static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); -static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *ppa); @@ -177,17 +160,13 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, 
zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); -static void ipif_update_other_ipifs(ipif_t *old_ipif); + ip_stack_t *); static int ill_alloc_ppa(ill_if_t *, ill_t *); -static int ill_arp_off(ill_t *ill); -static int ill_arp_on(ill_t *ill); static void ill_delete_interface_type(ill_if_t *); static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); -static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); static void ill_phyint_reinit(ill_t *ill); @@ -199,38 +178,22 @@ static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid; static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid; -static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; -static ip_v6mapinfo_func_t ip_nodef_v6mapinfo; -static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; -static ip_v4mapinfo_func_t ip_nodef_v4mapinfo; -static void ipif_save_ire(ipif_t *, ire_t *); -static void ipif_remove_ire(ipif_t *, ire_t *); -static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); +static ip_v4mapinfo_func_t ip_ether_v4_mapping; +static ip_v6mapinfo_func_t ip_ether_v6_mapping; +static ip_v4mapinfo_func_t ip_ib_v4_mapping; +static ip_v6mapinfo_func_t ip_ib_v6_mapping; +static ip_v4mapinfo_func_t ip_mbcast_mapping; +static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); static void phyint_free(phyint_t *); -/* - * Per-ill IPsec capabilities management. 
- */ -static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); -static void ill_ipsec_capab_free(ill_ipsec_capab_t *); -static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); -static void ill_ipsec_capab_delete(ill_t *, uint_t); -static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); -static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, - boolean_t); +static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); -static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); -static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, - int *); static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); static void ill_capability_dld_ack(ill_t *, mblk_t *, dl_capability_sub_t *); @@ -242,11 +205,11 @@ static void ill_capability_send(ill_t *, mblk_t *); static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); -static void conn_cleanup_stale_ire(conn_t *, caddr_t); +static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -255,182 +218,10 @@ static void ipif_trace_cleanup(const ipif_t *); 
*/ int ip_min_frag_prune_time = 0; -/* - * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY - * and the IPsec DOI - */ -#define MAX_IPSEC_ALGS 256 - -#define BITSPERBYTE 8 -#define BITS(type) (BITSPERBYTE * (long)sizeof (type)) - -#define IPSEC_ALG_ENABLE(algs, algid) \ - ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ - (1 << ((algid) % BITS(ipsec_capab_elem_t)))) - -#define IPSEC_ALG_IS_ENABLED(algid, algs) \ - ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ - (1 << ((algid) % BITS(ipsec_capab_elem_t)))) - -typedef uint8_t ipsec_capab_elem_t; - -/* - * Per-algorithm parameters. Note that at present, only encryption - * algorithms have variable keysize (IKE does not provide a way to negotiate - * auth algorithm keysize). - * - * All sizes here are in bits. - */ -typedef struct -{ - uint16_t minkeylen; - uint16_t maxkeylen; -} ipsec_capab_algparm_t; - -/* - * Per-ill capabilities. - */ -struct ill_ipsec_capab_s { - ipsec_capab_elem_t *encr_hw_algs; - ipsec_capab_elem_t *auth_hw_algs; - uint32_t algs_size; /* size of _hw_algs in bytes */ - /* algorithm key lengths */ - ipsec_capab_algparm_t *encr_algparm; - uint32_t encr_algparm_size; - uint32_t encr_algparm_end; -}; - -/* - * The field values are larger than strictly necessary for simple - * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 
- */ -static area_t ip_area_template = { - AR_ENTRY_ADD, /* area_cmd */ - sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), - /* area_name_offset */ - /* area_name_length temporarily holds this structure length */ - sizeof (area_t), /* area_name_length */ - IP_ARP_PROTO_TYPE, /* area_proto */ - sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ - IP_ADDR_LEN, /* area_proto_addr_length */ - sizeof (ip_sock_ar_t) + IP_ADDR_LEN, - /* area_proto_mask_offset */ - 0, /* area_flags */ - sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, - /* area_hw_addr_offset */ - /* Zero length hw_addr_length means 'use your idea of the address' */ - 0 /* area_hw_addr_length */ -}; - -/* - * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver - * support - */ -static area_t ip6_area_template = { - AR_ENTRY_ADD, /* area_cmd */ - sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), - /* area_name_offset */ - /* area_name_length temporarily holds this structure length */ - sizeof (area_t), /* area_name_length */ - IP_ARP_PROTO_TYPE, /* area_proto */ - sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ - IPV6_ADDR_LEN, /* area_proto_addr_length */ - sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, - /* area_proto_mask_offset */ - 0, /* area_flags */ - sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, - /* area_hw_addr_offset */ - /* Zero length hw_addr_length means 'use your idea of the address' */ - 0 /* area_hw_addr_length */ -}; - -static ared_t ip_ared_template = { - AR_ENTRY_DELETE, - sizeof (ared_t) + IP_ADDR_LEN, - sizeof (ared_t), - IP_ARP_PROTO_TYPE, - sizeof (ared_t), - IP_ADDR_LEN, - 0 -}; - -static ared_t ip6_ared_template = { - AR_ENTRY_DELETE, - sizeof (ared_t) + IPV6_ADDR_LEN, - sizeof (ared_t), - IP_ARP_PROTO_TYPE, - sizeof (ared_t), - IPV6_ADDR_LEN, - 0 -}; - -/* - * A template for an IPv6 AR_ENTRY_QUERY template has not been created, as - * as the areq doesn't include an IP address in ill_dl_up() (the only place a - * areq is 
used). - */ -static areq_t ip_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IP_ADDR_LEN, /* target addr_length */ - 0, /* flags */ - sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ - IP_ADDR_LEN, /* sender addr length */ - AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */ - AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ - AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */ - /* anything else filled in by the code */ -}; - -static arc_t ip_aru_template = { - AR_INTERFACE_UP, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_ard_template = { - AR_INTERFACE_DOWN, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_aron_template = { - AR_INTERFACE_ON, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_aroff_template = { - AR_INTERFACE_OFF, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arma_t ip_arma_multi_template = { - AR_MAPPING_ADD, - sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, - /* Name offset */ - sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ - IP_ARP_PROTO_TYPE, - sizeof (arma_t), /* proto_addr_offset */ - IP_ADDR_LEN, /* proto_addr_length */ - sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ - sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ - ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ - sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ - IP_MAX_HW_LEN, /* hw_addr_length */ - 0, /* hw_mapping_start */ -}; - static ipft_t ip_ioctl_ftbl[] = { { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, { 
IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), IPFT_F_NO_REPLY }, - { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), - IPFT_F_NO_REPLY }, { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, { 0 } }; @@ -444,35 +235,38 @@ static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static ip_m_t ip_m_tbl[] = { { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, ip_nodef_v6intfid }, { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, ip_nodef_v6intfid }, { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid, + ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, + ip_nodef_v6intfid }, + { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, + ip_ipv4_v6destintfid }, + { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, + ip_ipv6_v6destintfid }, + { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, ip_nodef_v6intfid }, - { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, 
ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_ipv4_v6destintfid }, - { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv6_v6intfid, ip_ipv6_v6destintfid }, - { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_nodef_v6intfid }, { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid } }; @@ -567,149 +361,6 @@ ill_allocate_mibs(ill_t *ill) } /* - * Common code for preparation of ARP commands. Two points to remember: - * 1) The ill_name is tacked on at the end of the allocated space so - * the templates name_offset field must contain the total space - * to allocate less the name length. - * - * 2) The templates name_length field should contain the *template* - * length. We use it as a parameter to bcopy() and then write - * the real ill_name_length into the name_length field of the copy. - * (Always called as writer.) 
- */ -mblk_t * -ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) -{ - arc_t *arc = (arc_t *)template; - char *cp; - int len; - mblk_t *mp; - uint_t name_length = ill->ill_name_length; - uint_t template_len = arc->arc_name_length; - - len = arc->arc_name_offset + name_length; - mp = allocb(len, BPRI_HI); - if (mp == NULL) - return (NULL); - cp = (char *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&cp[len]; - if (template_len) - bcopy(template, cp, template_len); - if (len > template_len) - bzero(&cp[template_len], len - template_len); - mp->b_datap->db_type = M_PROTO; - - arc = (arc_t *)cp; - arc->arc_name_length = name_length; - cp = (char *)arc + arc->arc_name_offset; - bcopy(ill->ill_name, cp, name_length); - - if (addr) { - area_t *area = (area_t *)mp->b_rptr; - - cp = (char *)area + area->area_proto_addr_offset; - bcopy(addr, cp, area->area_proto_addr_length); - if (area->area_cmd == AR_ENTRY_ADD) { - cp = (char *)area; - len = area->area_proto_addr_length; - if (area->area_proto_mask_offset) - cp += area->area_proto_mask_offset; - else - cp += area->area_proto_addr_offset + len; - while (len-- > 0) - *cp++ = (char)~0; - } - } - return (mp); -} - -mblk_t * -ipif_area_alloc(ipif_t *ipif, uint_t optflags) -{ - caddr_t addr; - mblk_t *mp; - area_t *area; - uchar_t *areap; - ill_t *ill = ipif->ipif_ill; - - if (ill->ill_isv6) { - ASSERT(ill->ill_flags & ILLF_XRESOLV); - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - areap = (uchar_t *)&ip6_area_template; - } else { - addr = (caddr_t)&ipif->ipif_lcl_addr; - areap = (uchar_t *)&ip_area_template; - } - - if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) - return (NULL); - - /* - * IPMP requires that the hardware address be included in all - * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. - * If there are no active underlying ills in the group (and thus no - * hardware address, DAD will be deferred until an underlying ill - * becomes active. 
- */ - if (IS_IPMP(ill)) { - if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { - freemsg(mp); - return (NULL); - } - } else { - ill_refhold(ill); - } - - area = (area_t *)mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; - area->area_flags |= optflags; - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, - area->area_hw_addr_length); - - ill_refrele(ill); - return (mp); -} - -mblk_t * -ipif_ared_alloc(ipif_t *ipif) -{ - caddr_t addr; - uchar_t *aredp; - - if (ipif->ipif_ill->ill_isv6) { - ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - aredp = (uchar_t *)&ip6_ared_template; - } else { - addr = (caddr_t)&ipif->ipif_lcl_addr; - aredp = (uchar_t *)&ip_ared_template; - } - - return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); -} - -mblk_t * -ill_ared_alloc(ill_t *ill, ipaddr_t addr) -{ - return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, - (char *)&addr)); -} - -mblk_t * -ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) -{ - mblk_t *mp = ill_arp_alloc(ill, template, 0); - arie_t *arie; - - if (mp != NULL) { - arie = (arie_t *)mp->b_rptr; - (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); - } - return (mp); -} - -/* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control * stream is being closed. @@ -735,8 +386,8 @@ ill_delete(ill_t *ill) * remove it from the list, and free the data structure. * Walk down the ipif list and remove the logical interfaces * first before removing the main ipif. We can't unplumb - * zeroth interface first in the case of IPv6 as reset_conn_ill - * -> ip_ll_delmulti_v6 de-references ill_ipif for checking + * zeroth interface first in the case of IPv6 as update_conn_ill + * -> ip_ll_multireq de-references ill_ipif for checking * POINTOPOINT. 
* * If ill_ipif was not properly initialized (i.e low on memory), @@ -747,22 +398,15 @@ ill_delete(ill_t *ill) ipif_free(ipif); /* - * Used only by ill_arp_on and ill_arp_off, which are writers. - * So nobody can be using this mp now. Free the mp allocated for - * honoring ILLF_NOARP + * clean out all the nce_t entries that depend on this + * ill for the ill_phys_addr. */ - freemsg(ill->ill_arp_on_mp); - ill->ill_arp_on_mp = NULL; + nce_flush(ill, B_TRUE); /* Clean up msgs on pending upcalls for mrouted */ reset_mrt_ill(ill); - /* - * ipif_free -> reset_conn_ipif will remove all multicast - * references for IPv4. For IPv6, we need to do it here as - * it points only at ills. - */ - reset_conn_ill(ill); + update_conn_ill(ill, ipst); /* * Remove multicast references added as a result of calls to @@ -786,6 +430,16 @@ ill_delete(ill_t *ill) sctp_update_ill(ill, SCTP_ILL_REMOVE); /* + * Walk all CONNs that can have a reference on an ire or nce for this + * ill (we actually walk all that now have stale references). + */ + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); + + /* With IPv6 we have dce_ifindex. Cleanup for neatness */ + if (ill->ill_isv6) + dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); + + /* * If an address on this ILL is being used as a source address then * clear out the pointers in other ILLs that point to this ILL. */ @@ -828,12 +482,10 @@ ill_delete_tail(ill_t *ill) for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } - ASSERT(ill->ill_ipif_dup_count == 0 && - ill->ill_arp_down_mp == NULL && - ill->ill_arp_del_mapping_mp == NULL); + ASSERT(ill->ill_ipif_dup_count == 0); /* * If polling capability is enabled (which signifies direct @@ -864,23 +516,6 @@ ill_delete_tail(ill_t *ill) /* * Free capabilities. 
*/ - if (ill->ill_ipsec_capab_ah != NULL) { - ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); - ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); - ill->ill_ipsec_capab_ah = NULL; - } - - if (ill->ill_ipsec_capab_esp != NULL) { - ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); - ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); - ill->ill_ipsec_capab_esp = NULL; - } - - if (ill->ill_mdt_capab != NULL) { - kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); - ill->ill_mdt_capab = NULL; - } - if (ill->ill_hcksum_capab != NULL) { kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); ill->ill_hcksum_capab = NULL; @@ -911,11 +546,10 @@ ill_delete_tail(ill_t *ill) * * We don't walk conns, mrts and ires because * - * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. + * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts. * 2) ill_down ->ill_downi walks all the ires and cleans up * ill references. */ - ASSERT(ilm_walk_ill(ill) == 0); /* * If this ill is an IPMP meta-interface, blow away the illgrp. This @@ -974,6 +608,9 @@ ill_delete_tail(ill_t *ill) ill_trace_cleanup(ill); #endif + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ipst, ill->ill_isv6); + /* Drop refcnt here */ netstack_rele(ill->ill_ipst->ips_netstack); ill->ill_ipst = NULL; @@ -1077,97 +714,6 @@ ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, } /* - * Add the 'mp' to the list of pending mp's headed by ill_pending_mp. Return - * an error if we already have 1 or more ioctls in progress. This is only - * needed for SIOCG*ARP. - */ -boolean_t -ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) -{ - ASSERT(MUTEX_HELD(&ill->ill_lock)); - ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - /* We should only see M_IOCDATA arp ioctls here. */ - ASSERT(add_mp->b_datap->db_type == M_IOCDATA); - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - /* - * Return error if the conn has started closing. 
The conn - * could have finished cleaning up the pending mp list, - * If so we should not add another mp to the list negating - * the cleanup. - */ - if (connp->conn_state_flags & CONN_CLOSING) - return (B_FALSE); - /* - * Add the pending mp to the head of the list, chained by b_next. - * Note down the conn on which the ioctl request came, in b_prev. - * This will be used to later get the conn, when we get a response - * on the ill queue, from some other module (typically arp) - */ - add_mp->b_next = (void *)ill->ill_pending_mp; - add_mp->b_queue = CONNP_TO_WQ(connp); - ill->ill_pending_mp = add_mp; - if (connp != NULL) - connp->conn_oper_pending_ill = ill; - return (B_TRUE); -} - -/* - * Retrieve the ill_pending_mp and return it. We have to walk the list - * of mblks starting at ill_pending_mp, and match based on the ioc_id. - */ -mblk_t * -ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) -{ - mblk_t *prev = NULL; - mblk_t *curr = NULL; - uint_t id; - conn_t *connp; - - /* - * When the conn closes, conn_ioctl_cleanup needs to clean - * up the pending mp, but it does not know the ioc_id and - * passes in a zero for it. - */ - mutex_enter(&ill->ill_lock); - if (ioc_id != 0) - *connpp = NULL; - - /* Search the list for the appropriate ioctl based on ioc_id */ - for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; - prev = curr, curr = curr->b_next) { - id = ((struct iocblk *)curr->b_rptr)->ioc_id; - connp = Q_TO_CONN(curr->b_queue); - /* Match based on the ioc_id or based on the conn */ - if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) - break; - } - - if (curr != NULL) { - /* Unlink the mblk from the pending mp list */ - if (prev != NULL) { - prev->b_next = curr->b_next; - } else { - ASSERT(ill->ill_pending_mp == curr); - ill->ill_pending_mp = curr->b_next; - } - - /* - * conn refcnt must have been bumped up at the start of - * the ioctl. So we can safely access the conn. 
- */ - ASSERT(CONN_Q(curr->b_queue)); - *connpp = Q_TO_CONN(curr->b_queue); - curr->b_next = NULL; - curr->b_queue = NULL; - } - - mutex_exit(&ill->ill_lock); - - return (curr); -} - -/* * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set @@ -1283,6 +829,7 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) ipxop_t *ipx; queue_t *q; ipif_t *ipif; + int cmd; ASSERT(IAM_WRITER_ILL(ill)); ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; @@ -1312,11 +859,16 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) ipx->ipx_pending_ipif = NULL; ipx->ipx_waitfor = 0; ipx->ipx_current_ipif = NULL; + cmd = ipx->ipx_current_ioctl; ipx->ipx_current_ioctl = 0; ipx->ipx_current_done = B_TRUE; mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ipsq_pending_mp_cleanup", + int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill, + ipif_t *, ipif); if (connp == NULL) { ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); } else { @@ -1337,43 +889,6 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) } /* - * The ill is closing. Cleanup all the pending mps. Called exclusively - * towards the end of ill_delete. The refcount has gone to 0. So nobody - * knows this ill, and hence nobody can add an mp to this list - */ -static void -ill_pending_mp_cleanup(ill_t *ill) -{ - mblk_t *mp; - queue_t *q; - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); - /* - * Every mp on the pending mp list originating from an ioctl - * added 1 to the conn refcnt, at the start of the ioctl. - * So bump it down now. 
See comments in ip_wput_nondata() - */ - while (ill->ill_pending_mp != NULL) { - mp = ill->ill_pending_mp; - ill->ill_pending_mp = mp->b_next; - mutex_exit(&ill->ill_lock); - - q = mp->b_queue; - ASSERT(CONN_Q(q)); - mp->b_next = NULL; - mp->b_prev = NULL; - mp->b_queue = NULL; - ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); - mutex_enter(&ill->ill_lock); - } - ill->ill_pending_ipif = NULL; - - mutex_exit(&ill->ill_lock); -} - -/* * Called in the conn close path and ill delete path */ static void @@ -1435,6 +950,9 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) curr->b_prev = NULL; curr->b_queue = NULL; if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ipsq_xopq_mp_cleanup", + int, 0, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? CONN_CLOSE : NO_COPYOUT, NULL); } else { @@ -1455,7 +973,6 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) void conn_ioctl_cleanup(conn_t *connp) { - mblk_t *curr; ipsq_t *ipsq; ill_t *ill; boolean_t refheld; @@ -1476,13 +993,6 @@ conn_ioctl_cleanup(conn_t *connp) return; } - curr = ill_pending_mp_get(ill, &connp, 0); - if (curr != NULL) { - mutex_exit(&connp->conn_lock); - CONN_DEC_REF(connp); - inet_freemsg(curr); - return; - } /* * We may not be able to refhold the ill if the ill/ipif * is changing. But we need to make sure that the ill will @@ -1522,58 +1032,43 @@ conn_ioctl_cleanup(conn_t *connp) /* * ipcl_walk function for cleaning up conn_*_ill fields. + * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and + * conn_bound_if in place. We prefer dropping + * packets instead of sending them out the wrong interface, or accepting + * packets from the wrong ifindex. 
*/ static void conn_cleanup_ill(conn_t *connp, caddr_t arg) { ill_t *ill = (ill_t *)arg; - ire_t *ire; mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill == ill) { - /* Revert to late binding */ - connp->conn_multicast_ill = NULL; - } - if (connp->conn_incoming_ill == ill) - connp->conn_incoming_ill = NULL; - if (connp->conn_outgoing_ill == ill) - connp->conn_outgoing_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); atomic_dec_32(&ill->ill_dhcpinit); - } - if (connp->conn_ire_cache != NULL) { - ire = connp->conn_ire_cache; - /* - * Source address selection makes it possible for IRE_CACHE - * entries to be created with ire_stq coming from interface X - * and ipif coming from interface Y. Thus whenever interface - * X goes down, remove all references to it by checking both - * on ire_ipif and ire_stq. - */ - if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || - (ire->ire_type == IRE_CACHE && - ire->ire_stq == ill->ill_wq)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - ire_refrele_notr(ire); - return; - } + ill_set_inputfn(ill); } mutex_exit(&connp->conn_lock); } -static void +static int ill_down_ipifs_tail(ill_t *ill) { ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + /* + * ipif_down_tail will call arp_ll_down on the last ipif + * and typically return EINPROGRESS when the DL_UNBIND is sent. 
+ */ + if ((err = ipif_down_tail(ipif)) != 0) + return (err); } + return (0); } /* ARGSUSED */ @@ -1581,7 +1076,7 @@ void ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { ASSERT(IAM_WRITER_IPSQ(ipsq)); - ill_down_ipifs_tail(q->q_ptr); + (void) ill_down_ipifs_tail(q->q_ptr); freemsg(mp); ipsq_current_finish(ipsq); } @@ -1598,12 +1093,27 @@ ill_down_start(queue_t *q, mblk_t *mp) ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) (void) ipif_down(ipif, NULL, NULL); ill_down(ill); + /* + * Walk all CONNs that can have a reference on an ire or nce for this + * ill (we actually walk all that now have stale references). + */ + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); + + /* With IPv6 we have dce_ifindex. Cleanup for neatness */ + if (ill->ill_isv6) + dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); + + (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); @@ -1626,44 +1136,68 @@ ill_down_start(queue_t *q, mblk_t *mp) static void ill_down(ill_t *ill) { + mblk_t *mp; ip_stack_t *ipst = ill->ill_ipst; - /* Blow off any IREs dependent on this ILL. */ - ire_walk(ill_downi, ill, ipst); + /* + * Blow off any IREs dependent on this ILL. + * The caller needs to handle conn_ixa_cleanup + */ + ill_delete_ires(ill); + + ire_walk_ill(0, 0, ill_downi, ill, ill); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); + + /* + * Free state for additional IREs. 
+ */ + mutex_enter(&ill->ill_saved_ire_lock); + mp = ill->ill_saved_ire_mp; + ill->ill_saved_ire_mp = NULL; + ill->ill_saved_ire_cnt = 0; + mutex_exit(&ill->ill_saved_ire_lock); + freemsg(mp); } /* - * ire_walk routine used to delete every IRE that depends on queues - * associated with 'ill'. (Always called as writer.) + * ire_walk routine used to delete every IRE that depends on + * 'ill'. (Always called as writer.) + * + * Note: since the routes added by the kernel are deleted separately, + * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. + * + * We also remove references on ire_nce_cache entries that refer to the ill. */ -static void +void ill_downi(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; + nce_t *nce; - /* - * Source address selection makes it possible for IRE_CACHE - * entries to be created with ire_stq coming from interface X - * and ipif coming from interface Y. Thus whenever interface - * X goes down, remove all references to it by checking both - * on ire_ipif and ire_stq. - */ - if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || - (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce != NULL && nce->nce_ill == ill) + ire->ire_nce_cache = NULL; + else + nce = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + if (ire->ire_ill == ill) ire_delete(ire); - } } -/* - * Remove ire/nce from the fastpath list. - */ +/* Remove IRE_IF_CLONE on this ill */ void -ill_fastpath_nack(ill_t *ill) +ill_downi_if_clone(ire_t *ire, char *ill_arg) { - nce_fastpath_list_dispatch(ill, NULL, NULL); + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(ire->ire_type & IRE_IF_CLONE); + if (ire->ire_ill == ill) + ire_delete(ire); } /* Consume an M_IOCACK of the fastpath probe. 
*/ @@ -1685,20 +1219,11 @@ ill_fastpath_ack(ill_t *ill, mblk_t *mp) freeb(mp1); if (mp == NULL) return; - if (mp->b_cont != NULL) { - /* - * Update all IRE's or NCE's that are waiting for - * fastpath update. - */ - nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp); - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - } else { + if (mp->b_cont != NULL) + nce_fastpath_update(ill, mp); + else ip0dbg(("ill_fastpath_ack: no b_cont\n")); - } - - freeb(mp); + freemsg(mp); } /* @@ -1745,6 +1270,8 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) ioc = (struct iocblk *)mp->b_rptr; ioc->ioc_count = msgdsize(mp->b_cont); + DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", + char *, "DL_IOC_HDR_INFO", ill_t *, ill); putnext(ill->ill_wq, mp); return (0); } @@ -1797,8 +1324,7 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) * direct function call capabilities viz. ILL_CAPAB_DLD* * which will be turned off by the corresponding reset functions. */ - ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | - ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); + ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); } static void @@ -1812,9 +1338,6 @@ ill_capability_reset_alloc(ill_t *ill) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(ill->ill_capab_reset_mp == NULL); - if (ILL_MDT_CAPABLE(ill)) - size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); - if (ILL_HCKSUM_CAPABLE(ill)) { size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_hcksum_t); @@ -1825,12 +1348,6 @@ ill_capability_reset_alloc(ill_t *ill) sizeof (dl_capab_zerocopy_t); } - if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { - size += sizeof (dl_capability_sub_t); - size += ill_capability_ipsec_reset_size(ill, NULL, NULL, - NULL, NULL); - } - if (ill->ill_capabilities & ILL_CAPAB_DLD) { size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1853,10 +1370,8 @@ ill_capability_reset_alloc(ill_t *ill) * Each handler fills in the corresponding dl_capability_sub_t * 
inside the mblk, */ - ill_capability_mdt_reset_fill(ill, mp); ill_capability_hcksum_reset_fill(ill, mp); ill_capability_zerocopy_reset_fill(ill, mp); - ill_capability_ipsec_reset_fill(ill, mp); ill_capability_dld_reset_fill(ill, mp); ill->ill_capab_reset_mp = mp; @@ -1906,162 +1421,7 @@ ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) } /* Process the encapsulated sub-capability */ - ill_capability_dispatch(ill, mp, inners, B_TRUE); -} - -/* - * Process Multidata Transmit capability negotiation ack received from a - * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a - * DL_CAPABILITY_ACK message. - */ -static void -ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - mblk_t *nmp = NULL; - dl_capability_req_t *oc; - dl_capab_mdt_t *mdt_ic, *mdt_oc; - ill_mdt_capab_t **ill_mdt_capab; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_MDT); - - ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. 
- * (Remember, it's coming from something else in the kernel - * address space) - */ - - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - mdt_ic = (dl_capab_mdt_t *)(isub + 1); - - if (mdt_ic->mdt_version != MDT_VERSION_2) { - cmn_err(CE_CONT, "ill_capability_mdt_ack: " - "unsupported MDT sub-capability (version %d, expected %d)", - mdt_ic->mdt_version, MDT_VERSION_2); - return; - } - - if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_mdt_ack: mid token for MDT " - "capability isn't as expected; pass-thru module(s) " - "detected, discarding capability\n")); - return; - } - - if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { - - if (*ill_mdt_capab == NULL) { - *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), - KM_NOSLEEP); - if (*ill_mdt_capab == NULL) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "could not enable MDT version %d " - "for %s (ENOMEM)\n", MDT_VERSION_2, - ill->ill_name); - return; - } - } - - ip1dbg(("ill_capability_mdt_ack: interface %s supports " - "MDT version %d (%d bytes leading, %d bytes trailing " - "header spaces, %d max pld bufs, %d span limit)\n", - ill->ill_name, MDT_VERSION_2, - mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, - mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); - - (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; - (*ill_mdt_capab)->ill_mdt_on = 1; - /* - * Round the following values to the nearest 32-bit; ULP - * may further adjust them to accomodate for additional - * protocol headers. We pass these values to ULP during - * bind time. 
- */ - (*ill_mdt_capab)->ill_mdt_hdr_head = - roundup(mdt_ic->mdt_hdr_head, 4); - (*ill_mdt_capab)->ill_mdt_hdr_tail = - roundup(mdt_ic->mdt_hdr_tail, 4); - (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; - (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; - - ill->ill_capabilities |= ILL_CAPAB_MDT; - } else { - uint_t size; - uchar_t *rptr; - - size = sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); - - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "could not enable MDT for %s (ENOMEM)\n", - ill->ill_name); - return; - } - - rptr = nmp->b_rptr; - /* initialize dl_capability_req_t */ - oc = (dl_capability_req_t *)nmp->b_rptr; - oc->dl_sub_offset = sizeof (dl_capability_req_t); - oc->dl_sub_length = sizeof (dl_capability_sub_t) + - sizeof (dl_capab_mdt_t); - nmp->b_rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, nmp->b_rptr, sizeof (*isub)); - nmp->b_rptr += sizeof (*isub); - - /* initialize dl_capab_mdt_t */ - mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; - bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); - - nmp->b_rptr = rptr; - - ip1dbg(("ill_capability_mdt_ack: asking interface %s " - "to enable MDT version %d\n", ill->ill_name, - MDT_VERSION_2)); - - /* set ENABLE flag */ - mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; - - /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ - ill_capability_send(ill, nmp); - } -} - -static void -ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) -{ - dl_capab_mdt_t *mdt_subcap; - dl_capability_sub_t *dl_subcap; - - if (!ILL_MDT_CAPABLE(ill)) - return; - - ASSERT(ill->ill_mdt_capab != NULL); - - dl_subcap = (dl_capability_sub_t *)mp->b_wptr; - dl_subcap->dl_cap = DL_CAPAB_MDT; - dl_subcap->dl_length = sizeof (*mdt_subcap); - - mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); - mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; - mdt_subcap->mdt_flags = 
0; - mdt_subcap->mdt_hdr_head = 0; - mdt_subcap->mdt_hdr_tail = 0; - - mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); + ill_capability_dispatch(ill, mp, inners); } static void @@ -2083,503 +1443,10 @@ ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); } -/* - * Allocate an IPsec capability request which will be filled by our - * caller to turn on support for one or more algorithms. - */ -/* ARGSUSED */ -static mblk_t * -ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) -{ - mblk_t *nmp; - dl_capability_req_t *ocap; - dl_capab_ipsec_t *ocip; - dl_capab_ipsec_t *icip; - uint8_t *ptr; - icip = (dl_capab_ipsec_t *)(isub + 1); - - /* - * Allocate new mblk which will contain a new capability - * request to enable the capabilities. - */ - - nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); - if (nmp == NULL) - return (NULL); - - ptr = nmp->b_rptr; - - /* initialize dl_capability_req_t */ - ocap = (dl_capability_req_t *)ptr; - ocap->dl_sub_offset = sizeof (dl_capability_req_t); - ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; - ptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, ptr, sizeof (*isub)); - ptr += sizeof (*isub); - - /* initialize dl_capab_ipsec_t */ - ocip = (dl_capab_ipsec_t *)ptr; - bcopy(icip, ocip, sizeof (*icip)); - - nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); - return (nmp); -} - -/* - * Process an IPsec capability negotiation ack received from a DLS Provider. - * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or - * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. - */ static void -ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - dl_capab_ipsec_t *icip; - dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ - dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ - uint_t cipher, nciphers; - mblk_t *nmp; - uint_t alg_len; - boolean_t need_sadb_dump; - uint_t sub_dl_cap = isub->dl_cap; - ill_ipsec_capab_t **ill_capab; - uint64_t ill_capab_flag; - uint8_t *capend, *ciphend; - boolean_t sadb_resync; - - ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || - sub_dl_cap == DL_CAPAB_IPSEC_ESP); - - if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { - ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; - ill_capab_flag = ILL_CAPAB_AH; - } else { - ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; - ill_capab_flag = ILL_CAPAB_ESP; - } - - /* - * If the ill capability structure exists, then this incoming - * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. - * If this is so, then we'd need to resynchronize the SADB - * after re-enabling the offloaded ciphers. - */ - sadb_resync = (*ill_capab != NULL); - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. - * (Remember, it's coming from something else in the kernel - * address space) - */ - - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_ipsec_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - /* - * There are two types of acks we process here: - * 1. acks in reply to a (first form) generic capability req - * (no ENABLE flag set) - * 2. acks in reply to a ENABLE capability req. 
- * (ENABLE flag set) - * - * We process the subcapability passed as argument as follows: - * 1 do initializations - * 1.1 initialize nmp = NULL - * 1.2 set need_sadb_dump to B_FALSE - * 2 for each cipher in subcapability: - * 2.1 if ENABLE flag is set: - * 2.1.1 update per-ill ipsec capabilities info - * 2.1.2 set need_sadb_dump to B_TRUE - * 2.2 if ENABLE flag is not set: - * 2.2.1 if nmp is NULL: - * 2.2.1.1 allocate and initialize nmp - * 2.2.1.2 init current pos in nmp - * 2.2.2 copy current cipher to current pos in nmp - * 2.2.3 set ENABLE flag in nmp - * 2.2.4 update current pos - * 3 if nmp is not equal to NULL, send enable request - * 3.1 send capability request - * 4 if need_sadb_dump is B_TRUE - * 4.1 enable promiscuous on/off notifications - * 4.2 call ill_dlpi_send(isub->dlcap) to send all - * AH or ESP SA's to interface. - */ - - nmp = NULL; - oalg = NULL; - need_sadb_dump = B_FALSE; - icip = (dl_capab_ipsec_t *)(isub + 1); - ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); - - nciphers = icip->cip_nciphers; - ciphend = (uint8_t *)(ialg + icip->cip_nciphers); - - if (ciphend > capend) { - cmn_err(CE_WARN, "ill_capability_ipsec_ack: " - "too many ciphers for sub-capability len"); - return; - } - - for (cipher = 0; cipher < nciphers; cipher++) { - alg_len = sizeof (dl_capab_ipsec_alg_t); - - if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { - /* - * TBD: when we provide a way to disable capabilities - * from above, need to manage the request-pending state - * and fail if we were not expecting this ACK. 
- */ - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: got ENABLE ACK\n")); - - /* - * Update IPsec capabilities for this ill - */ - - if (*ill_capab == NULL) { - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: " - "allocating ipsec_capab for ill\n")); - *ill_capab = ill_ipsec_capab_alloc(); - - if (*ill_capab == NULL) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "could not enable IPsec Hardware " - "acceleration for %s (ENOMEM)\n", - ill->ill_name); - return; - } - } - - ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || - ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); - - if (ialg->alg_prim >= MAX_IPSEC_ALGS) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "malformed IPsec algorithm id %d", - ialg->alg_prim); - continue; - } - - if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { - IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, - ialg->alg_prim); - } else { - ipsec_capab_algparm_t *alp; - - IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, - ialg->alg_prim); - if (!ill_ipsec_capab_resize_algparm(*ill_capab, - ialg->alg_prim)) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "no space for IPsec alg id %d", - ialg->alg_prim); - continue; - } - alp = &((*ill_capab)->encr_algparm[ - ialg->alg_prim]); - alp->minkeylen = ialg->alg_minbits; - alp->maxkeylen = ialg->alg_maxbits; - } - ill->ill_capabilities |= ill_capab_flag; - /* - * indicate that a capability was enabled, which - * will be used below to kick off a SADB dump - * to the ill. - */ - need_sadb_dump = B_TRUE; - } else { - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: enabling alg 0x%x\n", - ialg->alg_prim)); - - if (nmp == NULL) { - nmp = ill_alloc_ipsec_cap_req(ill, isub); - if (nmp == NULL) { - /* - * Sending the PROMISC_ON/OFF - * notification request failed. - * We cannot enable the algorithms - * since the Provider will not - * notify IP of promiscous mode - * changes, which could lead - * to leakage of packets. 
- */ - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "could not enable IPsec Hardware " - "acceleration for %s (ENOMEM)\n", - ill->ill_name); - return; - } - /* ptr to current output alg specifier */ - oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; - } - - /* - * Copy current alg specifier, set ENABLE - * flag, and advance to next output alg. - * For now we enable all IPsec capabilities. - */ - ASSERT(oalg != NULL); - bcopy(ialg, oalg, alg_len); - oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; - nmp->b_wptr += alg_len; - oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; - } - - /* move to next input algorithm specifier */ - ialg = (dl_capab_ipsec_alg_t *) - ((char *)ialg + alg_len); - } - - if (nmp != NULL) - /* - * nmp points to a DL_CAPABILITY_REQ message to enable - * IPsec hardware acceleration. - */ - ill_capability_send(ill, nmp); - - if (need_sadb_dump) - /* - * An acknowledgement corresponding to a request to - * enable acceleration was received, notify SADB. - */ - ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); -} - -/* - * Given an mblk with enough space in it, create sub-capability entries for - * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised - * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, - * in preparation for the reset the DL_CAPABILITY_REQ message. 
- */ -static void -ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, - ill_ipsec_capab_t *ill_cap, mblk_t *mp) -{ - dl_capab_ipsec_t *oipsec; - dl_capab_ipsec_alg_t *oalg; - dl_capability_sub_t *dl_subcap; - int i, k; - - ASSERT(nciphers > 0); - ASSERT(ill_cap != NULL); - ASSERT(mp != NULL); - ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); - - /* dl_capability_sub_t for "stype" */ - dl_subcap = (dl_capability_sub_t *)mp->b_wptr; - dl_subcap->dl_cap = stype; - dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; - mp->b_wptr += sizeof (dl_capability_sub_t); - - /* dl_capab_ipsec_t for "stype" */ - oipsec = (dl_capab_ipsec_t *)mp->b_wptr; - oipsec->cip_version = 1; - oipsec->cip_nciphers = nciphers; - mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; - - /* create entries for "stype" AUTH ciphers */ - for (i = 0; i < ill_cap->algs_size; i++) { - for (k = 0; k < BITSPERBYTE; k++) { - if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) - continue; - - oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; - bzero((void *)oalg, sizeof (*oalg)); - oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; - oalg->alg_prim = k + (BITSPERBYTE * i); - mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); - } - } - /* create entries for "stype" ENCR ciphers */ - for (i = 0; i < ill_cap->algs_size; i++) { - for (k = 0; k < BITSPERBYTE; k++) { - if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) - continue; - - oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; - bzero((void *)oalg, sizeof (*oalg)); - oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; - oalg->alg_prim = k + (BITSPERBYTE * i); - mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); - } - } -} - -/* - * Macro to count number of 1s in a byte (8-bit word). The total count is - * accumulated into the passed-in argument (sum). We could use SPARCv9's - * POPC instruction, but our macro is more flexible for an arbitrary length - * of bytes, such as {auth,encr}_hw_algs. 
These variables are currently - * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length - * stays that way, we can reduce the number of iterations required. - */ -#define COUNT_1S(val, sum) { \ - uint8_t x = val & 0xff; \ - x = (x & 0x55) + ((x >> 1) & 0x55); \ - x = (x & 0x33) + ((x >> 2) & 0x33); \ - sum += (x & 0xf) + ((x >> 4) & 0xf); \ -} - -/* ARGSUSED */ -static int -ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, - int *esp_cntp, int *esp_lenp) -{ - ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; - ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; - uint64_t ill_capabilities = ill->ill_capabilities; - int ah_cnt = 0, esp_cnt = 0; - int ah_len = 0, esp_len = 0; - int i, size = 0; - - if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) - return (0); - - ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); - ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); - - /* Find out the number of ciphers for AH */ - if (cap_ah != NULL) { - for (i = 0; i < cap_ah->algs_size; i++) { - COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); - COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); - } - if (ah_cnt > 0) { - size += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_ipsec_t); - /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ - ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); - size += ah_len; - } - } - - /* Find out the number of ciphers for ESP */ - if (cap_esp != NULL) { - for (i = 0; i < cap_esp->algs_size; i++) { - COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); - COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); - } - if (esp_cnt > 0) { - size += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_ipsec_t); - /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ - esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); - size += esp_len; - } - } - - if (ah_cntp != NULL) - *ah_cntp = ah_cnt; - if (ah_lenp != NULL) - *ah_lenp = ah_len; - if (esp_cntp != NULL) - *esp_cntp = esp_cnt; - if 
(esp_lenp != NULL) - *esp_lenp = esp_len; - - return (size); -} - -/* ARGSUSED */ -static void -ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) +ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) { - ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; - ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; - int ah_cnt = 0, esp_cnt = 0; - int ah_len = 0, esp_len = 0; - int size; - - size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, - &esp_cnt, &esp_len); - if (size == 0) - return; - - /* - * Clear the capability flags for IPsec HA but retain the ill - * capability structures since it's possible that another thread - * is still referring to them. The structures only get deallocated - * when we destroy the ill. - * - * Various places check the flags to see if the ill is capable of - * hardware acceleration, and by clearing them we ensure that new - * outbound IPsec packets are sent down encrypted. - */ - - /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ - if (ah_cnt > 0) { - ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, - cap_ah, mp); - } - - /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ - if (esp_cnt > 0) { - ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, - cap_esp, mp); - } - - /* - * At this point we've composed a bunch of sub-capabilities to be - * encapsulated in a DL_CAPABILITY_REQ and later sent downstream - * by the caller. Upon receiving this reset message, the driver - * must stop inbound decryption (by destroying all inbound SAs) - * and let the corresponding packets come in encrypted. 
- */ -} - -static void -ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, - boolean_t encapsulated) -{ - boolean_t legacy = B_FALSE; - - /* - * Note that only the following two sub-capabilities may be - * considered as "legacy", since their original definitions - * do not incorporate the dl_mid_t module ID token, and hence - * may require the use of the wrapper sub-capability. - */ switch (subp->dl_cap) { - case DL_CAPAB_IPSEC_AH: - case DL_CAPAB_IPSEC_ESP: - legacy = B_TRUE; - break; - } - - /* - * For legacy sub-capabilities which don't incorporate a queue_t - * pointer in their structures, discard them if we detect that - * there are intermediate modules in between IP and the driver. - */ - if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { - ip1dbg(("ill_capability_dispatch: unencapsulated capab type " - "%d discarded; %d module(s) present below IP\n", - subp->dl_cap, ill->ill_lmod_cnt)); - return; - } - - switch (subp->dl_cap) { - case DL_CAPAB_IPSEC_AH: - case DL_CAPAB_IPSEC_ESP: - ill_capability_ipsec_ack(ill, mp, subp); - break; - case DL_CAPAB_MDT: - ill_capability_mdt_ack(ill, mp, subp); - break; case DL_CAPAB_HCKSUM: ill_capability_hcksum_ack(ill, mp, subp); break; @@ -3104,7 +1971,7 @@ ill_capability_lso_enable(ill_t *ill) DLD_ENABLE)) == 0) { ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; ill->ill_lso_capab->ill_lso_max = lso.lso_max; - ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; + ill->ill_capabilities |= ILL_CAPAB_LSO; ip1dbg(("ill_capability_lso_enable: interface %s " "has enabled LSO\n ", ill->ill_name)); } else { @@ -3180,7 +2047,7 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } - if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { + if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { ASSERT(ill->ill_lso_capab != NULL); /* * Clear the capability flag for LSO but retain the @@ -3189,7 +2056,7 @@ ill_capability_dld_disable(ill_t *ill) * deallocated when we destroy the ill. 
*/ - ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; + ill->ill_capabilities &= ~ILL_CAPAB_LSO; (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, NULL, DLD_DISABLE); } @@ -3335,7 +2202,7 @@ ill_capability_ack_thr(void *arg) ill_capability_id_ack(ill, mp, subp); break; default: - ill_capability_dispatch(ill, mp, subp, B_FALSE); + ill_capability_dispatch(ill, mp, subp); break; } } @@ -3410,8 +2277,14 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) uint32_t hdr_length; mblk_t *send_icmp_head; mblk_t *send_icmp_head_v6; - zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; + ip_recv_attr_t iras; + + bzero(&iras, sizeof (iras)); + iras.ira_flags = 0; + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; ipfb = ill->ill_frag_hash_tbl; if (ipfb == NULL) @@ -3483,6 +2356,7 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) } } BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); + ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); freeb(ipf->ipf_mp); } mutex_exit(&ipfb->ipfb_lock); @@ -3496,19 +2370,21 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) mp = send_icmp_head_v6; send_icmp_head_v6 = send_icmp_head_v6->b_next; mp->b_next = NULL; - if (mp->b_datap->db_type == M_CTL) - ip6h = (ip6_t *)mp->b_cont->b_rptr; - else - ip6h = (ip6_t *)mp->b_rptr; - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, + ip6h = (ip6_t *)mp->b_rptr; + iras.ira_flags = 0; + /* + * This will result in an incorrect ALL_ZONES zoneid + * for multicast packets, but we + * don't send ICMP errors for those in any case. 
+ */ + iras.ira_zoneid = + ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - } else { - icmp_time_exceeded_v6(ill->ill_wq, mp, - ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, - B_FALSE, zoneid, ipst); - } + ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); + icmp_time_exceeded_v6(mp, + ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, + &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } while (send_icmp_head != NULL) { ipaddr_t dst; @@ -3517,19 +2393,20 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) send_icmp_head = send_icmp_head->b_next; mp->b_next = NULL; - if (mp->b_datap->db_type == M_CTL) - dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; - else - dst = ((ipha_t *)mp->b_rptr)->ipha_dst; + dst = ((ipha_t *)mp->b_rptr)->ipha_dst; - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - } else { - icmp_time_exceeded(ill->ill_wq, mp, - ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, - ipst); - } + iras.ira_flags = IRAF_IS_IPV4; + /* + * This will result in an incorrect ALL_ZONES zoneid + * for broadcast and multicast packets, but we + * don't send ICMP errors for those in any case. 
+ */ + iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, + ill, ipst); + ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); + icmp_time_exceeded(mp, + ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } } /* @@ -3647,8 +2524,9 @@ ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) ipfb->ipfb_count -= count; ASSERT(ipfb->ipfb_frag_pkts > 0); ipfb->ipfb_frag_pkts--; - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); + ip_drop_input("ipIfStatsReasmFails", mp, ill); + freemsg(mp); } if (ipf) @@ -3776,6 +2654,7 @@ static void ill_set_nce_router_flags(ill_t *ill, boolean_t enable) { ipif_t *ipif; + ncec_t *ncec; nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -3784,16 +2663,16 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) * addresses on IPMP interfaces have an nce_ill that points to * the bound underlying ill. */ - nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); + nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); if (nce != NULL) { - mutex_enter(&nce->nce_lock); + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); if (enable) - nce->nce_flags |= NCE_F_ISROUTER; + ncec->ncec_flags |= NCE_F_ISROUTER; else - nce->nce_flags &= ~NCE_F_ISROUTER; - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + ncec->ncec_flags &= ~NCE_F_ISROUTER; + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); } } } @@ -3986,8 +2865,7 @@ ill_get_ppa_ptr(char *name) * use avl tree to locate the ill. 
*/ static ill_t * -ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) +ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) { char *ppa_ptr = NULL; int len; @@ -3995,10 +2873,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, ill_t *ill = NULL; ill_if_t *ifp; int list; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * get ppa ptr @@ -4009,8 +2883,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, list = IP_V4_G_HEAD; if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -4038,42 +2910,19 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, /* * Even the interface type does not exist. */ - if (error != NULL) - *error = ENXIO; return (NULL); } ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); if (ill != NULL) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); if (ILL_CAN_LOOKUP(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); return (ill); - } else if (ILL_CAN_WAIT(ill, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -4474,6 +3323,8 @@ ill_init(queue_t *q, ill_t *ill) * ip_open(), before we reach here. 
*/ mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); + mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); + ill->ill_saved_ire_cnt = 0; ill->ill_rq = q; ill->ill_wq = WR(q); @@ -4521,7 +3372,9 @@ ill_init(queue_t *q, ill_t *ill) */ ill->ill_phyint->phyint_illv4 = ill; ill->ill_ppa = UINT_MAX; - ill->ill_fastpath_list = &ill->ill_fastpath_list; + list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); + + ill_set_inputfn(ill); if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); @@ -4536,6 +3389,8 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_frag_count = 0; ill->ill_ipf_gen = 0; + rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); ill->ill_global_timer = INFINITY; ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; @@ -4550,7 +3405,6 @@ ill_init(queue_t *q, ill_t *ill) * IPv6. */ ill->ill_reachable_time = ND_REACHABLE_TIME; - ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; ill->ill_max_buf = ND_MAX_Q; ill->ill_refcnt = 0; @@ -4574,15 +3428,14 @@ ill_init(queue_t *q, ill_t *ill) * creates datalink socket info from the device. 
*/ int -ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) +ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) { size_t len; - ill_t *ill = ipif->ipif_ill; sdl->sdl_family = AF_LINK; - sdl->sdl_index = ill->ill_phyint->phyint_ifindex; + sdl->sdl_index = ill_get_upper_ifindex(ill); sdl->sdl_type = ill->ill_type; - ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); + ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); len = strlen(sdl->sdl_data); ASSERT(len < 256); sdl->sdl_nlen = (uchar_t)len; @@ -4604,7 +3457,7 @@ ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) sdl->sdl_family = AF_LINK; sdl->sdl_index = ill->ill_phyint->phyint_ifindex; sdl->sdl_type = ill->ill_type; - ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); + ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); sdl->sdl_alen = ill->ill_phys_addr_length; sdl->sdl_slen = 0; @@ -4646,7 +3499,7 @@ loopback_kstat_update(kstat_t *ksp, int rw) /* * Has ifindex been plumbed already? 
*/ -boolean_t +static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { ASSERT(index != 0); @@ -4749,8 +3602,7 @@ phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) */ ill_t * ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, - ip_stack_t *ipst) + boolean_t *did_alloc, ip_stack_t *ipst) { ill_t *ill; ipif_t *ipif; @@ -4762,9 +3614,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, isloopback = mi_strcmp(name, ipif_loopback_name) == 0; rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); + ill = ill_find_by_name(name, isv6, ipst); rw_exit(&ipst->ips_ill_g_lock); - if (ill != NULL || (error != NULL && *error == EINPROGRESS)) + if (ill != NULL) return (ill); /* @@ -4775,9 +3627,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, return (NULL); rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); - if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { + ill = ill_find_by_name(name, isv6, ipst); + if (ill != NULL) { rw_exit(&ipst->ips_ill_g_lock); return (ill); } @@ -4791,6 +3642,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, *ill = ill_null; mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); ill->ill_ipst = ipst; + list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); netstack_hold(ipst->ips_netstack); /* * For exclusive stacks we set the zoneid to zero @@ -4809,17 +3661,16 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); phyint_flags_init(ill->ill_phyint, DL_LOOP); - ill->ill_max_frag = IP_LOOPBACK_MTU; - /* Add room for tcp+ip headers */ if (isv6) { ill->ill_isv6 = B_TRUE; - ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ + ill->ill_max_frag = 
ip_loopback_mtu_v6plus; } else { - ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; + ill->ill_max_frag = ip_loopback_mtuplus; } if (!ill_allocate_mibs(ill)) goto done; - ill->ill_max_mtu = ill->ill_max_frag; + ill->ill_current_frag = ill->ill_max_frag; + ill->ill_mtu = ill->ill_max_frag; /* Initial value */ /* * ipif_loopback_name can't be pointed at directly because its used * by both the ipv4 and ipv6 interfaces. When the ill is removed @@ -4832,6 +3683,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ ill->ill_dlpi_pending = DL_PRIM_INVAL; + rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); ill->ill_global_timer = INFINITY; ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; @@ -4857,14 +3710,12 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); ill->ill_flags |= ILLF_IPV4; } else { ipif->ipif_v6lcl_addr = ipv6_loopback; - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; ipif->ipif_v6net_mask = ipv6_all_ones; V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); @@ -4884,6 +3735,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ipsq = ill->ill_phyint->phyint_ipsq; + ill_set_inputfn(ill); + if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4924,8 +3777,6 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, } } - if (error != NULL) - *error = 0; *did_alloc = B_TRUE; rw_exit(&ipst->ips_ill_g_lock); 
ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), @@ -4947,8 +3798,6 @@ done: mi_free(ill); } rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENOMEM; return (NULL); } @@ -4956,8 +3805,7 @@ done: * For IPP calls - use the ip_stack_t for global stack. */ ill_t * -ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) +ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) { ip_stack_t *ipst; ill_t *ill; @@ -4968,7 +3816,7 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, return (NULL); } - ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); + ill = ill_lookup_on_ifindex(index, isv6, ipst); netstack_rele(ipst->ips_netstack); return (ill); } @@ -4977,19 +3825,11 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, * Return a pointer to the ill which matches the index and IP version type. */ ill_t * -ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *err, ip_stack_t *ipst) +ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - ipsq_t *ipsq; phyint_t *phyi; - ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || - (q != NULL && mp != NULL && func != NULL && err != NULL)); - - if (err != NULL) - *err = 0; - /* * Indexes are stored in the phyint - a common structure * to both IPv4 and IPv6. @@ -5000,43 +3840,45 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, if (phyi != NULL) { ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; if (ill != NULL) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { + if (!ILL_IS_CONDEMNED(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ill); - } else if (ILL_CAN_WAIT(ill, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - rw_exit(&ipst->ips_ill_g_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (err != NULL) - *err = EINPROGRESS; - return (NULL); } - RELEASE_CONN_LOCK(q); mutex_exit(&ill->ill_lock); } } rw_exit(&ipst->ips_ill_g_lock); - if (err != NULL) - *err = ENXIO; return (NULL); } /* + * Verify whether or not an interface index is valid. + * It can be zero (meaning "reset") or an interface index assigned + * to a non-VNI interface. (We don't use VNI interface to send packets.) + */ +boolean_t +ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) +{ + ill_t *ill; + + if (ifindex == 0) + return (B_TRUE); + + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) + return (B_FALSE); + if (IS_VNI(ill)) { + ill_refrele(ill); + return (B_FALSE); + } + ill_refrele(ill); + return (B_TRUE); +} + +/* * Return the ifindex next in sequence after the passed in ifindex. * If there is no next ifindex for the given protocol, return 0. */ @@ -5118,6 +3960,20 @@ ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) } /* + * Return the ifindex to be used by upper layer protocols for instance + * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill. 
+ */ +uint_t +ill_get_upper_ifindex(const ill_t *ill) +{ + if (IS_UNDER_IPMP(ill)) + return (ipmp_ill_get_ipmp_ifindex(ill)); + else + return (ill->ill_phyint->phyint_ifindex); +} + + +/* * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt * that gives a running thread a reference to the ill. This reference must be * released by the thread when it is done accessing the ill and related @@ -5145,17 +4001,18 @@ ill_refhold_locked(ill_t *ill) ILL_TRACE_REF(ill); } -int +/* Returns true if we managed to get a refhold */ +boolean_t ill_check_and_refhold(ill_t *ill) { mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { + if (!ILL_IS_CONDEMNED(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - return (0); + return (B_TRUE); } mutex_exit(&ill->ill_lock); - return (ILL_LOOKUP_FAILED); + return (B_FALSE); } /* @@ -5234,8 +4091,8 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ASSERT(IAM_WRITER_ILL(ill)); /* - * Till the ill is fully up ILL_CHANGING will be set and - * the ill is not globally visible. So no need for a lock. + * Till the ill is fully up the ill is not globally visible. + * So no need for a lock. */ dlia = (dl_info_ack_t *)mp->b_rptr; ill->ill_mactype = dlia->dl_mac_type; @@ -5279,8 +4136,9 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * IP will fly apart otherwise. */ min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; - ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); - ill->ill_max_mtu = ill->ill_max_frag; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); + ill->ill_current_frag = ill->ill_max_frag; + ill->ill_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5320,14 +4178,6 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) */ ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; /* - * Set ipif_mtu which is used to set the IRE's - * ire_max_frag value. The driver could have sent - * a different mtu from what it sent last time. 
No - * need to call ipif_mtu_change because IREs have - * not yet been created. - */ - ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; - /* * Clear all the flags that were set based on ill_bcast_addr_length * and ill_phys_addr_length (in ipif_set_values) as these could have * changed now and we need to re-evaluate. @@ -5336,8 +4186,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); /* - * Free ill_resolver_mp and ill_bcast_mp as things could have - * changed now. + * Free ill_bcast_mp as things could have changed now. * * NOTE: The IPMP meta-interface is special-cased because it starts * with no underlying interfaces (and thus an unknown broadcast @@ -5345,19 +4194,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * capable as part of allowing it to join a group. */ if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { - if (ill->ill_resolver_mp != NULL) - freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) freemsg(ill->ill_bcast_mp); - if (ill->ill_flags & ILLF_XRESOLV) - ill->ill_net_type = IRE_IF_RESOLVER; - else - ill->ill_net_type = IRE_IF_NORESOLVER; - ill->ill_resolver_mp = ill_dlur_gen(NULL, + ill->ill_net_type = IRE_IF_NORESOLVER; + + ill->ill_bcast_mp = ill_dlur_gen(NULL, ill->ill_phys_addr_length, ill->ill_sap, ill->ill_sap_length); - ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); if (ill->ill_isv6) /* @@ -5520,7 +4364,7 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) * 3b. link local, but deprecated * 4. loopback. 
*/ -ipif_t * +static ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) { ill_t *ill; @@ -5537,7 +4381,8 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || + ILL_IS_CONDEMNED(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5550,7 +4395,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) continue; } if (!(ipif->ipif_flags & IPIF_UP) || - !IPIF_CAN_LOOKUP(ipif)) { + IPIF_IS_CONDEMNED(ipif)) { continue; } @@ -5618,6 +4463,22 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) } } +ill_t * +ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) +{ + ipif_t *ipif; + ill_t *ill; + + ipif = ipif_lookup_multicast(ipst, zoneid, isv6); + if (ipif == NULL) + return (NULL); + + ill = ipif->ipif_ill; + ill_refhold(ill); + ipif_refrele(ipif); + return (ill); +} + /* * This function is called when an application does not specify an interface * to be used for multicast traffic (joining a group/sending data). It @@ -5629,22 +4490,21 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) * anything in between. If there is no such multicast route, we just find * any multicast capable interface and return it. The returned ipif * is refhold'ed. + * + * We support MULTIRT and RTF_SETSRC on the multicast routes added to the + * unicast table. This is used by CGTP. 
*/ -ipif_t * -ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, ipaddr_t *setsrcp) { - ire_t *ire; - ipif_t *ipif; + ill_t *ill; - ire = ire_lookup_multi(group, zoneid, ipst); - if (ire != NULL) { - ipif = ire->ire_ipif; - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } + ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); + if (ill != NULL) + return (ill); - return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); + return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); } /* @@ -5652,16 +4512,11 @@ ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) * The destination address is used only for matching point-to-point interfaces. */ ipif_t * -ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) +ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; ill_walk_context_t ctx; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * First match all the point-to-point interfaces @@ -5672,7 +4527,6 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -5680,41 +4534,20 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, if ((ipif->ipif_flags & IPIF_POINTOPOINT) && (ipif->ipif_lcl_addr == if_addr) && (ipif->ipif_pp_dst_addr == dst)) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); 
rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } rw_exit(&ipst->ips_ill_g_lock); /* lookup the ipif based on interface address */ - ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, - ipst); + ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); ASSERT(ipif == NULL || !ipif->ipif_isv6); return (ipif); } @@ -5723,18 +4556,15 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). */ static ipif_t * -ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, - zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, - ip_stack_t *ipst) +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, + zoneid_t zoneid, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; boolean_t ptp = B_FALSE; - ipsq_t *ipsq; ill_walk_context_t ctx; - - if (error != NULL) - *error = 0; + boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); + boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* @@ -5748,7 +4578,6 @@ repeat: (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -5756,47 +4585,29 @@ repeat: zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + + if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) + continue; + /* Allow the 
ipif to be down */ if ((!ptp && (ipif->ipif_lcl_addr == addr) && ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && (ipif->ipif_pp_dst_addr == addr))) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } /* If we already did the ptp case, then we are done */ if (ptp) { rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENXIO; return (NULL); } ptp = B_TRUE; @@ -5804,55 +4615,6 @@ repeat: } /* - * Check if the address exists in the system. - * We don't hold the conn_lock as we will not perform defered ipsqueue - * operation. - */ -boolean_t -ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) -{ - ipif_t *ipif; - ill_t *ill; - ill_walk_context_t ctx; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - - ill = ILL_START_WALK_V4(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (zoneid != ALL_ZONES && - zoneid != ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* Allow the ipif to be down */ - /* - * XXX Different from ipif_lookup_addr(), we don't do - * twice lookups. As from bind()'s point of view, we - * may return once we find a match. 
- */ - if (((ipif->ipif_lcl_addr == addr) && - ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || - ((ipif->ipif_flags & IPIF_POINTOPOINT) && - (ipif->ipif_pp_dst_addr == addr))) { - /* - * Allow bind() to be successful even if the - * ipif is with IPIF_CHANGING bit set. - */ - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (B_TRUE); - } - } - mutex_exit(&ill->ill_lock); - } - - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); -} - -/* * Lookup an ipif with the specified address. For point-to-point links we * look for matches on either the destination address or the local address, * but we skip the local address check if IPIF_UNNUMBERED is set. If the @@ -5860,11 +4622,25 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) * (or illgrp if `match_ill' is in an IPMP group). */ ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, + ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, + zoneid, ipst)); +} + +/* + * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, + * except that we will only return an address if it is not marked as + * IPIF_DUPLICATE + */ +ipif_t * +ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, + ip_stack_t *ipst) { - return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, - func, error, ipst)); + return (ipif_lookup_addr_common(addr, match_ill, + (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), + zoneid, ipst)); } /* @@ -5872,12 +4648,12 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, * `match_ill' across the IPMP group. This function is only needed in some * corner-cases; almost everything should use ipif_lookup_addr(). 
*/ -static ipif_t * +ipif_t * ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) { ASSERT(match_ill != NULL); - return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst)); + return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, + ipst)); } /* @@ -5951,13 +4727,13 @@ repeat: * IRE lookup and pick the first ipif corresponding to the source address in the * ire. * Returns: held ipif + * + * This is only used for ICMP_ADDRESS_MASK_REQUESTs */ ipif_t * ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) { ipif_t *ipif; - ire_t *ire; - ip_stack_t *ipst = ill->ill_ipst; ASSERT(!ill->ill_isv6); @@ -5970,7 +4746,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) */ mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) @@ -5991,24 +4767,11 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) } } mutex_exit(&ill->ill_lock); - ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, - NULL, MATCH_IRE_RECURSIVE, ipst); - if (ire != NULL) { - /* - * The callers of this function wants to know the - * interface on which they have to send the replies - * back. For IREs that have ire_stq and ire_ipif - * derived from different ills, we really don't care - * what we return here. - */ - ipif = ire->ire_ipif; - if (ipif != NULL) { - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } - ire_refrele(ire); - } + /* + * For a remote destination it isn't possible to nail down a particular + * ipif. 
+ */ + /* Pick the first interface */ ipif = ipif_get_next_ipif(NULL, ill); return (ipif); @@ -6027,9 +4790,8 @@ ill_is_quiescent(ill_t *ill) ASSERT(MUTEX_HELD(&ill->ill_lock)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { + if (ipif->ipif_refcnt != 0) return (B_FALSE); - } } if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { return (B_FALSE); @@ -6045,7 +4807,7 @@ ill_is_freeable(ill_t *ill) ASSERT(MUTEX_HELD(&ill->ill_lock)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) { + if (ipif->ipif_refcnt != 0) { return (B_FALSE); } } @@ -6067,9 +4829,8 @@ ipif_is_quiescent(ipif_t *ipif) ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { + if (ipif->ipif_refcnt != 0) return (B_FALSE); - } ill = ipif->ipif_ill; if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || @@ -6078,7 +4839,7 @@ ipif_is_quiescent(ipif_t *ipif) } /* This is the last ipif going down or being deleted on this ill */ - if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { + if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { return (B_FALSE); } @@ -6087,14 +4848,14 @@ ipif_is_quiescent(ipif_t *ipif) /* * return true if the ipif can be destroyed: the ipif has to be quiescent - * with zero references from ire/nce/ilm to it. + * with zero references from ire/ilm to it. */ static boolean_t ipif_is_freeable(ipif_t *ipif) { ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT(ipif->ipif_id != 0); - return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif)); + return (ipif->ipif_refcnt == 0); } /* @@ -6275,7 +5036,7 @@ th_trace_gethash(ip_stack_t *ipst) * block. 
*/ objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), - MAX(sizeof (ire_t), sizeof (nce_t))); + MAX(sizeof (ire_t), sizeof (ncec_t))); rshift = highbit(objsize); mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, th_trace_free, mod_hash_byptr, (void *)rshift, @@ -6509,7 +5270,7 @@ ipif_get_next_ipif(ipif_t *curr, ill_t *ill) mutex_enter(&ill->ill_lock); for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); @@ -6535,28 +5296,53 @@ ip_m_lookup(t_uscalar_t mac_type) } /* + * Make a link layer address from the multicast IP address *addr. + * To form the link layer address, invoke the ip_m_v*mapping function + * associated with the link-layer type. + */ +void +ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) +{ + ip_m_t *ipm; + + if (ill->ill_net_type == IRE_IF_NORESOLVER) + return; + + ASSERT(addr != NULL); + + ipm = ip_m_lookup(ill->ill_mactype); + if (ipm == NULL || + (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || + (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { + ip0dbg(("no mapping for ill %s mactype 0x%x\n", + ill->ill_name, ill->ill_mactype)); + return; + } + if (ill->ill_isv6) + (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); + else + (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); +} + +/* * ip_rt_add is called to add an IPv4 route to the forwarding table. - * ipif_arg is passed in to associate it with the correct interface. - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is specified by 'func' + * ill is passed in to associate it with the correct interface. + * If ire_arg is set, then we return the held IRE in that location. 
*/ int ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, - ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, - boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, - struct rtsa_s *sp, ip_stack_t *ipst) + ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, + boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) { - ire_t *ire; + ire_t *ire, *nire; ire_t *gw_ire = NULL; ipif_t *ipif = NULL; - boolean_t ipif_refheld = B_FALSE; uint_t type; int match_flags = MATCH_IRE_TYPE; - int error; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; + boolean_t cgtp_broadcast; ip1dbg(("ip_rt_add:")); @@ -6579,27 +5365,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, return (ENETUNREACH); /* * Get the ipif, if any, corresponding to the gw_addr + * If -ifp was specified we restrict ourselves to the ill, otherwise + * we match on the gatway and destination to handle unnumbered pt-pt + * interfaces. 
*/ - ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, - ipst); + if (ill != NULL) + ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); + else + ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); if (ipif != NULL) { if (IS_VNI(ipif->ipif_ill)) { ipif_refrele(ipif); return (EINVAL); } - ipif_refheld = B_TRUE; - } else if (error == EINPROGRESS) { - ip1dbg(("ip_rt_add: null and EINPROGRESS")); - return (EINPROGRESS); - } else { - error = 0; - } - - if (ipif != NULL) { - ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - } else { - ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); } /* @@ -6612,12 +5390,12 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, flags &= ~RTF_GATEWAY; if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && mask == IP_HOST_MASK) { - ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, - ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, + NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" @@ -6627,40 +5405,58 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ire = ire_create( (uchar_t *)&dst_addr, /* dest address */ (uchar_t *)&mask, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, NULL, /* no gateway */ - &ipif->ipif_mtu, - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOOPBACK */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? - RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + zoneid, + (ipif->ipif_flags & IPIF_PRIVATE) ? 
RTF_PRIVATE : 0, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; - if (ipif_refheld) - ipif_refrele(ipif); - return (error); + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + ipif_refrele(ipif); + return (EEXIST); + } + ire = nire; + goto save_ire; } } /* + * The routes for multicast with CGTP are quite special in that + * the gateway is the local interface address, yet RTF_GATEWAY + * is set. We turn off RTF_GATEWAY to provide compatibility with + * this undocumented and unusual use of multicast routes. + */ + if ((flags & RTF_MULTIRT) && ipif != NULL) + flags &= ~RTF_GATEWAY; + + /* * Traditionally, interface routes are ones where RTF_GATEWAY isn't set * and the gateway address provided is one of the system's interface * addresses. By using the routing socket interface and supplying an @@ -6694,8 +5490,8 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * logical interfaces * * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 - * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 - * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 + * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 + * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 * * the ipif's corresponding to each of these interface routes can be * uniquely identified by the "gateway" (actually interface address). 
@@ -6710,47 +5506,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, /* RTF_GATEWAY not set */ if (!(flags & RTF_GATEWAY)) { - queue_t *stq; - if (sp != NULL) { ip2dbg(("ip_rt_add: gateway security attributes " "cannot be set with interface route\n")); - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (EINVAL); } /* - * As the interface index specified with the RTA_IFP sockaddr is - * the same for all ipif's off of an ill, the matching logic - * below uses MATCH_IRE_ILL if such an index was specified. - * This means that routes sharing the same prefix when added - * using a RTA_IFP sockaddr must have distinct interface - * indices (namely, they must be on distinct ill's). - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix - * can be created if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * Whether or not ill (RTA_IFP) is set, we require that + * the gateway is one of our local addresses. */ - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - /* - * Check the ipif corresponding to the gw_addr - */ - if (ipif == NULL) - return (ENETUNREACH); - match_flags |= MATCH_IRE_IPIF; + if (ipif == NULL) + return (ENETUNREACH); + + /* + * We use MATCH_IRE_ILL here. If the caller specified an + * interface (from the RTA_IFP sockaddr) we use it, otherwise + * we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. + * We don't allow RTA_IFP to specify a different ill than the + * one matching the ipif to make sure we can delete the route. 
+ */ + match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; + if (ill == NULL) { + ill = ipif->ipif_ill; + } else if (ill != ipif->ipif_ill) { + ipif_refrele(ipif); + return (EINVAL); } - ASSERT(ipif != NULL); /* * We check for an existing entry at this point. @@ -6761,45 +5547,32 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, */ if (!ioctl_msg) match_flags |= MATCH_IRE_MASK; - ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - /* - * Create a copy of the IRE_LOOPBACK, - * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with - * the modified address and netmask. + * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or + * IRE_IF_RESOLVER with the modified address, netmask, and + * gateway. */ ire = ire_create( (uchar_t *)&dst_addr, (uint8_t *)&mask, - (uint8_t *)&ipif->ipif_src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - stq, - ipif->ipif_net_type, - ipif, - 0, - 0, - 0, + (uint8_t *)&gw_addr, + ill->ill_net_type, + ill, + zoneid, flags, - &ire_uinfo_null, - NULL, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } @@ -6810,7 +5583,7 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * set up prefixes with the RTF_REJECT flag set (for example, * when generating aggregate routes.) * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as * these interface routes, by definition, can only be that. 
@@ -6819,27 +5592,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * routine, but rather using ire_create() directly. * */ - if (ipif->ipif_net_type == IRE_LOOPBACK) { + if (ill->ill_net_type == IRE_LOOPBACK) { ire->ire_type = IRE_IF_NORESOLVER; ire->ire_flags |= RTF_BLACKHOLE; } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); - } - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; + return (EEXIST); + } + ire = nire; + goto save_ire; } /* @@ -6847,13 +5630,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the * gateway, it is currently unreachable and we fail the request * accordingly. + * If RTA_IFP was specified we look on that particular ill. */ - ipif = ipif_arg; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; + + /* Check whether the gateway is reachable. 
*/ again: - gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, - ALL_ZONES, 0, NULL, match_flags, ipst); + type = IRE_INTERFACE; + if (flags & RTF_INDIRECT) + type |= IRE_OFFLINK; + + gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); if (gw_ire == NULL) { /* * With IPMP, we allow host routes to influence in.mpathd's @@ -6862,10 +5651,13 @@ again: * underlying IRE_INTERFACEs are marked hidden. So allow * hidden test IREs to be found and try again. */ - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { + match_flags |= MATCH_IRE_TESTHIDDEN; goto again; } + + if (ipif != NULL) + ipif_refrele(ipif); return (ENETUNREACH); } @@ -6885,10 +5677,12 @@ again: type = IRE_PREFIX; /* check for a duplicate entry */ - ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, - match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, + 0, ipst, NULL); if (ire != NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_refrele(ire); return (EEXIST); @@ -6905,6 +5699,8 @@ again: /* we hold reference to it upon success */ gcgrp = gcgrp_lookup(&ga, B_TRUE); if (gcgrp == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -6918,6 +5714,8 @@ again: */ gc = gc_create(sp, gcgrp, &gcgrp_xtraref); if (gc == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); /* release reference held by gcgrp_lookup */ GCGRP_REFRELE(gcgrp); ire_refrele(gw_ire); @@ -6929,23 +5727,12 @@ again: ire = ire_create( (uchar_t *)&dst_addr, /* dest address */ (uchar_t *)&mask, /* mask */ - /* src address assigned by the caller? */ - (uchar_t *)(((src_addr != INADDR_ANY) && - (flags & RTF_SETSRC)) ? 
&src_addr : NULL), (uchar_t *)&gw_addr, /* gateway address */ - &gw_ire->ire_max_frag, - NULL, /* no src nce */ - NULL, /* no recv-from queue */ - NULL, /* no send-to queue */ (ushort_t)type, /* IRE type */ - ipif_arg, - 0, - 0, - 0, + ill, + zoneid, flags, - &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ gc, /* security attribute */ - NULL, ipst); /* @@ -6958,26 +5745,51 @@ again: if (ire == NULL) { if (gc != NULL) GC_REFRELE(gc); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } + /* Before we add, check if an extra CGTP broadcast is needed */ + cgtp_broadcast = ((flags & RTF_MULTIRT) && + ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); + + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? */ /* Add the new IRE. */ - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error != 0) { + nire = ire_add(ire); + if (nire == NULL) { /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. */ + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); - return (error); + return (ENOMEM); + } + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire != ire) { + ire_delete(nire); + ire_refrele(nire); + if (ipif != NULL) + ipif_refrele(ipif); + ire_refrele(gw_ire); + return (EEXIST); } + ire = nire; if (flags & RTF_MULTIRT) { /* @@ -6990,45 +5802,47 @@ again: * because an IP source address cannot be a broadcast * or a multicast. 
*/ - ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire_dst != NULL) { - ip_cgtp_bcast_add(ire, ire_dst, ipst); - ire_refrele(ire_dst); + if (cgtp_broadcast) { + ip_cgtp_bcast_add(ire, ipst); goto save_ire; } if (ipst->ips_ip_cgtp_filter_ops != NULL && !CLASSD(ire->ire_addr)) { - int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( - ipst->ips_netstack->netstack_stackid, - ire->ire_addr, - ire->ire_gateway_addr, - ire->ire_src_addr, - gw_ire->ire_src_addr); + int res; + ipif_t *src_ipif; + + /* Find the source address corresponding to gw_ire */ + src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, + NULL, zoneid, ipst); + if (src_ipif != NULL) { + res = ipst->ips_ip_cgtp_filter_ops-> + cfo_add_dest_v4( + ipst->ips_netstack->netstack_stackid, + ire->ire_addr, + ire->ire_gateway_addr, + ire->ire_setsrc_addr, + src_ipif->ipif_lcl_addr); + ipif_refrele(src_ipif); + } else { + res = EADDRNOTAVAIL; + } if (res != 0) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_delete(ire); + ire_refrele(ire); /* Held in ire_add */ return (res); } } } - /* - * Now that the prefix IRE entry has been created, delete any - * existing gateway IRE cache entries as well as any IRE caches - * using the gateway, and force them to be created through - * ip_newroute. - */ - if (gc != NULL) { - ASSERT(gcgrp != NULL); - ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); - } - save_ire: if (gw_ire != NULL) { ire_refrele(gw_ire); + gw_ire = NULL; } - if (ipif != NULL) { + if (ill != NULL) { /* * Save enough information so that we can recreate the IRE if * the interface goes down and then up. The metrics associated @@ -7037,7 +5851,7 @@ save_ire: * memory cannot be allocated, none of this information will be * saved. 
*/ - ipif_save_ire(ipif, ire); + ill_save_ire(ill, ire); } if (ioctl_msg) ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); @@ -7052,27 +5866,23 @@ save_ire: } else { ire_refrele(ire); /* Held in ire_add */ } - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (0); } /* * ip_rt_delete is called to delete an IPv4 route. - * ipif_arg is passed in to associate it with the correct interface. - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is specified by 'func' + * ill is passed in to associate it with the correct interface. */ /* ARGSUSED4 */ int ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, - uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, - queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) + uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, + ip_stack_t *ipst, zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *ipif; - boolean_t ipif_refheld = B_FALSE; uint_t type; uint_t match_flags = MATCH_IRE_TYPE; int err = 0; @@ -7096,52 +5906,47 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * * This makes it possible to delete an original * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. + * However, we have RTF_KERNEL set on the ones created by ipif_up + * and those can not be deleted here. * - * As the interface index specified with the RTA_IFP sockaddr is the - * same for all ipif's off of an ill, the matching logic below uses - * MATCH_IRE_ILL if such an index was specified. This means a route - * sharing the same prefix and interface index as the the route - * intended to be deleted might be deleted instead if a RTA_IFP sockaddr - * is specified in the request. 
- * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix can be - * uniquely identified if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * We use MATCH_IRE_ILL if we know the interface. If the caller + * specified an interface (from the RTA_IFP sockaddr) we use it, + * otherwise we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. * * For more detail on specifying routes by gateway address and by * interface index, see the comments in ip_rt_add(). */ - ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, - ipst); - if (ipif != NULL) - ipif_refheld = B_TRUE; - else if (err == EINPROGRESS) - return (err); - else - err = 0; + ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); if (ipif != NULL) { - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - match_flags |= MATCH_IRE_IPIF; - } + ill_t *ill_match; + + if (ill != NULL) + ill_match = ill; + else + ill_match = ipif->ipif_ill; + + match_flags |= MATCH_IRE_ILL; if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, - ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, + ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); } if (ire == NULL) { - ire = ire_ftable_lookup(dst_addr, mask, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - match_flags, ipst); + match_flags |= MATCH_IRE_GW; + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill_match, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); } + /* Avoid deleting routes created by kernel from an ipif */ + if (ire 
!= NULL && (ire->ire_flags & RTF_KERNEL)) { + ire_refrele(ire); + ire = NULL; + } + + /* Restore in case we didn't find a match */ + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); } if (ire == NULL) { @@ -7151,15 +5956,11 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * set the IRE type to lookup based on whether * this is a host route, a default route or just a prefix. * - * If an ipif_arg was passed in, then the lookup is based on an + * If an ill was passed in, then the lookup is based on an * interface index so MATCH_IRE_ILL is added to match_flags. - * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is - * set as the route being looked up is not a traditional - * interface route. */ - match_flags &= ~MATCH_IRE_IPIF; match_flags |= MATCH_IRE_GW; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; if (mask == IP_HOST_MASK) type = IRE_HOST; @@ -7167,14 +5968,15 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, type = IRE_DEFAULT; else type = IRE_PREFIX; - ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); } - if (ipif_refheld) + if (ipif != NULL) { ipif_refrele(ipif); + ipif = NULL; + } - /* ipif is not refheld anymore */ if (ire == NULL) return (ESRCH); @@ -7193,9 +5995,9 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ip_cgtp_bcast_delete(ire, ipst); } - ipif = ire->ire_ipif; - if (ipif != NULL) - ipif_remove_ire(ipif, ire); + ill = ire->ire_ill; + if (ill != NULL) + ill_remove_saved_ire(ill, ire); if (ioctl_msg) ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); ire_delete(ire); @@ -7249,7 +6051,7 @@ ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, - B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); + B_TRUE, NULL, 
ipst, ALL_ZONES); if (ipif != NULL) ipif_refrele(ipif); return (error); @@ -7301,8 +6103,8 @@ ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } error = ip_rt_delete(dst_addr, mask, gw_addr, - RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, - mp, ip_process_ioctl, ipst); + RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, + ipst, ALL_ZONES); if (ipif != NULL) ipif_refrele(ipif); return (error); @@ -7655,7 +6457,8 @@ ipsq_dlpi_done(ipsq_t *ipsq) if (phyi != NULL) { ill = phyi->phyint_illv4; if (ill != NULL && - ill->ill_dlpi_pending != DL_PRIM_INVAL) + (ill->ill_dlpi_pending != DL_PRIM_INVAL || + ill->ill_arl_dlpi_pending)) return (B_FALSE); ill = phyi->phyint_illv6; @@ -7819,8 +6622,8 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize - * certain critical operations like plumbing (i.e. most set ioctls), multicast - * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * certain critical operations like plumbing (i.e. most set ioctls), etc. + * There is one ipsq per phyint. The ipsq * serializes exclusive ioctls issued by applications on a per ipsq basis in * ipsq_xopq_mphead. It also protects against multiple threads executing in * the ipsq. Responses from the driver pertain to the current ioctl (say a @@ -7838,7 +6641,7 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp or the driver or could be waiting for + * a response from another module (the driver or could be waiting for * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp * and ipx_pending_ipif are set. 
ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless @@ -7959,6 +6762,38 @@ ipsq_exit(ipsq_t *ipsq) } /* + * Used to start any igmp or mld timers that could not be started + * while holding ill_mcast_lock. The timers can't be started while holding + * the lock, since mld/igmp_start_timers may need to call untimeout() + * which can't be done while holding the lock which the timeout handler + * acquires. Otherwise + * there could be a deadlock since the timeout handlers + * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire + * ill_mcast_lock. + */ +void +ill_mcast_timer_start(ip_stack_t *ipst) +{ + int next; + + mutex_enter(&ipst->ips_igmp_timer_lock); + next = ipst->ips_igmp_deferred_next; + ipst->ips_igmp_deferred_next = INFINITY; + mutex_exit(&ipst->ips_igmp_timer_lock); + + if (next != INFINITY) + igmp_start_timers(next, ipst); + + mutex_enter(&ipst->ips_mld_timer_lock); + next = ipst->ips_mld_deferred_next; + ipst->ips_mld_deferred_next = INFINITY; + mutex_exit(&ipst->ips_mld_timer_lock); + + if (next != INFINITY) + mld_start_timers(next, ipst); +} + +/* * Start the current exclusive operation on `ipsq'; associate it with `ipif' * and `ioccmd'. 
*/ @@ -8101,7 +6936,6 @@ ipsq_flush(ill_t *ill) mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); - ill_pending_mp_cleanup(ill); } /* @@ -8114,7 +6948,7 @@ ipsq_flush(ill_t *ill) */ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) + cmd_info_t *ci) { char *name; struct ifreq *ifr; @@ -8124,7 +6958,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, conn_t *connp; boolean_t isv6; boolean_t exists; - int err; mblk_t *mp1; zoneid_t zoneid; ip_stack_t *ipst; @@ -8138,7 +6971,7 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, } else { ill = NULL; connp = Q_TO_CONN(q); - isv6 = connp->conn_af_isv6; + isv6 = (connp->conn_family == AF_INET6); zoneid = connp->conn_zoneid; if (zoneid == GLOBAL_ZONEID) { /* global zone can access ipifs in all zones */ @@ -8195,13 +7028,38 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, ipif_refhold(ipif); } else { ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, - &exists, isv6, zoneid, - (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, - ipst); - if (ipif == NULL) { - if (err == EINPROGRESS) - return (err); - err = 0; /* Ensure we don't use it below */ + &exists, isv6, zoneid, ipst); + + /* + * Ensure that get ioctls don't see any internal state changes + * caused by set ioctls by deferring them if IPIF_CHANGING is + * set. 
+ */ + if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && + !IAM_WRITER_IPIF(ipif)) { + ipsq_t *ipsq; + + if (connp != NULL) + mutex_enter(&connp->conn_lock); + mutex_enter(&ipif->ipif_ill->ill_lock); + if (IPIF_IS_CHANGING(ipif) && + !IPIF_IS_CONDEMNED(ipif)) { + ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); + mutex_exit(&ipif->ipif_ill->ill_lock); + ipsq_enq(ipsq, q, mp, ip_process_ioctl, + NEW_OP, ipif->ipif_ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + if (connp != NULL) + mutex_exit(&connp->conn_lock); + ipif_refrele(ipif); + return (EINPROGRESS); + } + mutex_exit(&ipif->ipif_ill->ill_lock); + if (connp != NULL) + mutex_exit(&connp->conn_lock); } } @@ -8226,6 +7084,9 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", + int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); + ci->ci_ipif = ipif; return (0); } @@ -8544,7 +7405,6 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, struct iocblk *iocp = (struct iocblk *)mp->b_rptr; uint_t ifindex; zoneid_t zoneid; - int err = 0; boolean_t isv6 = B_FALSE; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; @@ -8571,13 +7431,12 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, return (EINVAL); ifindex = STRUCT_FGET(lifs, lifs_ifindex); - isv6 = (Q_TO_CONN(q))->conn_af_isv6; - ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, - ip_process_ioctl, &err, ipst); + isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; + ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); if (ipif == NULL) { ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", ifindex)); - return (err); + return (ENXIO); } /* Allocate a buffer to hold requested information */ @@ -8943,17 +7802,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) in6_addr_t *daddr, 
*saddr; ipaddr_t v4daddr; ire_t *ire; + ipaddr_t v4setsrc; + in6_addr_t v6setsrc; char *slabel, *dlabel; boolean_t isipv4; int match_ire; ill_t *dst_ill; - ipif_t *src_ipif, *ire_ipif; struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - zoneid_t zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); + conn_t *connp = Q_TO_CONN(q); + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint64_t ipif_flags; ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ - zoneid = Q_TO_CONN(q)->conn_zoneid; /* * This ioctl is I_STR only, and must have a @@ -8976,7 +7837,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) data_mp = new_data_mp; mp->b_cont = data_mp; } - match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; + match_ire = MATCH_IRE_DSTONLY; for (cur = data_mp->b_rptr, end = data_mp->b_wptr; end - cur >= sizeof (struct dstinforeq); @@ -8987,8 +7848,8 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) /* * ip_addr_scope_v6() and ip6_asp_lookup() handle - * v4 mapped addresses; ire_ftable_lookup[_v6]() - * and ipif_select_source[_v6]() do not. + * v4 mapped addresses; ire_ftable_lookup_v6() + * and ip_select_source_v6() do not. 
*/ dir->dir_dscope = ip_addr_scope_v6(daddr); dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); @@ -8996,13 +7857,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); if (isipv4) { IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); - ire = ire_ftable_lookup(v4daddr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); + v4setsrc = INADDR_ANY; + ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, + NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL, + NULL); } else { - ire = ire_ftable_lookup_v6(daddr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); + v6setsrc = ipv6_all_zeros; + ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, + NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL, + NULL); } - if (ire == NULL) { + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire); dir->dir_dreachable = 0; /* move on to next dst addr */ @@ -9010,36 +7877,40 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) } dir->dir_dreachable = 1; - ire_ipif = ire->ire_ipif; - if (ire_ipif == NULL) - goto next_dst; + dst_ill = ire_nexthop_ill(ire); + if (dst_ill == NULL) { + ire_refrele(ire); + continue; + } - /* - * We expect to get back an interface ire or a - * gateway ire cache entry. For both types, the - * output interface is ire_ipif->ipif_ill. 
- */ - dst_ill = ire_ipif->ipif_ill; + /* With ipmp we most likely look at the ipmp ill here */ dir->dir_dmactype = dst_ill->ill_mactype; if (isipv4) { - src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); + ipaddr_t v4saddr; + + if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, + connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, + &v4saddr, NULL, &ipif_flags) != 0) { + v4saddr = INADDR_ANY; + ipif_flags = 0; + } + IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); } else { - src_ipif = ipif_select_source_v6(dst_ill, - daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); + if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, + zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, + saddr, NULL, &ipif_flags) != 0) { + *saddr = ipv6_all_zeros; + ipif_flags = 0; + } } - if (src_ipif == NULL) - goto next_dst; - *saddr = src_ipif->ipif_v6lcl_addr; dir->dir_sscope = ip_addr_scope_v6(saddr); slabel = ip6_asp_lookup(saddr, NULL, ipst); dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); - dir->dir_sdeprecated = - (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; - ipif_refrele(src_ipif); -next_dst: + dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; ire_refrele(ire); + ill_refrele(dst_ill); } miocack(q, mp, iocp->ioc_count, 0); } @@ -9088,16 +7959,16 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4_addr); - ire = ire_ctable_lookup(v4_addr, 0, - IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + ire = ire_ftable_lookup_v4(v4_addr, 0, 0, + IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); } else { in6_addr_t v6addr; v6addr = sin6->sin6_addr; - ire = ire_ctable_lookup_v6(&v6addr, 0, - IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + ire = ire_ftable_lookup_v6(&v6addr, 0, 0, + IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); } break; } @@ -9105,9 +7976,9 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ipaddr_t v4addr; v4addr = sin->sin_addr.s_addr; - ire = ire_ctable_lookup(v4addr, 0, + ire = ire_ftable_lookup_v4(v4addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); break; } default: @@ -9160,9 +8031,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, sin = (sin_t *)&sia->sa_addr; /* - * Match addresses with a zero gateway field to avoid - * routes going through a router. - * Exclude broadcast and multicast addresses. + * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST + * to make sure we only look at on-link unicast address. 
*/ switch (sin->sin_family) { case AF_INET6: { @@ -9174,20 +8044,18 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4_addr); if (!CLASSD(v4_addr)) { - ire = ire_route_lookup(v4_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_DSTONLY, + 0, ipst, NULL); } } else { in6_addr_t v6addr; - in6_addr_t v6gw; v6addr = sin6->sin6_addr; - v6gw = ipv6_all_zeros; if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { - ire = ire_route_lookup_v6(&v6addr, 0, - &v6gw, 0, NULL, NULL, zoneid, - NULL, MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, + ipst, NULL); } } break; @@ -9197,9 +8065,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, v4addr = sin->sin_addr.s_addr; if (!CLASSD(v4addr)) { - ire = ire_route_lookup(v4addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); } break; } @@ -9208,10 +8075,11 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } sia->sa_res = 0; if (ire != NULL) { - if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| - IRE_LOCAL|IRE_LOOPBACK)) { + ASSERT(!(ire->ire_type & IRE_MULTICAST)); + + if ((ire->ire_type & IRE_ONLINK) && + !(ire->ire_type & IRE_BROADCAST)) sia->sa_res = 1; - } ire_refrele(ire); } return (0); @@ -9228,54 +8096,40 @@ ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENXIO); } -/* - * ARP IOCTLs. - * How does IP get in the business of fronting ARP configuration/queries? - * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) - * are by tradition passed in through a datagram socket. That lands in IP. 
- * As it happens, this is just as well since the interface is quite crude in - * that it passes in no information about protocol or hardware types, or - * interface association. After making the protocol assumption, IP is in - * the position to look up the name of the ILL, which ARP will need, and - * format a request that can be handled by ARP. The request is passed up - * stream to ARP, and the original IOCTL is completed by IP when ARP passes - * back a response. ARP supports its own set of more general IOCTLs, in - * case anyone is interested. - */ +/* ARP IOCTLs. */ /* ARGSUSED */ int ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { - mblk_t *mp1; - mblk_t *mp2; - mblk_t *pending_mp; - ipaddr_t ipaddr; - area_t *area; - struct iocblk *iocp; - conn_t *connp; - struct arpreq *ar; - struct xarpreq *xar; - int flags, alength; - uchar_t *lladdr; - ire_t *ire; - ip_stack_t *ipst; - ill_t *ill = ipif->ipif_ill; - ill_t *proxy_ill = NULL; - ipmp_arpent_t *entp = NULL; - boolean_t if_arp_ioctl = B_FALSE; - boolean_t proxyarp = B_FALSE; + int err; + ipaddr_t ipaddr; + struct iocblk *iocp; + conn_t *connp; + struct arpreq *ar; + struct xarpreq *xar; + int arp_flags, flags, alength; + uchar_t *lladdr; + ip_stack_t *ipst; + ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; + boolean_t proxyarp = B_FALSE; + boolean_t if_arp_ioctl = B_FALSE; + ncec_t *ncec = NULL; + nce_t *nce; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); ipst = connp->conn_netstack->netstack_ip; + iocp = (struct iocblk *)mp->b_rptr; if (ipip->ipi_cmd_type == XARP_CMD) { /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; ar = NULL; - flags = xar->xarp_flags; + arp_flags = xar->xarp_flags; lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); /* @@ -9294,7 +8148,7 @@ ip_sioctl_arp(ipif_t 
*ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; xar = NULL; - flags = ar->arp_flags; + arp_flags = ar->arp_flags; lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link @@ -9315,7 +8169,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - ipaddr = sin->sin_addr.s_addr; + /* Translate ATF* flags to NCE* flags */ + flags = 0; + if (arp_flags & ATF_AUTHORITY) + flags |= NCE_F_AUTHORITY; + if (arp_flags & ATF_PERM) + flags |= NCE_F_NONUD; /* not subject to aging */ + if (arp_flags & ATF_PUBL) + flags |= NCE_F_PUBLISH; /* * IPMP ARP special handling: @@ -9349,171 +8210,120 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lladdr = proxy_ill->ill_phys_addr; } /* FALLTHRU */ - case SIOCDARP: - case SIOCDXARP: - ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - ire_refrele(ire); - return (EPERM); - } } } + ipaddr = sin->sin_addr.s_addr; /* - * We are going to pass up to ARP a packet chain that looks - * like: - * - * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK - * - * Get a copy of the original IOCTL mblk to head the chain, - * to be sent up (in mp1). Also get another copy to store - * in the ill_pending_mp list, for matching the response - * when it comes back from ARP. - */ - mp1 = copyb(mp); - pending_mp = copymsg(mp); - if (mp1 == NULL || pending_mp == NULL) { - if (mp1 != NULL) - freeb(mp1); - if (pending_mp != NULL) - inet_freemsg(pending_mp); - return (ENOMEM); - } - - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (caddr_t)&ipaddr); - if (mp2 == NULL) { - freeb(mp1); - inet_freemsg(pending_mp); - return (ENOMEM); - } - /* Put together the chain. 
*/ - mp1->b_cont = mp2; - mp1->b_datap->db_type = M_IOCTL; - mp2->b_cont = mp; - mp2->b_datap->db_type = M_DATA; - - iocp = (struct iocblk *)mp1->b_rptr; - - /* - * An M_IOCDATA's payload (struct copyresp) is mostly the same as an - * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a - * cp_private field (or cp_rval on 32-bit systems) in place of the - * ioc_count field; set ioc_count to be correct. + * don't match across illgrp per case (1) and (2). + * XXX use IS_IPMP(ill) like ndp_sioc_update? */ - iocp->ioc_count = MBLKL(mp1->b_cont); + nce = nce_lookup_v4(ill, &ipaddr); + if (nce != NULL) + ncec = nce->nce_common; - /* - * Set the proper command in the ARP message. - * Convert the SIOC{G|S|D}ARP calls into our - * AR_ENTRY_xxx calls. - */ - area = (area_t *)mp2->b_rptr; switch (iocp->ioc_cmd) { case SIOCDARP: - case SIOCDXARP: + case SIOCDXARP: { /* - * We defer deleting the corresponding IRE until - * we return from arp. + * Delete the NCE if any. + */ + if (ncec == NULL) { + iocp->ioc_error = ENXIO; + break; + } + /* Don't allow changes to arp mappings of local addresses. */ + if (NCE_MYADDR(ncec)) { + nce_refrele(nce); + return (ENOTSUP); + } + iocp->ioc_error = 0; + + /* + * Delete the nce_common which has ncec_ill set to ipmp_ill. + * This will delete all the nce entries on the under_ills. + */ + ncec_delete(ncec); + /* + * Once the NCE has been deleted, then the ire_dep* consistency + * mechanism will find any IRE which depended on the now + * condemned NCE (as part of sending packets). + * That mechanism handles redirects by deleting redirects + * that refer to UNREACHABLE nces. 
*/ - area->area_cmd = AR_ENTRY_DELETE; - area->area_proto_mask_offset = 0; break; + } case SIOCGARP: case SIOCGXARP: - area->area_cmd = AR_ENTRY_SQUERY; - area->area_proto_mask_offset = 0; + if (ncec != NULL) { + lladdr = ncec->ncec_lladdr; + flags = ncec->ncec_flags; + iocp->ioc_error = 0; + ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); + } else { + iocp->ioc_error = ENXIO; + } break; case SIOCSARP: case SIOCSXARP: - /* - * Delete the corresponding ire to make sure IP will - * pick up any change from arp. - */ + /* Don't allow changes to arp mappings of local addresses. */ + if (ncec != NULL && NCE_MYADDR(ncec)) { + nce_refrele(nce); + return (ENOTSUP); + } + + /* static arp entries will undergo NUD if ATF_PERM is not set */ + flags |= NCE_F_STATIC; if (!if_arp_ioctl) { - (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); + ip_nce_lookup_and_update(&ipaddr, NULL, ipst, + lladdr, alength, flags); } else { ipif_t *ipif = ipif_get_next_ipif(NULL, ill); if (ipif != NULL) { - (void) ip_ire_clookup_and_delete(ipaddr, ipif, - ipst); + ip_nce_lookup_and_update(&ipaddr, ipif, ipst, + lladdr, alength, flags); ipif_refrele(ipif); } } - break; - } - iocp->ioc_cmd = area->area_cmd; - - /* - * Fill in the rest of the ARP operation fields. - */ - area->area_hw_addr_length = alength; - bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); - - /* Translate the flags. */ - if (flags & ATF_PERM) - area->area_flags |= ACE_F_PERMANENT; - if (flags & ATF_PUBL) - area->area_flags |= ACE_F_PUBLISH; - if (flags & ATF_AUTHORITY) - area->area_flags |= ACE_F_AUTHORITY; - - /* - * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it - * so that IP can update ARP as the active ills in the group change. 
- */ - if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && - (area->area_flags & ACE_F_PERMANENT)) { - entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); - + if (nce != NULL) { + nce_refrele(nce); + nce = NULL; + } /* - * The second part of the conditional below handles a corner - * case: if this is proxy ARP and the IPMP group has no active - * interfaces, we can't send the request to ARP now since it - * won't be able to build an ACE. So we return success and - * notify ARP about the proxy ARP entry once an interface - * becomes active. + * NCE_F_STATIC entries will be added in state ND_REACHABLE + * by nce_add_common() */ - if (entp == NULL || (proxyarp && proxy_ill == NULL)) { - mp2->b_cont = NULL; - inet_freemsg(mp1); - inet_freemsg(pending_mp); - return (entp == NULL ? ENOMEM : 0); + err = nce_lookup_then_add_v4(ill, lladdr, + ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, + &nce); + if (err == EEXIST) { + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_REACHABLE; + ncec->ncec_flags = flags; + nce_update(ncec, ND_UNCHANGED, lladdr); + mutex_exit(&ncec->ncec_lock); + err = 0; + } + if (nce != NULL) { + nce_refrele(nce); + nce = NULL; + } + if (IS_IPMP(ill) && err == 0) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, + proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, + flags); + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + iocp->ioc_error = (entp == NULL ? ENOMEM : 0); + break; + } } + iocp->ioc_error = err; } - /* - * Before sending 'mp' to ARP, we have to clear the b_next - * and b_prev. Otherwise if STREAMS encounters such a message - * in freemsg(), (because ARP can close any time) it can cause - * a panic. But mi code needs the b_next and b_prev values of - * mp->b_cont, to complete the ioctl. So we store it here - * in pending_mp->bcont, and restore it in ip_sioctl_iocack() - * when the response comes down from ARP. 
- */ - pending_mp->b_cont->b_next = mp->b_cont->b_next; - pending_mp->b_cont->b_prev = mp->b_cont->b_prev; - mp->b_cont->b_next = NULL; - mp->b_cont->b_prev = NULL; - - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - /* conn has not yet started closing, hence this can't fail */ - if (ipip->ipi_flags & IPI_WR) { - VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), - pending_mp, 0) != 0); - } else { - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (nce != NULL) { + nce_refrele(nce); } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - - /* - * Up to ARP it goes. The response will come back in ip_wput() as an - * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. - */ - putnext(ill->ill_rq, mp1); /* * If we created an IPMP ARP entry, mark that we've notified ARP. @@ -9521,7 +8331,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (entp != NULL) ipmp_illgrp_mark_arpent(ill->ill_grp, entp); - return (EINPROGRESS); + return (iocp->ioc_error); } /* @@ -9530,10 +8340,9 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, */ int ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) + cmd_info_t *ci) { mblk_t *mp1; - int err; sin_t *sin; conn_t *connp; ipif_t *ipif; @@ -9548,7 +8357,7 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, /* ioctl comes down on a conn */ ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); - if (connp->conn_af_isv6) + if (connp->conn_family == AF_INET6) return (ENXIO); ipst = connp->conn_netstack->netstack_ip; @@ -9575,10 +8384,9 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, - B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), - mp, func, &err, ipst); + B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); if 
(ipif == NULL) - return (err); + return (ENXIO); if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); @@ -9591,23 +8399,24 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * find the wrong ill, so we first do an ipif_lookup_addr(). */ ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, - CONNP_TO_WQ(connp), mp, func, &err, ipst); + ipst); if (ipif == NULL) { - ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, - IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_TYPE, ipst); - if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { + ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, + 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ASSERT(ire != NULL && ill != NULL); ipif = ill->ill_ipif; ipif_refhold(ipif); ire_refrele(ire); } } - if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { ipif_refrele(ipif); return (ENXIO); } @@ -9700,123 +8509,20 @@ ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) void ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { - mblk_t *mp1, *mp2; + mblk_t *mp1; struct linkblk *li; - struct ipmx_s *ipmxp; - ill_t *ill; int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; int err = 0; - boolean_t entered_ipsq = B_FALSE; - boolean_t islink; - ip_stack_t *ipst; - - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || ioccmd == I_LINK || ioccmd == I_UNLINK); - islink = (ioccmd == I_PLINK || ioccmd == I_LINK); - mp1 = mp->b_cont; /* This is the linkblk info */ li = (struct linkblk *)mp1->b_rptr; - /* - * ARP has added this special mblk, and the utility is asking us - * to perform consistency checks, and also atomically set the - * muxid. Ifconfig is an example. 
It achieves this by using - * /dev/arp as the mux to plink the arp stream, and pushes arp on - * to /dev/udp[6] stream for use as the mux when plinking the IP - * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c - * and other comments in this routine for more details. - */ - mp2 = mp1->b_cont; /* This is added by ARP */ - - /* - * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than - * ifconfig which didn't push ARP on top of the dummy mux, we won't - * get the special mblk above. For backward compatibility, we - * request ip_sioctl_plink_ipmod() to skip the consistency checks. - * The utility will use SIOCSLIFMUXID to store the muxids. This is - * not atomic, and can leave the streams unplumbable if the utility - * is interrupted before it does the SIOCSLIFMUXID. - */ - if (mp2 == NULL) { - err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); - if (err == EINPROGRESS) - return; - goto done; - } - - /* - * This is an I_{P}LINK sent down by ifconfig through the ARP module; - * ARP has appended this last mblk to tell us whether the lower stream - * is an arp-dev stream or an IP module stream. - */ - ipmxp = (struct ipmx_s *)mp2->b_rptr; - if (ipmxp->ipmx_arpdev_stream) { - /* - * The lower stream is the arp-dev stream. - */ - ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, - q, mp, ip_sioctl_plink, &err, NULL, ipst); - if (ill == NULL) { - if (err == EINPROGRESS) - return; - err = EINVAL; - goto done; - } - - if (ipsq == NULL) { - ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_FALSE); - if (ipsq == NULL) { - ill_refrele(ill); - return; - } - entered_ipsq = B_TRUE; - } - ASSERT(IAM_WRITER_ILL(ill)); - ill_refrele(ill); - - /* - * To ensure consistency between IP and ARP, the following - * LIFO scheme is used in plink/punlink. (IP first, ARP last). - * This is because the muxid's are stored in the IP stream on - * the ill. - * - * I_{P}LINK: ifconfig plinks the IP stream before plinking - * the ARP stream. 
On an arp-dev stream, IP checks that it is - * not yet plinked, and it also checks that the corresponding - * IP stream is already plinked. - * - * I_{P}UNLINK: ifconfig punlinks the ARP stream before - * punlinking the IP stream. IP does not allow punlink of the - * IP stream unless the arp stream has been punlinked. - */ - if ((islink && - (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || - (!islink && ill->ill_arp_muxid != li->l_index)) { - err = EINVAL; - goto done; - } - - if (IS_IPMP(ill) && - (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) - goto done; - - ill->ill_arp_muxid = islink ? li->l_index : 0; - } else { - /* - * The lower stream is probably an IP module stream. Do - * consistency checking. - */ - err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); - if (err == EINPROGRESS) - return; - } + err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); + if (err == EINPROGRESS) + return; done: if (err == 0) miocack(q, mp, 0, 0); @@ -9826,21 +8532,19 @@ done: /* Conn was refheld in ip_sioctl_copyin_setup */ if (CONN_Q(q)) CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); - if (entered_ipsq) - ipsq_exit(ipsq); } /* * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP * module stream). If `doconsist' is set, then do the extended consistency - * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. + * checks requested by ifconfig(1M) and (atomically) set ill_muxid here. * Returns zero on success, EINPROGRESS if the operation is still pending, or * an error code on failure. 
*/ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, - struct linkblk *li, boolean_t doconsist) + struct linkblk *li) { int err = 0; ill_t *ill; @@ -9849,6 +8553,8 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct qinit *qinfo; boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); boolean_t entered_ipsq = B_FALSE; + boolean_t is_ip = B_FALSE; + arl_t *arl; /* * Walk the lower stream to verify it's the IP module stream. @@ -9861,6 +8567,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, name = qinfo->qi_minfo->mi_idname; if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { + is_ip = B_TRUE; + break; + } + if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && + qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { break; } } @@ -9871,30 +8582,46 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipwq == NULL) return (0); - ill = ipwq->q_ptr; + if (!is_ip) { + arl = (arl_t *)ipwq->q_ptr; + ill = arl_to_ill(arl); + if (ill == NULL) + return (0); + } else { + ill = ipwq->q_ptr; + } ASSERT(ill != NULL); if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, NEW_OP, B_FALSE); - if (ipsq == NULL) + if (ipsq == NULL) { + if (!is_ip) + ill_refrele(ill); return (EINPROGRESS); + } entered_ipsq = B_TRUE; } ASSERT(IAM_WRITER_ILL(ill)); - - if (doconsist) { - /* - * Consistency checking requires that I_{P}LINK occurs - * prior to setting ill_ip_muxid, and that I_{P}UNLINK - * occurs prior to clearing ill_arp_muxid. - */ - if ((islink && ill->ill_ip_muxid != 0) || - (!islink && ill->ill_arp_muxid != 0)) { - err = EINVAL; - goto done; + mutex_enter(&ill->ill_lock); + if (!is_ip) { + if (islink && ill->ill_muxid == 0) { + /* + * Plumbing has to be done with IP plumbed first, arp + * second, but here we have arp being plumbed first. 
+ */ + mutex_exit(&ill->ill_lock); + ipsq_exit(ipsq); + ill_refrele(ill); + return (EINVAL); } } + mutex_exit(&ill->ill_lock); + if (!is_ip) { + arl->arl_muxid = islink ? li->l_index : 0; + ill_refrele(ill); + goto done; + } if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) goto done; @@ -9912,8 +8639,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill->ill_lmod_cnt++; } - if (doconsist) - ill->ill_ip_muxid = islink ? li->l_index : 0; + ill->ill_muxid = islink ? li->l_index : 0; /* * Mark the ipsq busy until the capability operations initiated below @@ -9997,11 +8723,11 @@ ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, } /* - * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message + * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle * in either I_STR or TRANSPARENT form, using the mi_copy facility. * We establish here the size of the block to be copied in. mi_copyin - * arranges for this to happen, an processing continues in ip_wput with + * arranges for this to happen, an processing continues in ip_wput_nondata with * an M_IOCDATA message. */ void @@ -10054,17 +8780,7 @@ ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) * will fail all ioctls). */ if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { - if (ipip->ipi_flags & IPI_PASS_DOWN) { - /* - * Pass common Streams ioctls which the IP - * module does not own or consume along to - * be processed down stream. - */ - putnext(q, mp); - return; - } else { - goto nak; - } + goto nak; } /* Make sure we have ioctl data to process. 
*/ @@ -10216,286 +8932,62 @@ nak: qreply(q, mp); } -/* ip_wput hands off ARP IOCTL responses to us */ -/* ARGSUSED3 */ -void -ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +static void +ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) { struct arpreq *ar; struct xarpreq *xar; - area_t *area; - mblk_t *area_mp; + mblk_t *tmp; struct iocblk *iocp; - mblk_t *orig_ioc_mp, *tmp; - struct iocblk *orig_iocp; - ill_t *ill; - conn_t *connp = NULL; - mblk_t *pending_mp; - int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; + int x_arp_ioctl = B_FALSE; int *flagsp; char *storage = NULL; - sin_t *sin; - ipaddr_t addr; - int err; - ip_stack_t *ipst; - ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); - ill = q->q_ptr; ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - /* - * We should get back from ARP a packet chain that looks like: - * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK - */ - if (!(area_mp = mp->b_cont) || - (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || - !(orig_ioc_mp = area_mp->b_cont) || - !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { - freemsg(mp); - return; - } - orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; + iocp = (struct iocblk *)mp->b_rptr; + ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); - tmp = (orig_ioc_mp->b_cont)->b_cont; - if ((orig_iocp->ioc_cmd == SIOCGXARP) || - (orig_iocp->ioc_cmd == SIOCSXARP) || - (orig_iocp->ioc_cmd == SIOCDXARP)) { + tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ + if ((iocp->ioc_cmd == SIOCGXARP) || + (iocp->ioc_cmd == SIOCSXARP)) { x_arp_ioctl = B_TRUE; xar = (struct xarpreq *)tmp->b_rptr; - sin = (sin_t *)&xar->xarp_pa; flagsp = &xar->xarp_flags; storage = xar->xarp_ha.sdl_data; - if (xar->xarp_ha.sdl_nlen != 0) - ifx_arp_ioctl = B_TRUE; } else { ar = (struct arpreq *)tmp->b_rptr; - sin = (sin_t *)&ar->arp_pa; flagsp = &ar->arp_flags; storage = ar->arp_ha.sa_data; } - iocp = (struct iocblk *)mp->b_rptr; - 
- /* - * Find the pending message; if we're exclusive, it'll be on our IPSQ. - * Otherwise, we can find it from our ioc_id. - */ - if (ipsq != NULL) - pending_mp = ipsq_pending_mp_get(ipsq, &connp); - else - pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); - - if (pending_mp == NULL) { - ASSERT(connp == NULL); - inet_freemsg(mp); - return; - } - ASSERT(connp != NULL); - q = CONNP_TO_WQ(connp); - - /* Uncouple the internally generated IOCTL from the original one */ - area = (area_t *)area_mp->b_rptr; - area_mp->b_cont = NULL; - - /* - * Restore the b_next and b_prev used by mi code. This is needed - * to complete the ioctl using mi* functions. We stored them in - * the pending mp prior to sending the request to ARP. - */ - orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; - orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; - inet_freemsg(pending_mp); - /* - * We're done if there was an error or if this is not an SIOCG{X}ARP - * Catch the case where there is an IRE_CACHE by no entry in the - * arp table. - */ - addr = sin->sin_addr.s_addr; - if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { - ire_t *ire; - dl_unitdata_req_t *dlup; - mblk_t *llmp; - int addr_len; - ill_t *ipsqill = NULL; - - if (ifx_arp_ioctl) { - /* - * There's no need to lookup the ill, since - * we've already done that when we started - * processing the ioctl and sent the message - * to ARP on that ill. So use the ill that - * is stored in q->q_ptr. 
- */ - ipsqill = ill; - ire = ire_ctable_lookup(addr, 0, IRE_CACHE, - ipsqill->ill_ipif, ALL_ZONES, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - } else { - ire = ire_ctable_lookup(addr, 0, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) - ipsqill = ire_to_ill(ire); - } - - if ((x_arp_ioctl) && (ipsqill != NULL)) - storage += ill_xarp_info(&xar->xarp_ha, ipsqill); - - if (ire != NULL) { - /* - * Since the ire obtained from cachetable is used for - * mac addr copying below, treat an incomplete ire as if - * as if we never found it. - */ - if (ire->ire_nce != NULL && - ire->ire_nce->nce_state != ND_REACHABLE) { - ire_refrele(ire); - ire = NULL; - ipsqill = NULL; - goto errack; - } - *flagsp = ATF_INUSE; - llmp = (ire->ire_nce != NULL ? - ire->ire_nce->nce_res_mp : NULL); - if (llmp != NULL && ipsqill != NULL) { - uchar_t *macaddr; - - addr_len = ipsqill->ill_phys_addr_length; - if (x_arp_ioctl && ((addr_len + - ipsqill->ill_name_length) > - sizeof (xar->xarp_ha.sdl_data))) { - ire_refrele(ire); - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, ipsq); - return; - } - *flagsp |= ATF_COM; - dlup = (dl_unitdata_req_t *)llmp->b_rptr; - if (ipsqill->ill_sap_length < 0) - macaddr = llmp->b_rptr + - dlup->dl_dest_addr_offset; - else - macaddr = llmp->b_rptr + - dlup->dl_dest_addr_offset + - ipsqill->ill_sap_length; - /* - * For SIOCGARP, MAC address length - * validation has already been done - * before the ioctl was issued to ARP to - * allow it to progress only on 6 byte - * addressable (ethernet like) media. Thus - * the mac address copying can not overwrite - * the sa_data area below. - */ - bcopy(macaddr, storage, addr_len); - } - /* Ditch the internal IOCTL. 
*/ - freemsg(mp); - ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); - return; - } - } - - /* - * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE - * on the IPMP meta-interface, ensure any ARP entries added in - * ip_sioctl_arp() are deleted. - */ - if (IS_IPMP(ill) && - ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || - ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { - ipmp_illgrp_t *illg = ill->ill_grp; - ipmp_arpent_t *entp; - - if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) - ipmp_illgrp_destroy_arpent(illg, entp); - } - - /* - * Delete the coresponding IRE_CACHE if any. - * Reset the error if there was one (in case there was no entry - * in arp.) - */ - if (iocp->ioc_cmd == AR_ENTRY_DELETE) { - ipif_t *ipintf = NULL; - - if (ifx_arp_ioctl) { - /* - * There's no need to lookup the ill, since - * we've already done that when we started - * processing the ioctl and sent the message - * to ARP on that ill. So use the ill that - * is stored in q->q_ptr. - */ - ipintf = ill->ill_ipif; - } - if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { - /* - * The address in "addr" may be an entry for a - * router. If that's true, then any off-net - * IRE_CACHE entries that go through the router - * with address "addr" must be clobbered. Use - * ire_walk to achieve this goal. - */ - if (ifx_arp_ioctl) - ire_walk_ill_v4(MATCH_IRE_ILL, 0, - ire_delete_cache_gw, (char *)&addr, ill); - else - ire_walk_v4(ire_delete_cache_gw, (char *)&addr, - ALL_ZONES, ipst); - iocp->ioc_error = 0; - } - } -errack: - if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { - err = iocp->ioc_error; - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); - return; - } - - /* - * Completion of an SIOCG{X}ARP. Translate the information from - * the area_t into the struct {x}arpreq. 
+ * We're done if this is not an SIOCG{X}ARP */ if (x_arp_ioctl) { storage += ill_xarp_info(&xar->xarp_ha, ill); if ((ill->ill_phys_addr_length + ill->ill_name_length) > sizeof (xar->xarp_ha.sdl_data)) { - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - ipsq); + iocp->ioc_error = EINVAL; return; } } *flagsp = ATF_INUSE; - if (area->area_flags & ACE_F_PERMANENT) - *flagsp |= ATF_PERM; - if (area->area_flags & ACE_F_PUBLISH) - *flagsp |= ATF_PUBL; - if (area->area_flags & ACE_F_AUTHORITY) + /* + * If /sbin/arp told us we are the authority using the "permanent" + * flag, or if this is one of my addresses print "permanent" + * in the /sbin/arp output. + */ + if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) *flagsp |= ATF_AUTHORITY; - if (area->area_hw_addr_length != 0) { + if (flags & NCE_F_NONUD) + *flagsp |= ATF_PERM; /* not subject to aging */ + if (flags & NCE_F_PUBLISH) + *flagsp |= ATF_PUBL; + if (hwaddr != NULL) { *flagsp |= ATF_COM; - /* - * For SIOCGARP, MAC address length validation has - * already been done before the ioctl was issued to ARP - * to allow it to progress only on 6 byte addressable - * (ethernet like) media. Thus the mac address copying - * can not overwrite the sa_data area below. - */ - bcopy((char *)area + area->area_hw_addr_offset, - storage, area->area_hw_addr_length); + bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); } - - /* Ditch the internal IOCTL. */ - freemsg(mp); - /* Complete the original. 
*/ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10552,7 +9044,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, name = lifr->lifr_name; ASSERT(CONN_Q(q)); connp = Q_TO_CONN(q); - isv6 = connp->conn_af_isv6; + isv6 = (connp->conn_family == AF_INET6); zoneid = connp->conn_zoneid; namelen = mi_strlen(name); if (namelen == 0) @@ -10567,7 +9059,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, * for the last 4 args to ipif_lookup_name. */ ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, - &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); + &exists, isv6, zoneid, ipst); /* Prevent any further action */ if (ipif == NULL) { return (ENOBUFS); @@ -10605,12 +9097,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, break; } } - ill = ill_lookup_on_name(name, B_FALSE, isv6, - CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); + ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); if (found_sep) *cp = IPIF_SEPARATOR_CHAR; if (ill == NULL) - return (err); + return (ENXIO); } ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, @@ -10687,7 +9178,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(q->q_next == NULL); ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", - ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + ill->ill_name, ipif->ipif_id, (void *)ipif)); ASSERT(IAM_WRITER_IPIF(ipif)); connp = Q_TO_CONN(q); @@ -10703,7 +9194,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * same as any other interface (meaning it skips the code directly * below). 
*/ - if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { + if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { if (sin->sin_family == AF_UNSPEC && (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { /* @@ -10802,7 +9293,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); ipif_free_tail(ipif); /* frees ipif */ return (0); } @@ -10833,7 +9324,7 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", ill->ill_name, ipif->ipif_id, (void *)ipif)); - if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { + if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { ASSERT(ill->ill_state_flags & ILL_CONDEMNED); ill_delete_tail(ill); mi_free(ill); @@ -10841,10 +9332,9 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, } ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); ipif_free_tail(ipif); - ILL_UNMARK_CHANGING(ill); return (0); } @@ -10930,8 +9420,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * we have net and subnet bcast ire's for * the old address if we need them. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); /* * If the interface is already marked up, * we call ipif_down which will take care @@ -10941,7 +9429,7 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = 1; } @@ -10988,11 +9476,6 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ov6addr = ipif->ipif_v6lcl_addr; ipif->ipif_v6lcl_addr = v6addr; sctp_update_ipif_addr(ipif, ov6addr); - if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { - ipif->ipif_v6src_addr = ipv6_all_zeros; - } else { - ipif->ipif_v6src_addr = v6addr; - } ipif->ipif_addr_ready = 0; /* @@ -11050,12 +9533,22 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * ip_rput_dlpi when we see the DL_BIND_ACK. */ err = ipif_up(ipif, q, mp); + } else { + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); } if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_resolver_down(ipif); + + if (need_arp_down && !ill->ill_isv6) + (void) ipif_arp_down(ipif); + + /* + * The default multicast interface might have changed (for + * instance if the IPv6 scope of the address changed) + */ + ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); return (err); } @@ -11072,7 +9565,7 @@ ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); ASSERT(IAM_WRITER_IPIF(ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); } @@ -11162,7 +9655,7 @@ ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } /* @@ -11254,8 +9747,8 @@ 
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_resolver_down(ipif); + if (need_arp_down && !ipif->ipif_isv6) + (void) ipif_arp_down(ipif); return (err); } @@ -11271,7 +9764,7 @@ ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); } @@ -11333,7 +9826,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; - boolean_t zero_source = B_FALSE; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11345,7 +9837,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11425,10 +9917,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on + * Only allow IFF_TEMPORARY flag to be set on * IPv6 interfaces. */ - if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) + if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) return (EINVAL); /* @@ -11444,9 +9936,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) return (EINVAL); - if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) - zero_source = B_TRUE; - /* * For IPv6 ipif_id 0, don't allow the interface to be up without * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 
@@ -11454,7 +9943,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * set later on in this function. */ if (ipif->ipif_id == 0 && ipif->ipif_isv6 && - (flags & IFF_UP) && !zero_source && + (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { if (ipif_cant_setlinklocal(ipif)) return (EINVAL); @@ -11560,13 +10049,15 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_ipif, RTSQ_DEFAULT); } } + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ill->ill_ipst, + ill->ill_isv6); + return (0); - } else if (set_linklocal || zero_source) { + } else if (set_linklocal) { mutex_enter(&ill->ill_lock); if (set_linklocal) ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; - if (zero_source) - ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; mutex_exit(&ill->ill_lock); } @@ -11610,13 +10101,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| IPIF_NOFAILOVER)) { /* - * Taking this ipif down, make sure we have - * valid net and subnet bcast ire's for other - * logical interfaces, if we need them. + * ipif_down() will ire_delete bcast ire's for the subnet, + * while the ire_identical_ref tracks the case of IRE_BROADCAST + * entries shared between multiple ipifs on the same subnet. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - if (((ipif->ipif_flags | turn_on) & IPIF_UP) && !(turn_off & IPIF_UP)) { if (ipif->ipif_flags & IPIF_UP) @@ -11627,7 +10115,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ipif_down returns %d err ", err)); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } return (ip_sioctl_flags_tail(ipif, flags, q, mp)); } @@ -11642,7 +10130,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; - boolean_t zero_source = B_FALSE; ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); @@ -11680,21 +10167,13 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) set_linklocal = B_TRUE; ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; } - if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { - zero_source = B_TRUE; - ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; - } + mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); if (set_linklocal) (void) ipif_setlinklocal(ipif); - if (zero_source) - ipif->ipif_v6src_addr = ipv6_all_zeros; - else - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; - /* * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to * the kernel: if any of them has been set by userland, the interface @@ -11744,6 +10223,9 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); } + + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); return (err); } @@ -11762,7 +10244,7 @@ ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); if (ipip->ipi_cmd_type 
== IF_CMD) { /* cast to uint16_t prevents unwanted sign extension */ flags = (uint16_t)ifr->ifr_flags; @@ -11814,6 +10296,10 @@ ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (0); } +/* + * We allow the MTU to be set on an ILL, but not have it be different + * for different IPIFs since we don't actually send packets on IPIFs. + */ /* ARGSUSED */ int ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -11823,8 +10309,7 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, int ip_min_mtu; struct ifreq *ifr; struct lifreq *lifr; - ire_t *ire; - ip_stack_t *ipst; + ill_t *ill; ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11835,48 +10320,35 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lifr = (struct lifreq *)if_req; mtu = lifr->lifr_mtu; } + /* Only allow for logical unit zero i.e. not on "bge0:17" */ + if (ipif->ipif_id != 0) + return (EINVAL); + ill = ipif->ipif_ill; if (ipif->ipif_isv6) ip_min_mtu = IPV6_MIN_MTU; else ip_min_mtu = IP_MIN_MTU; - if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) + mutex_enter(&ill->ill_lock); + if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { + mutex_exit(&ill->ill_lock); return (EINVAL); + } + /* + * The dce and fragmentation code can handle changes to ill_mtu + * concurrent with sending/fragmenting packets. + */ + ill->ill_mtu = mtu; + ill->ill_flags |= ILLF_FIXEDMTU; + mutex_exit(&ill->ill_lock); /* - * Change the MTU size in all relevant ire's. - * Mtu change Vs. new ire creation - protocol below. - * First change ipif_mtu and the ire_max_frag of the - * interface ire. Then do an ire walk and change the - * ire_max_frag of all affected ires. During ire_add - * under the bucket lock, set the ire_max_frag of the - * new ire being created from the ipif/ire from which - * it is being derived. If an mtu change happens after - * the ire is added, the new ire will be cleaned up. 
- * Conversely if the mtu change happens before the ire - * is added, ire_add will see the new value of the mtu. + * Make sure all dce_generation checks find out + * that ill_mtu has changed. */ - ipif->ipif_mtu = mtu; - ipif->ipif_flags |= IPIF_FIXEDMTU; + dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); - if (ipif->ipif_isv6) - ire = ipif_to_ire_v6(ipif); - else - ire = ipif_to_ire(ipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - ipst = ipif->ipif_ill->ill_ipst; - if (ipif->ipif_flags & IPIF_UP) { - if (ipif->ipif_isv6) - ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, - ipst); - else - ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, - ipst); - } /* Update the MTU in SCTP's list */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); return (0); @@ -11893,12 +10365,17 @@ ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * We allow a get on any logical interface even though the set + * can only be done on logical unit 0. + */ if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - ifr->ifr_metric = ipif->ipif_mtu; + ifr->ifr_metric = ipif->ipif_ill->ill_mtu; } else { lifr = (struct lifreq *)if_req; - lifr->lifr_mtu = ipif->ipif_mtu; + lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; } return (0); } @@ -11911,9 +10388,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ipaddr_t addr; ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; - ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, + ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); @@ -11931,12 +10409,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * If we are already up, make sure the new * broadcast address makes sense. 
If it does, * there should be an IRE for it already. - * Don't match on ipif, only on the ill - * since we are sharing these now. */ - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, - ipif, ALL_ZONES, NULL, - (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); + ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, + ill, ipif->ipif_zoneid, NULL, + (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); if (ire == NULL) { return (EINVAL); } else { @@ -11944,13 +10420,13 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } /* - * Changing the broadcast addr for this ipif. - * Make sure we have valid net and subnet bcast - * ire's for other logical interfaces, if needed. + * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST + * needs to already exist we never need to change the set of + * IRE_BROADCASTs when we are UP. */ if (addr != ipif->ipif_brd_addr) - ipif_check_bcast_ires(ipif); - IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); + IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); + return (0); } @@ -12026,13 +10502,10 @@ ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * Make sure we have valid net and subnet broadcast ire's * for the old netmask, if needed by other logical interfaces. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); err = ip_sioctl_netmask_tail(ipif, sin, q, mp); return (err); } @@ -12087,7 +10560,7 @@ ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); } @@ -12188,6 +10661,7 @@ int ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { + int arp_muxid; ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -12197,14 +10671,15 @@ ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr = (struct ifreq *)if_req; - ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; - ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; + ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; + arp_muxid = ifr->ifr_arp_muxid; } else { struct lifreq *lifr = (struct lifreq *)if_req; - ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; - ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; + ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; + arp_muxid = lifr->lifr_arp_muxid; } + arl_set_muxid(ipif->ipif_ill, arp_muxid); return (0); } @@ -12213,22 +10688,24 @@ int ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { + int arp_muxid = 0; ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); /* * Get the muxid saved in ill for I_PUNLINK. 
*/ + arp_muxid = arl_get_muxid(ipif->ipif_ill); if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr = (struct ifreq *)if_req; - ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; - ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; + ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; + ifr->ifr_arp_muxid = arp_muxid; } else { struct lifreq *lifr = (struct lifreq *)if_req; - lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; - lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; + lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; + lifr->lifr_arp_muxid = arp_muxid; } return (0); } @@ -12298,7 +10775,7 @@ ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } @@ -12353,7 +10830,7 @@ ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); addrlen = lifr->lifr_addrlen; if (ipif->ipif_isv6) { @@ -12454,7 +10931,7 @@ ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); @@ -12538,24 +11015,6 @@ ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Set (hardware) link specific information that might override * what was acquired through the DL_INFO_ACK. - * The logic is as follows. - * - * become exclusive - * set CHANGING flag - * change mtu on affected IREs - * clear CHANGING flag - * - * An ire add that occurs before the CHANGING flag is set will have its mtu - * changed by the ip_sioctl_lnkinfo. 
- * - * During the time the CHANGING flag is set, no new ires will be added to the - * bucket, and ire add will fail (due the CHANGING flag). - * - * An ire add that occurs after the CHANGING flag is set will have the right mtu - * before it is added to the bucket. - * - * Obviously only 1 thread can set the CHANGING flag and we need to become - * exclusive to set the flag. */ /* ARGSUSED */ int @@ -12563,19 +11022,16 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipi, void *if_req) { ill_t *ill = ipif->ipif_ill; - ipif_t *nipif; int ip_min_mtu; - boolean_t mtu_walk = B_FALSE; struct lifreq *lifr = (struct lifreq *)if_req; lif_ifinfo_req_t *lir; - ire_t *ire; ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); lir = &lifr->lifr_ifinfo; ASSERT(IAM_WRITER_IPIF(ipif)); - /* Only allow for logical unit zero i.e. not on "le0:17" */ + /* Only allow for logical unit zero i.e. not on "bge0:17" */ if (ipif->ipif_id != 0) return (EINVAL); @@ -12588,9 +11044,20 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Verify values before we set anything. Allow zero to * mean unspecified. + * + * XXX We should be able to set the user-defined lir_mtu to some value + * that is greater than ill_current_frag but less than ill_max_frag- the + * ill_max_frag value tells us the max MTU that can be handled by the + * datalink, whereas the ill_current_frag is dynamically computed for + * some link-types like tunnels, based on the tunnel PMTU. However, + * since there is currently no way of distinguishing between + * administratively fixed link mtu values (e.g., those set via + * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered + * for tunnels) we conservatively choose the ill_current_frag as the + * upper-bound. 
*/ if (lir->lir_maxmtu != 0 && - (lir->lir_maxmtu > ill->ill_max_frag || + (lir->lir_maxmtu > ill->ill_current_frag || lir->lir_maxmtu < ip_min_mtu)) return (EINVAL); if (lir->lir_reachtime != 0 && @@ -12601,18 +11068,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EINVAL); mutex_enter(&ill->ill_lock); - ill->ill_state_flags |= ILL_CHANGING; - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - nipif->ipif_state_flags |= IPIF_CHANGING; - } - - if (lir->lir_maxmtu != 0) { - ill->ill_max_mtu = lir->lir_maxmtu; + /* + * The dce and fragmentation code can handle changes to ill_mtu + * concurrent with sending/fragmenting packets. + */ + if (lir->lir_maxmtu != 0) ill->ill_user_mtu = lir->lir_maxmtu; - mtu_walk = B_TRUE; - } - mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12621,47 +11082,29 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill->ill_reachable_retrans_time = lir->lir_reachretrans; ill->ill_max_hops = lir->lir_maxhops; - ill->ill_max_buf = ND_MAX_Q; - - if (mtu_walk) { + if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { /* - * Set the MTU on all ipifs associated with this ill except - * for those whose MTU was fixed via SIOCSLIFMTU. + * ill_mtu is the actual interface MTU, obtained as the min + * of user-configured mtu and the value announced by the + * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since + * we have already made the choice of requiring + * ill_user_mtu < ill_current_frag by the time we get here, + * the ill_mtu effectively gets assigned to the ill_user_mtu + * here. 
*/ - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - if (nipif->ipif_flags & IPIF_FIXEDMTU) - continue; - - nipif->ipif_mtu = ill->ill_max_mtu; - - if (!(nipif->ipif_flags & IPIF_UP)) - continue; - - if (nipif->ipif_isv6) - ire = ipif_to_ire_v6(nipif); - else - ire = ipif_to_ire(nipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - - ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change, - nipif, ill); - } - } - - mutex_enter(&ill->ill_lock); - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - nipif->ipif_state_flags &= ~IPIF_CHANGING; + ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); } - ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); /* + * Make sure all dce_generation checks find out + * that ill_mtu has changed. + */ + if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) + dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); + + /* * Refresh IPMP meta-interface MTU if necessary. */ if (IS_UNDER_IPMP(ill)) @@ -12687,7 +11130,7 @@ ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lir->lir_maxhops = ill->ill_max_hops; lir->lir_reachtime = ill->ill_reachable_time; lir->lir_reachretrans = ill->ill_reachable_retrans_time; - lir->lir_maxmtu = ill->ill_max_mtu; + lir->lir_maxmtu = ill->ill_mtu; return (0); } @@ -12722,7 +11165,7 @@ ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (!(ipif->ipif_flags & IPIF_UP)) continue; @@ -12848,29 +11291,9 @@ done: } /* - * Lookup an ipif using the sequence id (ipif_seqid) + * Assign a unique id for the ipif. This is used by sctp_addr.c + * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 
*/ -ipif_t * -ipif_lookup_seqid(ill_t *ill, uint_t seqid) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) - return (ipif); - } - return (NULL); -} - -/* - * Assign a unique id for the ipif. This is used later when we send - * IRES to ARP for resolution where we initialize ire_ipif_seqid - * to the value pointed by ire_ipif->ipif_seqid. Later when the - * IRE is added, we verify that ipif has not disappeared. - */ - static void ipif_assign_seqid(ipif_t *ipif) { @@ -12893,41 +11316,21 @@ ipif_clone(const ipif_t *sipif, ipif_t *dipif) ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); - ASSERT(sipif->ipif_arp_del_mp == NULL); - ASSERT(dipif->ipif_arp_del_mp == NULL); - ASSERT(sipif->ipif_igmp_rpt == NULL); - ASSERT(dipif->ipif_igmp_rpt == NULL); - ASSERT(sipif->ipif_multicast_up == 0); - ASSERT(dipif->ipif_multicast_up == 0); - ASSERT(sipif->ipif_joined_allhosts == 0); - ASSERT(dipif->ipif_joined_allhosts == 0); - - dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; dipif->ipif_metric = sipif->ipif_metric; dipif->ipif_zoneid = sipif->ipif_zoneid; dipif->ipif_v6subnet = sipif->ipif_v6subnet; dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; - dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; /* - * While dipif is down right now, it might've been up before. Since - * it's changing identity, its packet counters need to be reset. 
- */ - dipif->ipif_ib_pkt_count = 0; - dipif->ipif_ob_pkt_count = 0; - dipif->ipif_fo_pkt_count = 0; - - /* * As per the comment atop the function, we assume that these sipif * fields will be changed before sipif is unlocked. */ dipif->ipif_seqid = sipif->ipif_seqid; - dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; - dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; dipif->ipif_state_flags = sipif->ipif_state_flags; } @@ -12951,13 +11354,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) * Grab all of the locks that protect the ipif in a defined order. */ GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); - if (sipif > dipif) { - mutex_enter(&sipif->ipif_saved_ire_lock); - mutex_enter(&dipif->ipif_saved_ire_lock); - } else { - mutex_enter(&dipif->ipif_saved_ire_lock); - mutex_enter(&sipif->ipif_saved_ire_lock); - } ipif_clone(sipif, dipif); if (virgipif != NULL) { @@ -12965,8 +11361,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) mi_free(virgipif); } - mutex_exit(&sipif->ipif_saved_ire_lock); - mutex_exit(&dipif->ipif_saved_ire_lock); RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); /* @@ -13115,10 +11509,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, */ ipif->ipif_zoneid = ill->ill_zoneid; - mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); - ipif->ipif_refcnt = 0; - ipif->ipif_saved_ire_cnt = 0; if (insert) { if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { @@ -13171,8 +11562,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6lcl_addr); IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6subnet); IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6net_mask); @@ -13189,8 +11578,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, if (!initialize) goto out; - ipif->ipif_mtu = ill->ill_max_mtu; - /* * NOTE: 
The IPMP meta-interface is special-cased because it starts * with no underlying interfaces (and thus an unknown broadcast @@ -13236,207 +11623,47 @@ out: } /* - * If appropriate, send a message up to the resolver delete the entry - * for the address of this interface which is going out of business. - * (Always called as writer). - * - * NOTE : We need to check for NULL mps as some of the fields are - * initialized only for some interface types. See ipif_resolver_up() - * for details. + * Remove the neighbor cache entries associated with this logical + * interface. */ -void -ipif_resolver_down(ipif_t *ipif) +int +ipif_arp_down(ipif_t *ipif) { - mblk_t *mp; ill_t *ill = ipif->ipif_ill; + int err = 0; - ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) - return; - - /* Delete the mapping for the local address */ - mp = ipif->ipif_arp_del_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ipif->ipif_arp_del_mp = NULL; - } - - /* - * Make IPMP aware of the deleted data address. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", + ill_t *, ill, ipif_t *, ipif); + ipif_nce_down(ipif); /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ - if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && + !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { /* * If this was the last ipif on an IPMP interface, purge any - * IPMP ARP entries associated with it. + * static ARP entries associated with it. 
*/ if (IS_IPMP(ill)) ipmp_illgrp_refresh_arpent(ill->ill_grp); - /* Send up AR_INTERFACE_DOWN message */ - mp = ill->ill_arp_down_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, - ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_down_mp = NULL; - } - - /* Tell ARP to delete the multicast mappings */ - mp = ill->ill_arp_del_mapping_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, - ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_del_mapping_mp = NULL; - } + /* UNBIND, DETACH */ + err = arp_ll_down(ill); } -} - -/* - * Set up the multicast mappings for `ipif' in ARP. If `arp_add_mapping_mp' - * is non-NULL, then upon success it will contain an mblk that can be passed - * to ARP to create the mapping. Otherwise, if it's NULL, upon success ARP - * will have already been notified to create the mapping. Returns zero on - * success, -1 upon failure. - */ -int -ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) -{ - mblk_t *del_mp = NULL; - mblk_t *add_mp = NULL; - mblk_t *mp; - ill_t *ill = ipif->ipif_ill; - phyint_t *phyi = ill->ill_phyint; - ipaddr_t addr, mask, extract_mask = 0; - arma_t *arma; - uint8_t *maddr, *bphys_addr; - uint32_t hw_start; - dl_unitdata_req_t *dlur; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ipif->ipif_flags & IPIF_POINTOPOINT) - return (0); - - /* - * IPMP meta-interfaces don't have any inherent multicast mappings, - * and instead use the ones on the underlying interfaces. - */ - if (IS_IPMP(ill)) - return (0); - - /* - * Delete the existing mapping from ARP. Normally, ipif_down() -> - * ipif_resolver_down() will send this up to ARP, but it may be that - * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer(). 
- */ - mp = ill->ill_arp_del_mapping_mp; - if (mp != NULL) { - ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_del_mapping_mp = NULL; - } - - if (arp_add_mapping_mp != NULL) - *arp_add_mapping_mp = NULL; - - /* - * Check that the address is not to long for the constant - * length reserved in the template arma_t. - */ - if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) - return (-1); - - /* Add mapping mblk */ - addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); - mask = (ipaddr_t)htonl(IN_CLASSD_NET); - add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, - (caddr_t)&addr); - if (add_mp == NULL) - return (-1); - arma = (arma_t *)add_mp->b_rptr; - maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; - bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); - arma->arma_hw_addr_length = ill->ill_phys_addr_length; - /* - * Determine the broadcast address. - */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + ill->ill_sap_length; - /* - * Check PHYI_MULTI_BCAST and length of physical - * address to determine if we use the mapping or the - * broadcast address. - */ - if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) - if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, - bphys_addr, maddr, &hw_start, &extract_mask)) - phyi->phyint_flags |= PHYI_MULTI_BCAST; - - if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || - (ill->ill_flags & ILLF_MULTICAST)) { - /* Make sure this will not match the "exact" entry. 
*/ - addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); - del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, - (caddr_t)&addr); - if (del_mp == NULL) { - freemsg(add_mp); - return (-1); - } - bcopy(&extract_mask, (char *)arma + - arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); - if (phyi->phyint_flags & PHYI_MULTI_BCAST) { - /* Use link-layer broadcast address for MULTI_BCAST */ - bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); - ip2dbg(("ipif_arp_setup_multicast: adding" - " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); - } else { - arma->arma_hw_mapping_start = hw_start; - ip2dbg(("ipif_arp_setup_multicast: adding multicast" - " ARP setup for %s\n", ill->ill_name)); - } - } else { - freemsg(add_mp); - ASSERT(del_mp == NULL); - /* It is neither MULTICAST nor MULTI_BCAST */ - return (0); - } - ASSERT(add_mp != NULL && del_mp != NULL); - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = del_mp; - if (arp_add_mapping_mp != NULL) { - /* The caller just wants the mblks allocated */ - *arp_add_mapping_mp = add_mp; - } else { - /* The caller wants us to send it to arp */ - putnext(ill->ill_rq, add_mp); - } - return (0); + return (err); } /* * Get the resolver set up for a new IP address. (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, though it only sets up the - * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * Called both for IPv4 and IPv6 interfaces, though it only does some + * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 
* * The enumerated value res_act tunes the behavior: * * Res_act_initial: set up all the resolver structures for a new @@ -13451,17 +11678,9 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - mblk_t *arp_up_mp = NULL; - mblk_t *arp_down_mp = NULL; - mblk_t *arp_add_mp = NULL; - mblk_t *arp_del_mp = NULL; - mblk_t *arp_add_mapping_mp = NULL; - mblk_t *arp_del_mapping_mp = NULL; - ill_t *ill = ipif->ipif_ill; - int err = ENOMEM; - boolean_t added_ipif = B_FALSE; - boolean_t publish; - boolean_t was_dup; + ill_t *ill = ipif->ipif_ill; + int err; + boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); @@ -13490,231 +11709,55 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) return (0); } /* NDP will set the ipif_addr_ready flag when it's ready */ - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + if (ill->ill_isv6) return (0); - if (ill->ill_isv6) { - /* - * External resolver for IPv6 - */ - ASSERT(res_act == Res_act_initial); - publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); - } else { - /* - * IPv4 arp case. If the ARP stream has already started - * closing, fail this request for ARP bringup. Else - * record the fact that an ARP bringup is pending. - */ - mutex_enter(&ill->ill_lock); - if (ill->ill_arp_closing) { - mutex_exit(&ill->ill_lock); - err = EINVAL; - goto failed; - } else { - if (ill->ill_ipif_up_count == 0 && - ill->ill_ipif_dup_count == 0 && !was_dup) - ill->ill_arp_bringup_pending = 1; - mutex_exit(&ill->ill_lock); - } - publish = (ipif->ipif_lcl_addr != INADDR_ANY); - } - - if (IS_IPMP(ill) && publish) { - /* - * If we're here via ipif_up(), then the ipif won't be bound - * yet -- add it to the group, which will bind it if possible. - * (We would add it in ipif_up(), but deleting on failure - * there is gruesome.) 
If we're here via ipmp_ill_bind_ipif(), - * then the ipif has already been added to the group and we - * just need to use the binding. - */ - if (ipmp_ipif_bound_ill(ipif) == NULL) { - if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { - /* - * We couldn't bind the ipif to an ill yet, - * so we have nothing to publish. - */ - publish = B_FALSE; - } - added_ipif = B_TRUE; - } - } - - /* - * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and it is suitable for publishing. - */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { - if (res_act == Res_act_defend) { - arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); - if (arp_add_mp == NULL) - goto failed; - /* - * If we're just defending our address now, then - * there's no need to set up ARP multicast mappings. - * The publish command is enough. - */ - goto done; - } - - /* - * Allocate an ARP add message and an ARP delete message (the - * latter is saved for use when the address goes down). - */ - if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) - goto failed; - - if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) - goto failed; - - if (res_act != Res_act_initial) - goto arp_setup_multicast; - } else { - if (res_act != Res_act_initial) - goto done; - } - /* - * Need to bring up ARP or setup multicast mapping only - * when the first interface is coming UP. - */ - if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) - goto done; - - /* - * Allocate an ARP down message (to be saved) and an ARP up message. - */ - arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); - if (arp_down_mp == NULL) - goto failed; - - arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); - if (arp_up_mp == NULL) - goto failed; - - if (ipif->ipif_flags & IPIF_POINTOPOINT) - goto done; - -arp_setup_multicast: - /* - * Setup the multicast mappings. This function initializes - * ill_arp_del_mapping_mp also. 
This does not need to be done for - * IPv6, or for the IPMP interface (since it has no link-layer). - */ - if (!ill->ill_isv6 && !IS_IPMP(ill)) { - err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); - if (err != 0) - goto failed; - ASSERT(ill->ill_arp_del_mapping_mp != NULL); - ASSERT(arp_add_mapping_mp != NULL); - } -done: - if (arp_up_mp != NULL) { - ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, arp_up_mp); - arp_up_mp = NULL; - } - if (arp_add_mp != NULL) { - ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - /* - * If it's an extended ARP implementation, then we'll wait to - * hear that DAD has finished before using the interface. - */ - if (!ill->ill_arp_extend) - ipif->ipif_addr_ready = 1; - putnext(ill->ill_rq, arp_add_mp); - arp_add_mp = NULL; - } else { - ipif->ipif_addr_ready = 1; - } - if (arp_add_mapping_mp != NULL) { - ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, arp_add_mapping_mp); - arp_add_mapping_mp = NULL; - } - - if (res_act == Res_act_initial) { - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", - err)); - goto failed; - } - } - - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } - - return ((ill->ill_ipif_up_count != 0 || was_dup || - ill->ill_ipif_dup_count != 0) ? 
0 : EINPROGRESS); -failed: - ip1dbg(("ipif_resolver_up: FAILED\n")); - if (added_ipif) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); - freemsg(arp_add_mp); - freemsg(arp_del_mp); - freemsg(arp_add_mapping_mp); - freemsg(arp_up_mp); - freemsg(arp_down_mp); - ill->ill_arp_bringup_pending = 0; + err = ipif_arp_up(ipif, res_act, was_dup); return (err); } /* - * This routine restarts IPv4 duplicate address detection (DAD) when a link has - * just gone back up. + * This routine restarts IPv4/IPv6 duplicate address detection (DAD) + * when a link has just gone back up. */ static void -ipif_arp_start_dad(ipif_t *ipif) +ipif_nce_start_dad(ipif_t *ipif) { + ncec_t *ncec; ill_t *ill = ipif->ipif_ill; - mblk_t *arp_add_mp; + boolean_t isv6 = ill->ill_isv6; - /* ACE_F_UNVERIFIED restarts DAD */ - if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || - (ipif->ipif_flags & IPIF_UNNUMBERED) || - ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { - /* - * If we can't contact ARP for some reason, that's not really a - * problem. Just send out the routing socket notification that - * DAD completion would have done, and continue. - */ - ipif_mask_reply(ipif); - ipif_up_notify(ipif); - ipif->ipif_addr_ready = 1; - return; - } + if (isv6) { + ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, + &ipif->ipif_v6lcl_addr); + } else { + ipaddr_t v4addr; - putnext(ill->ill_rq, arp_add_mp); -} + if (ill->ill_net_type != IRE_IF_RESOLVER || + (ipif->ipif_flags & IPIF_UNNUMBERED) || + ipif->ipif_lcl_addr == INADDR_ANY) { + /* + * If we can't contact ARP for some reason, + * that's not really a problem. Just send + * out the routing socket notification that + * DAD completion would have done, and continue. 
+ */ + ipif_mask_reply(ipif); + ipif_up_notify(ipif); + ipif->ipif_addr_ready = 1; + return; + } -static void -ipif_ndp_start_dad(ipif_t *ipif) -{ - nce_t *nce; + IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); + ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); + } - nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce == NULL) + if (ncec == NULL) { + ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", + (void *)ipif)); return; - - if (!ndp_restart_dad(nce)) { + } + if (!nce_restart_dad(ncec)) { /* * If we can't restart DAD for some reason, that's not really a * problem. Just send out the routing socket notification that @@ -13723,7 +11766,7 @@ ipif_ndp_start_dad(ipif_t *ipif) ipif_up_notify(ipif); ipif->ipif_addr_ready = 1; } - NCE_REFRELE(nce); + ncec_refrele(ncec); } /* @@ -13749,30 +11792,21 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If layer two doesn't support duplicate address detection, then just * send the routing socket message now and be done with it. */ - if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || - (!ill->ill_isv6 && !ill->ill_arp_extend)) { + if (!ill->ill_isv6 && arp_no_defense) { ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (went_up) { + if (ipif->ipif_flags & IPIF_UP) { - if (ill->ill_isv6) - ipif_ndp_start_dad(ipif); - else - ipif_arp_start_dad(ipif); - } else if (ill->ill_isv6 && - (ipif->ipif_flags & IPIF_DUPLICATE)) { + ipif_nce_start_dad(ipif); + } else if (ipif->ipif_flags & IPIF_DUPLICATE) { /* - * For IPv4, the ARP module itself will - * automatically start the DAD process when it - * sees DL_NOTE_LINK_UP. We respond to the - * AR_CN_READY at the completion of that task. - * For IPv6, we must kick off the bring-up - * process now. + * kick off the bring-up process now. 
*/ - ndp_do_recovery(ipif); + ipif_do_recovery(ipif); } else { /* * Unfortunately, the first ipif is "special" @@ -13822,7 +11856,7 @@ ipsq_delete(ipsq_t *ipsq) static int ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { - int err; + int err = 0; ipif_t *ipif; if (ill == NULL) @@ -13841,9 +11875,6 @@ ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) } } } - mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill->ill_lock); ill->ill_up_ipifs = B_FALSE; return (0); } @@ -13859,6 +11890,15 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) ASSERT(IAM_WRITER_ILL(ill)); + if (ill->ill_replumbing) { + ill->ill_replumbing = 0; + /* + * Send down REPLUMB_DONE notification followed by the + * BIND_REQ on the arp stream. + */ + if (!ill->ill_isv6) + arp_send_replumb_conf(ill); + } err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); if (err != 0) return (err); @@ -13887,16 +11927,10 @@ ill_down_ipifs(ill_t *ill, boolean_t logical) if (ipif->ipif_flags & IPIF_UP) ipif->ipif_was_up = B_TRUE; - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. - */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); if (logical) { (void) ipif_logical_down(ipif, NULL, NULL); ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } else { (void) ipif_down(ipif, NULL, NULL); } @@ -13904,29 +11938,18 @@ ill_down_ipifs(ill_t *ill, boolean_t logical) } /* - * Redo source address selection. This is called when a - * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. + * Redo source address selection. This makes IXAF_VERIFY_SOURCE take + * a look again at valid source addresses. + * This should be called each time after the set of source addresses has been + * changed. 
*/ void -ill_update_source_selection(ill_t *ill) +ip_update_source_selection(ip_stack_t *ipst) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * Underlying interfaces are only used for test traffic and thus - * should always send with their (deprecated) source addresses. - */ - if (IS_UNDER_IPMP(ill)) - return; - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } + /* We skip past SRC_GENERATION_VERIFY */ + if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == + SRC_GENERATION_VERIFY) + atomic_add_32(&ipst->ips_src_generation, 1); } /* @@ -14154,6 +12177,8 @@ ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, static void ill_dl_down(ill_t *ill) { + DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill); + /* * The ill is down; unbind but stay attached since we're still * associated with a PPA. If we have negotiated DLPI capabilites @@ -14167,6 +12192,13 @@ ill_dl_down(ill_t *ill) ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); + if (!ill->ill_replumbing) { + /* Free all ilms for this ill */ + update_conn_ill(ill, ill->ill_ipst); + } else { + ill_leave_multicast(ill); + } + ill->ill_unbind_mp = NULL; if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", @@ -14191,23 +12223,13 @@ ill_dl_down(ill_t *ill) ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); } - - /* - * Toss all of our multicast memberships. We could keep them, but - * then we'd have to do bookkeeping of any joins and leaves performed - * by the application while the the interface is down (we can't just - * issue them because arp cannot currently process AR_ENTRY_SQUERY's - * on a downed interface). 
- */ - ill_leave_multicast(ill); - mutex_enter(&ill->ill_lock); ill->ill_dl_up = 0; ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); mutex_exit(&ill->ill_lock); } -static void +void ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) { union DL_primitives *dlp; @@ -14249,6 +12271,8 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) } mutex_exit(&ill->ill_lock); + DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", + char *, dl_primstr(prim), ill_t *, ill); putnext(ill->ill_wq, mp); /* @@ -14301,8 +12325,9 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp) while (*mpp != NULL) mpp = &((*mpp)->b_next); - ip1dbg(("ill_dlpi_send: deferring request for %s\n", - ill->ill_name)); + ip1dbg(("ill_dlpi_send: deferring request for %s " + "while %s pending\n", ill->ill_name, + dl_primstr(ill->ill_dlpi_pending))); *mpp = mp; mutex_exit(&ill->ill_lock); @@ -14437,51 +12462,237 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) ill_dlpi_dispatch(ill, mp); } +/* + * Queue a (multicast) DLPI control message to be sent to the driver by + * later calling ill_dlpi_send_queued. + * We queue them while holding a lock (ill_mcast_lock) to ensure that they + * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ + * for the same group to race. + * We send DLPI control messages in order using ill_lock. + * For IPMP we should be called on the cast_ill. + */ void -conn_delete_ire(conn_t *connp, caddr_t arg) +ill_dlpi_queue(ill_t *ill, mblk_t *mp) { - ipif_t *ipif = (ipif_t *)arg; - ire_t *ire; + mblk_t **mpp; - /* - * Look at the cached ires on conns which has pointers to ipifs. - * We just call ire_refrele which clears up the reference - * to ire. Called when a conn closes. Also called from ipif_free - * to cleanup indirect references to the stale ipif via the cached ire. 
- */ - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - return; + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + + mutex_enter(&ill->ill_lock); + /* Must queue message. Tail insertion */ + mpp = &ill->ill_dlpi_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&ill->ill_lock); +} + +/* + * Send the messages that were queued. Make sure there is only + * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done() + * when an ACK or a NAK is received to process the next queued message. + * For IPMP we are called on the upper ill, but when send what is queued + * on the cast_ill. + */ +void +ill_dlpi_send_queued(ill_t *ill) +{ + mblk_t *mp; + union DL_primitives *dlp; + t_uscalar_t prim; + ill_t *release_ill = NULL; + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* Avoid ever sending anything down to the ipmpstub */ + return; + } + ill = release_ill; } - mutex_exit(&connp->conn_lock); + mutex_enter(&ill->ill_lock); + while ((mp = ill->ill_dlpi_deferred) != NULL) { + if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { + /* Can't send. Somebody else will send it */ + mutex_exit(&ill->ill_lock); + goto done; + } + ill->ill_dlpi_deferred = mp->b_next; + mp->b_next = NULL; + if (!ill->ill_dl_up) { + /* + * Nobody there. All multicast addresses will be + * re-joined when we get the DL_BIND_ACK bringing the + * interface up. 
+ */ + freemsg(mp); + continue; + } + dlp = (union DL_primitives *)mp->b_rptr; + prim = dlp->dl_primitive; + + if (!(ill->ill_state_flags & ILL_CONDEMNED) || + (prim == DL_UNBIND_REQ)) { + ill->ill_dlpi_pending = prim; + } + mutex_exit(&ill->ill_lock); + DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued", + char *, dl_primstr(prim), ill_t *, ill); + putnext(ill->ill_wq, mp); + mutex_enter(&ill->ill_lock); + } + mutex_exit(&ill->ill_lock); +done: + if (release_ill != NULL) + ill_refrele(release_ill); } /* - * Some operations (e.g., ipif_down()) conditionally delete a number - * of IREs. Those IREs may have been previously cached in the conn structure. - * This ipcl_walk() walker function releases all references to such IREs based - * on the condemned flag. + * Queue an IP (IGMP/MLD) message to be sent by IP from + * ill_mcast_send_queued + * We queue them while holding a lock (ill_mcast_lock) to ensure that they + * are sent in order i.e., prevent a IGMP leave and IGMP join for the same + * group to race. + * We send them in order using ill_lock. + * For IPMP we are called on the upper ill, but we queue on the cast_ill. */ -/* ARGSUSED */ void -conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) +ill_mcast_queue(ill_t *ill, mblk_t *mp) { - ire_t *ire; + mblk_t **mpp; + ill_t *release_ill = NULL; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - return; + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. 
*/ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* Discard instead of queuing for the ipmp interface */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no cast_ill", + mp, ill); + freemsg(mp); + return; + } + ill = release_ill; } - mutex_exit(&connp->conn_lock); + + mutex_enter(&ill->ill_lock); + /* Must queue message. Tail insertion */ + mpp = &ill->ill_mcast_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&ill->ill_lock); + if (release_ill != NULL) + ill_refrele(release_ill); +} + +/* + * Send the IP packets that were queued by ill_mcast_queue. + * These are IGMP/MLD packets. + * + * For IPMP we are called on the upper ill, but when send what is queued + * on the cast_ill. + * + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. + * This will run multiple times for the same group if there are members + * on the same group for multiple ipif's on the same ill. The + * igmp_input/mld_input code will suppress this due to the loopback thus we + * always loopback membership report. + * + * We also need to make sure that this does not get load balanced + * by IPMP. We do this by passing an ill to ip_output_simple. + */ +void +ill_mcast_send_queued(ill_t *ill) +{ + mblk_t *mp; + ip_xmit_attr_t ixas; + ill_t *release_ill = NULL; + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * We should have no messages on the ipmp interface + * but no point in trying to send them. + */ + return; + } + ill = release_ill; + } + bzero(&ixas, sizeof (ixas)); + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + /* + * Here we set ixa_ifindex. 
If IPMP it will be the lower ill which + * makes ip_select_route pick the IRE_MULTICAST for the cast_ill. + * That is necessary to handle IGMP/MLD snooping switches. + */ + ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; + ixas.ixa_ipst = ill->ill_ipst; + + mutex_enter(&ill->ill_lock); + while ((mp = ill->ill_mcast_deferred) != NULL) { + ill->ill_mcast_deferred = mp->b_next; + mp->b_next = NULL; + if (!ill->ill_dl_up) { + /* + * Nobody there. Just drop the ip packets. + * IGMP/MLD will resend later, if this is a replumb. + */ + freemsg(mp); + continue; + } + mutex_enter(&ill->ill_phyint->phyint_lock); + if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { + /* + * When the ill is getting deactivated, we only want to + * send the DLPI messages, so drop IGMP/MLD packets. + * DLPI messages are handled by ill_dlpi_send_queued() + */ + mutex_exit(&ill->ill_phyint->phyint_lock); + freemsg(mp); + continue; + } + mutex_exit(&ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_lock); + + /* Check whether we are sending IPv4 or IPv6. */ + if (ill->ill_isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ixas.ixa_multicast_ttl = ip6h->ip6_hops; + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixas.ixa_multicast_ttl = ipha->ipha_ttl; + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; + } + + ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + + mutex_enter(&ill->ill_lock); + } + mutex_exit(&ill->ill_lock); + +done: + if (release_ill != NULL) + ill_refrele(release_ill); } /* @@ -14494,7 +12705,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * that both Solaris and 4.3 BSD have exhibited this behaviour for a long * time. We go thru the cleanup in order to remove these routes. * b. The bringup of the interface could fail in ill_dl_up i.e. we get - * DL_ERROR_ACK in response to the the DL_BIND request. 
The interface is + * DL_ERROR_ACK in response to the DL_BIND request. The interface is * down, but we need to cleanup i.e. do ill_dl_down and * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. * @@ -14504,12 +12715,11 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * * The following members in ipif_t track references to the ipif. * int ipif_refcnt; Active reference count - * uint_t ipif_ire_cnt; Number of ire's referencing this ipif - * uint_t ipif_ilm_cnt; Number of ilms's references this ipif. * * The following members in ill_t track references to the ill. * int ill_refcnt; active refcnt * uint_t ill_ire_cnt; Number of ires referencing ill + * uint_t ill_ncec_cnt; Number of ncecs referencing ill * uint_t ill_nce_cnt; Number of nces referencing ill * uint_t ill_ilm_cnt; Number of ilms referencing ill * @@ -14525,21 +12735,25 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * references to the ipif / ill. Pointers from other structures do not * count towards this reference count. * - * ipif_ire_cnt/ill_ire_cnt is the number of ire's - * associated with the ipif/ill. This is incremented whenever a new - * ire is created referencing the ipif/ill. This is done atomically inside - * ire_add_v[46] where the ire is actually added to the ire hash table. - * The count is decremented in ire_inactive where the ire is destroyed. + * ill_ire_cnt is the number of ire's associated with the + * ill. This is incremented whenever a new ire is created referencing the + * ill. This is done atomically inside ire_add_v[46] where the ire is + * actually added to the ire hash table. The count is decremented in + * ire_inactive where the ire is destroyed. * - * nce's reference ill's thru nce_ill and the count of nce's associated with - * an ill is recorded in ill_nce_cnt. This is incremented atomically in + * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill. 
+ * This is incremented atomically in * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the - * table. Similarly it is decremented in ndp_inactive() where the nce + * table. Similarly it is decremented in ncec_inactive() where the ncec + * is destroyed. + * + * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is + * incremented atomically in nce_add() where the nce is actually added to the + * ill_nce. Similarly it is decremented in nce_inactive() where the nce * is destroyed. * - * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's) - * is incremented in ilm_add_v6() and decremented before the ilm is freed - * in ilm_walker_cleanup() or ilm_delete(). + * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in + * ilm_add() and decremented before the ilm is freed in ilm_delete(). * * Flow of ioctls involving interface down/up * @@ -14555,50 +12769,22 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * to the above. All the *tail functions are called after the refcounts have * dropped to the appropriate values. * - * The mechanism to quiesce an ipif is as follows. - * - * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed - * on the ipif. Callers either pass a flag requesting wait or the lookup - * functions will return NULL. - * - * Delete all ires referencing this ipif + * SIOC ioctls during the IPIF_CHANGING interval. * - * Any thread attempting to do an ipif_refhold on an ipif that has been - * obtained thru a cached pointer will first make sure that - * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then - * increment the refcount. - * - * The above guarantees that the ipif refcount will eventually come down to - * zero and the ipif will quiesce, once all threads that currently hold a - * reference to the ipif refrelease the ipif. 
The ipif is quiescent after the - * ipif_refcount has dropped to zero and all ire's associated with this ipif - * have also been ire_inactive'd. i.e. when ipif_{ire, ill}_cnt and - * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK() - * in ip.h - * - * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. - * - * Threads trying to lookup an ipif or ill can pass a flag requesting - * wait and restart if the ipif / ill cannot be looked up currently. - * For eg. bind, and route operations (Eg. route add / delete) cannot return - * failure if the ipif is currently undergoing an exclusive operation, and - * hence pass the flag. The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the current exclusive operation completes. - * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The + * Threads handling SIOC set ioctls serialize on the squeue, but this + * is not done for SIOC get ioctls. Since a set ioctl can cause several + * steps of internal changes to the state, some of which are visible in + * ipif_flags (such as IFF_UP being cleared and later set), and we want + * the set ioctl to be atomic related to the get ioctls, the SIOC get code + * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then + * enqueued in the ipsq and the operation is restarted by ipsq_exit() when + * the current exclusive operation completes. The IPIF_CHANGING check + * and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. Before dropping the ill_lock we acquire * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish - * until we release the ipsq_lock, even though the the ill/ipif state flags + * until we release the ipsq_lock, even though the ill/ipif state flags * can change after we drop the ill_lock. 
- * - * An attempt to send out a packet using an ipif that is currently - * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this - * operation and restart it later when the exclusive condition on the ipif ends. - * This is an example of not passing the wait flag to the lookup functions. For - * example an attempt to refhold and use conn->conn_multicast_ipif and send - * out a multicast packet on that ipif will fail while the ipif is - * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is - * currently IPIF_CHANGING will also fail. */ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) @@ -14613,6 +12799,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_down", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_flags & IPIF_UP) { mutex_enter(&ill->ill_lock); ipif->ipif_flags &= ~IPIF_UP; @@ -14649,15 +12838,12 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } } - /* - * Delete all IRE's pointing at this ipif or its source address. - */ - if (ipif->ipif_isv6) { - ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, - ipst); - } else { - ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, - ipst); + if (ipif_was_up) { + /* only delete if we'd added ire's before */ + if (ipif->ipif_isv6) + ipif_delete_ires_v6(ipif); + else + ipif_delete_ires_v4(ipif); } if (ipif_was_up && ill->ill_ipif_up_count == 0) { @@ -14672,30 +12858,28 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Cleaning up the conn_ire_cache or conns must be done only after the - * ires have been deleted above. Otherwise a thread could end up - * caching an ire in a conn after we have finished the cleanup of the - * conn. The caching is done after making sure that the ire is not yet - * condemned. Also documented in the block comment above ip_output + * neighbor-discovery or arp entries for this interface. 
The ipif + * has to be quiesced, so we walk all the nce's and delete those + * that point at the ipif->ipif_ill. At the same time, we also + * update IPMP so that ipifs for data addresses are unbound. We dont + * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer + * that for ipif_down_tail() */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - /* Also, delete the ires cached in SCTP */ - sctp_ire_cache_flush(ipif); + ipif_nce_down(ipif); /* - * Update any other ipifs which have used "our" local address as - * a source address. This entails removing and recreating IRE_INTERFACE - * entries for such ipifs. + * If this is the last ipif on the ill, we also need to remove + * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will + * never succeed. */ - if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif); - else - ipif_update_other_ipifs(ipif); + if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) + ire_walk_ill(0, 0, ill_downi, ill, ill); /* - * neighbor-discovery or arp entries for this interface. + * Walk all CONNs that can have a reference on an ire for this + * ipif (we actually walk all that now have stale references). */ - ipif_ndp_down(ipif); + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); /* * If mp is NULL the caller will wait for the appropriate refcnt. @@ -14748,10 +12932,14 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINPROGRESS); } -void +int ipif_down_tail(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; + int err = 0; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", + ill_t *, ill, ipif_t *, ipif); /* * Skip any loopback interface (null wq). @@ -14766,15 +12954,14 @@ ipif_down_tail(ipif_t *ipif) ill->ill_dl_up) { ill_dl_down(ill); } - ill->ill_logical_down = 0; + if (!ipif->ipif_isv6) + err = ipif_arp_down(ipif); - /* - * Has to be after removing the routes in ipif_down_delete_ire. 
- */ - ipif_resolver_down(ipif); + ill->ill_logical_down = 0; ip_rts_ifmsg(ipif, RTSQ_DEFAULT); ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); + return (err); } /* @@ -14785,6 +12972,9 @@ ipif_down_tail(ipif_t *ipif) static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { + DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", + ill_t *, ipif->ipif_ill, ipif_t *, ipif); + /* * The ill_logical_down flag is a transient flag. It is set here * and is cleared once the down has completed in ipif_down_tail. @@ -14799,152 +12989,6 @@ ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. - * If the usesrc client ILL is already part of a usesrc group or not, - * in either case a ire_stq with the matching usesrc client ILL will - * locate the IRE's that need to be deleted. We want IREs to be created - * with the new source address. - */ -static void -ipif_delete_cache_ire(ire_t *ire, char *ill_arg) -{ - ill_t *ucill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ucill)); - - if (ire->ire_stq == NULL) - return; - - if ((ire->ire_type == IRE_CACHE) && - ((ill_t *)ire->ire_stq->q_ptr == ucill)) - ire_delete(ire); -} - -/* - * ire_walk routine to delete every IRE dependent on the interface - * address that is going down. (Always called as writer.) - * Works for both v4 and v6. - * In addition for checking for ire_ipif matches it also checks for - * IRE_CACHE entries which have the same source address as the - * disappearing ipif since ipif_select_source might have picked - * that source. Note that ipif_down/ipif_update_other_ipifs takes - * care of any IRE_INTERFACE with the disappearing source address. - */ -static void -ipif_down_delete_ire(ire_t *ire, char *ipif_arg) -{ - ipif_t *ipif = (ipif_t *)ipif_arg; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ire->ire_ipif == NULL) - return; - - if (ire->ire_ipif != ipif) { - /* - * Look for a matching source address. 
- */ - if (ire->ire_type != IRE_CACHE) - return; - if (ipif->ipif_flags & IPIF_NOLOCAL) - return; - - if (ire->ire_ipversion == IPV4_VERSION) { - if (ire->ire_src_addr != ipif->ipif_src_addr) - return; - } else { - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6lcl_addr)) - return; - } - ire_delete(ire); - return; - } - /* - * ire_delete() will do an ire_flush_cache which will delete - * all ire_ipif matches - */ - ire_delete(ire); -} - -/* - * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when - * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or - * 2) when an interface is brought up or down (on that ill). - * This ensures that the IRE_CACHE entries don't retain stale source - * address selection results. - */ -void -ill_ipif_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_CACHE); - - /* - * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches - * ill, but we only want to delete the IRE if ire_ipif matches. - */ - ASSERT(ire->ire_ipif != NULL); - if (ill == ire->ire_ipif->ipif_ill) - ire_delete(ire); -} - -/* - * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this - * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references - * the IPMP ill. - */ -void -ill_stq_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_CACHE); - - /* - * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches - * ill, but we only want to delete the IRE if ire_stq matches. - */ - if (ire->ire_stq->q_ptr == ill_arg) - ire_delete(ire); -} - -/* - * Delete all the IREs whose ire_stq's reference any ill in the same IPMP - * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE - * entries for the illgrp. 
- */ -void -ill_grp_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ire->ire_type == IRE_CACHE && - IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) { - ire_delete(ire); - } -} - -/* - * Delete all broadcast IREs with a source address on `ill_arg'. - */ -static void -ill_broadcast_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_BROADCAST); - - if (ire->ire_ipif->ipif_ill == ill) - ire_delete(ire); -} - -/* * Initiate deallocate of an IPIF. Always called as writer. Called by * ill_delete or ip_sioctl_removeif. */ @@ -14959,16 +13003,6 @@ ipif_free(ipif_t *ipif) (void) untimeout(ipif->ipif_recovery_id); ipif->ipif_recovery_id = 0; - /* Remove conn references */ - reset_conn_ipif(ipif); - - /* - * Make sure we have valid net and subnet broadcast ire's for the - * other ipif's which share them with this ipif. - */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - /* * Take down the interface. We can be called either from ill_delete * or from ip_sioctl_removeif. @@ -14996,27 +13030,15 @@ ipif_free(ipif_t *ipif) static void ipif_free_tail(ipif_t *ipif) { - mblk_t *mp; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * Free state for addition IRE_IF_[NO]RESOLVER ire's. - */ - mutex_enter(&ipif->ipif_saved_ire_lock); - mp = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = NULL; - mutex_exit(&ipif->ipif_saved_ire_lock); - freemsg(mp); - - /* * Need to hold both ill_g_lock and ill_lock while * inserting or removing an ipif from the linked list * of ipifs hanging off the ill. 
*/ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - ASSERT(ilm_walk_ipif(ipif) == 0); - #ifdef DEBUG ipif_trace_cleanup(ipif); #endif @@ -15028,10 +13050,9 @@ ipif_free_tail(ipif_t *ipif) ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); - mutex_destroy(&ipif->ipif_saved_ire_lock); - ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); ASSERT(ipif->ipif_recovery_id == 0); + ASSERT(ipif->ipif_ire_local == NULL); /* Free the memory. */ mi_free(ipif); @@ -15064,6 +13085,23 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len) } /* + * Sets `buf' to an ill name. + */ +void +ill_get_name(const ill_t *ill, char *buf, int len) +{ + char *name; + size_t name_len; + + name = ill->ill_name; + name_len = ill->ill_name_length; + len -= 1; + buf[len] = '\0'; + len = MIN(len, name_len); + bcopy(name, buf, len); +} + +/* * Find an IPIF based on the name passed in. Names can be of the form <phys> * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the * implied unit id is zero. <phys> must correspond to the name of an ILL. 
@@ -15071,8 +13109,7 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len) */ static ipif_t * ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, - boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) { char *cp; char *endp; @@ -15081,10 +13118,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ipif_t *ipif; uint_t ire_type; boolean_t did_alloc = B_FALSE; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * If the caller wants to us to create the ipif, make sure we have a @@ -15093,8 +13126,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ASSERT(!do_alloc || zoneid != ALL_ZONES); if (namelen == 0) { - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -15121,8 +13152,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * is zero, fail. */ if (&cp[2] < endp && cp[1] == '0') { - if (error != NULL) - *error = EINVAL; return (NULL); } } @@ -15140,7 +13169,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * ill_lookup_on_name will clear it. */ ill = ill_lookup_on_name(name, do_alloc, isv6, - q, mp, func, error, &did_alloc, ipst); + &did_alloc, ipst); if (cp != endp) *cp = IPIF_SEPARATOR_CHAR; if (ill == NULL) @@ -15153,13 +13182,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, cp++; if (ddi_strtol(cp, NULL, 0, &id) != 0) { ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); /* Now see if there is an IPIF with this unit number. 
*/ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -15168,16 +13194,9 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) { mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ if (IPIF_CAN_LOOKUP(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); @@ -15189,32 +13208,15 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * ipif_ill_refrele_tail which can end up * in trying to acquire any lock. */ - RELEASE_CONN_LOCK(q); ill_refrele(ill); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - ill_refrele(ill); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } - RELEASE_CONN_LOCK(q); if (!do_alloc) { mutex_exit(&ill->ill_lock); ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -15236,8 +13238,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); - else if (error != NULL) - *error = ENOMEM; mutex_exit(&ill->ill_lock); ill_refrele(ill); return (ipif); @@ -15258,6 +13258,7 @@ ipif_mask_reply(ipif_t *ipif) ipha_t *ipha; mblk_t *mp; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_xmit_attr_t ixas; #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) @@ -15269,6 +13270,9 @@ ipif_mask_reply(ipif_t *ipif) /* ICMP mask reply is not for a loopback interface */ ASSERT(ipif->ipif_ill->ill_wq != NULL); + if 
(ipif->ipif_lcl_addr == INADDR_ANY) + return; + mp = allocb(REPLY_LEN, BPRI_HI); if (mp == NULL) return; @@ -15278,7 +13282,7 @@ ipif_mask_reply(ipif_t *ipif) bzero(ipha, REPLY_LEN); *ipha = icmp_ipha; ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; - ipha->ipha_src = ipif->ipif_src_addr; + ipha->ipha_src = ipif->ipif_lcl_addr; ipha->ipha_dst = ipif->ipif_brd_addr; ipha->ipha_length = htons(REPLY_LEN); ipha->ipha_ident = 0; @@ -15288,64 +13292,19 @@ ipif_mask_reply(ipif_t *ipif) bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); - put(ipif->ipif_wq, mp); - + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_flags |= IXAF_SET_SOURCE; + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); #undef REPLY_LEN } /* - * When the mtu in the ipif changes, we call this routine through ire_walk - * to update all the relevant IREs. - * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. - */ -static void -ipif_mtu_change(ire_t *ire, char *ipif_arg) -{ - ipif_t *ipif = (ipif_t *)ipif_arg; - - if (ire->ire_stq == NULL || ire->ire_ipif != ipif) - return; - - mutex_enter(&ire->ire_lock); - if (ire->ire_marks & IRE_MARK_PMTU) { - /* Avoid increasing the PMTU */ - ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag); - if (ire->ire_max_frag == ipif->ipif_mtu) - ire->ire_marks &= ~IRE_MARK_PMTU; - } else { - ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); - } - mutex_exit(&ire->ire_lock); -} - -/* - * When the mtu in the ill changes, we call this routine through ire_walk - * to update all the relevant IREs. - * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 
- */ -void -ill_mtu_change(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) - return; - - mutex_enter(&ire->ire_lock); - if (ire->ire_marks & IRE_MARK_PMTU) { - /* Avoid increasing the PMTU */ - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, - ire->ire_max_frag); - if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) { - ire->ire_marks &= ~IRE_MARK_PMTU; - } - } else { - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET); - } - mutex_exit(&ire->ire_lock); -} - -/* * Join the ipif specific multicast groups. * Must be called after a mapping has been set up in the resolver. (Always * called as writer.) @@ -15355,13 +13314,15 @@ ipif_multicast_up(ipif_t *ipif) { int err; ill_t *ill; + ilm_t *ilm; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; ip1dbg(("ipif_multicast_up\n")); - if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) + if (!(ill->ill_flags & ILLF_MULTICAST) || + ipif->ipif_allhosts_ilm != NULL) return; if (ipif->ipif_isv6) { @@ -15380,228 +13341,147 @@ ipif_multicast_up(ipif_t *ipif) * underlying IPMP interfaces since they should be invisible. */ if (!IS_UNDER_IPMP(ill)) { - err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { + ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, + &err); + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: " "all_hosts_mcast failed %d\n", err)); return; } - ipif->ipif_joined_allhosts = 1; + ipif->ipif_allhosts_ilm = ilm; } /* - * Enable multicast for the solicited node multicast address + * Enable multicast for the solicited node multicast address. + * If IPMP we need to put the membership on the upper ill. 
*/ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { + ill_t *mcast_ill = NULL; + boolean_t need_refrele; + + if (IS_UNDER_IPMP(ill) && + (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + need_refrele = B_TRUE; + } else { + mcast_ill = ill; + need_refrele = B_FALSE; + } + + ilm = ip_addmulti(&v6solmc, mcast_ill, + ipif->ipif_zoneid, &err); + if (need_refrele) + ill_refrele(mcast_ill); + + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - if (ipif->ipif_joined_allhosts) { - (void) ip_delmulti_v6(&v6allmc, ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - ipif->ipif_joined_allhosts = 0; + if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { + ipif->ipif_allhosts_ilm = NULL; + (void) ip_delmulti(ilm); } return; } + ipif->ipif_solmulti_ilm = ilm; } } else { + in6_addr_t v6group; + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); - err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err) { + IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); + + ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: failed %d\n", err)); return; } + ipif->ipif_allhosts_ilm = ilm; } - ipif->ipif_multicast_up = 1; } /* * Blow away any multicast groups that we joined in ipif_multicast_up(). - * (Explicit memberships are blown away in ill_leave_multicast() when the - * ill is brought down.) + * (ilms from explicit memberships are handled in conn_update_ill.) 
*/ void ipif_multicast_down(ipif_t *ipif) { - int err; - ASSERT(IAM_WRITER_IPIF(ipif)); ip1dbg(("ipif_multicast_down\n")); - if (!ipif->ipif_multicast_up) - return; - - ip1dbg(("ipif_multicast_down - delmulti\n")); - - if (!ipif->ipif_isv6) { - err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, - B_TRUE); - if (err != 0) - ip0dbg(("ipif_multicast_down: failed %d\n", err)); - - ipif->ipif_multicast_up = 0; - return; - } - /* - * Leave the all-hosts multicast address. - */ - if (ipif->ipif_joined_allhosts) { - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast " - "failed %d\n", err)); - } - ipif->ipif_joined_allhosts = 0; + if (ipif->ipif_allhosts_ilm != NULL) { + (void) ip_delmulti(ipif->ipif_allhosts_ilm); + ipif->ipif_allhosts_ilm = NULL; } - - /* - * Disable multicast for the solicited node multicast address - */ - if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: sol MC failed %d\n", - err)); - } + if (ipif->ipif_solmulti_ilm != NULL) { + (void) ip_delmulti(ipif->ipif_solmulti_ilm); + ipif->ipif_solmulti_ilm = NULL; } - - ipif->ipif_multicast_up = 0; } /* * Used when an interface comes up to recreate any extra routes on this * interface. 
*/ -static ire_t ** -ipif_recover_ire(ipif_t *ipif) +int +ill_recover_saved_ire(ill_t *ill) { - mblk_t *mp; - ire_t **ipif_saved_irep; - ire_t **irep; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, - ipif->ipif_id)); + mblk_t *mp; + ip_stack_t *ipst = ill->ill_ipst; - mutex_enter(&ipif->ipif_saved_ire_lock); - ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * - ipif->ipif_saved_ire_cnt, KM_NOSLEEP); - if (ipif_saved_irep == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - return (NULL); - } + ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); - irep = ipif_saved_irep; - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { - ire_t *ire; - queue_t *rfq; - queue_t *stq; + mutex_enter(&ill->ill_saved_ire_lock); + for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { + ire_t *ire, *nire; ifrt_t *ifrt; - uchar_t *src_addr; - uchar_t *gateway_addr; - ushort_t type; - /* - * When the ire was initially created and then added in - * ip_rt_add(), it was created either using ipif->ipif_net_type - * in the case of a traditional interface route, or as one of - * the IRE_OFFSUBNET types (with the exception of - * IRE_HOST types ire which is created by icmp_redirect() and - * which we don't need to save or recover). In the case where - * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update - * the ire_type to IRE_IF_NORESOLVER before calling ire_add() - * to satisfy software like GateD and Sun Cluster which creates - * routes using the the loopback interface's address as a - * gateway. - * - * As ifrt->ifrt_type reflects the already updated ire_type, - * ire_create() will be called in the same way here as - * in ip_rt_add(), namely using ipif->ipif_net_type when - * the route looks like a traditional interface route (where - * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using - * the saved ifrt->ifrt_type. 
This means that in the case where - * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by - * ire_create() will be an IRE_LOOPBACK, it will then be turned - * into an IRE_IF_NORESOLVER and then added by ire_add(). - */ ifrt = (ifrt_t *)mp->b_rptr; - ASSERT(ifrt->ifrt_type != IRE_CACHE); - if (ifrt->ifrt_type & IRE_INTERFACE) { - rfq = NULL; - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr - : (uint8_t *)&ipif->ipif_src_addr; - gateway_addr = NULL; - type = ipif->ipif_net_type; - } else if (ifrt->ifrt_type & IRE_BROADCAST) { - /* Recover multiroute broadcast IRE. */ - rfq = ipif->ipif_rq; - stq = ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr - : (uint8_t *)&ipif->ipif_src_addr; - gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; - type = ifrt->ifrt_type; - } else { - rfq = NULL; - stq = NULL; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; - gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; - type = ifrt->ifrt_type; - } - /* * Create a copy of the IRE with the saved address and netmask. 
*/ - ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " - "0x%x/0x%x\n", - ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, - ntohl(ifrt->ifrt_addr), - ntohl(ifrt->ifrt_mask))); - ire = ire_create( - (uint8_t *)&ifrt->ifrt_addr, - (uint8_t *)&ifrt->ifrt_mask, - src_addr, - gateway_addr, - &ifrt->ifrt_max_frag, - NULL, - rfq, - stq, - type, - ipif, - 0, - 0, - 0, - ifrt->ifrt_flags, - &ifrt->ifrt_iulp_info, - NULL, - NULL, - ipst); - + if (ill->ill_isv6) { + ire = ire_create_v6( + &ifrt->ifrt_v6addr, + &ifrt->ifrt_v6mask, + &ifrt->ifrt_v6gateway_addr, + ifrt->ifrt_type, + ill, + ifrt->ifrt_zoneid, + ifrt->ifrt_flags, + NULL, + ipst); + } else { + ire = ire_create( + (uint8_t *)&ifrt->ifrt_addr, + (uint8_t *)&ifrt->ifrt_mask, + (uint8_t *)&ifrt->ifrt_gateway_addr, + ifrt->ifrt_type, + ill, + ifrt->ifrt_zoneid, + ifrt->ifrt_flags, + NULL, + ipst); + } if (ire == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - kmem_free(ipif_saved_irep, - ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); - return (NULL); + mutex_exit(&ill->ill_saved_ire_lock); + return (ENOMEM); + } + + if (ifrt->ifrt_flags & RTF_SETSRC) { + if (ill->ill_isv6) { + ire->ire_setsrc_addr_v6 = + ifrt->ifrt_v6setsrc_addr; + } else { + ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; + } } /* @@ -15611,23 +13491,37 @@ ipif_recover_ire(ipif_t *ipif) * set up prefixes with the RTF_REJECT flag set (for example, * when generating aggregate routes.) * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. 
*/ - if (ipif->ipif_net_type == IRE_LOOPBACK) + if (ill->ill_net_type == IRE_LOOPBACK) ire->ire_type = IRE_IF_NORESOLVER; + /* * ire held by ire_add, will be refreled' towards the * the end of ipif_up_done */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - *irep = ire; - irep++; - ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); + nire = ire_add(ire); + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire == NULL) { + ip1dbg(("ill_recover_saved_ire: FAILED\n")); + } else if (nire != ire) { + ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", + (void *)nire)); + ire_delete(nire); + } else { + ip1dbg(("ill_recover_saved_ire: added ire %p\n", + (void *)nire)); + } + if (nire != NULL) + ire_refrele(nire); } - mutex_exit(&ipif->ipif_saved_ire_lock); - return (ipif_saved_irep); + mutex_exit(&ill->ill_saved_ire_lock); + return (0); } /* @@ -15766,6 +13660,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(IAM_WRITER_IPIF(ipif)); ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up", + ill_t *, ill, ipif_t *, ipif); /* Shouldn't get here if it is already up. */ if (ipif->ipif_flags & IPIF_UP) @@ -15786,7 +13682,7 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) /* * The ipif being brought up should be quiesced. If it's not, * something has gone amiss and we need to bail out. (If it's - * quiesced, we know it will remain so via IPIF_CHANGING.) + * quiesced, we know it will remain so via IPIF_CONDEMNED.) */ mutex_enter(&ill->ill_lock); if (!ipif_is_quiescent(ipif)) { @@ -15868,8 +13764,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) /* * If the ipif being brought up was on slot zero, then we * first need to bring up the placeholder we stuck there. 
In - * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call - * to ipif_up() itself, if we successfully bring up the + * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive + * call to ipif_up() itself, if we successfully bring up the * placeholder, we'll check ill_move_ipif and bring it up too. */ if (ipif_orig_id == 0) { @@ -15907,13 +13803,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * ipif_resolver_up may end up sending an - * AR_INTERFACE_UP message to ARP, which would, in - * turn send a DLPI message to the driver. ioctls are + * ipif_resolver_up may end up needeing to bind/attach + * the ARP stream, which in turn necessitates a + * DLPI message exchange with the driver. ioctls are * serialized and so we cannot send more than one * interface up message at a time. If ipif_resolver_up - * does send an interface up message to ARP, we get - * EINPROGRESS and we will complete in ip_arp_done. + * does need to wait for the DLPI handshake for the ARP stream, + * we get EINPROGRESS and we will complete in arp_bringup_done. */ ASSERT(connp != NULL || !CONN_Q(q)); @@ -15928,18 +13824,12 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up the resolver. For IPv6, this cranks up the - * external resolver if one is configured, but even if an - * external resolver isn't configured, it must be called to - * reset DAD state. For IPv6, if an external resolver is not - * being used, ipif_resolver_up() will never return - * EINPROGRESS, so we can always call ipif_ndp_up() here. - * Note that if an external resolver is being used, there's no - * need to call ipif_ndp_up() since it will do nothing. + * Crank up IPv6 neighbor discovery. Unlike ARP, this should + * complete when ipif_ndp_up returns. 
*/ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done() */ + /* We will complete it in arp_bringup_done() */ return (err); } @@ -15958,9 +13848,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; + err = ill_add_ires(ill); + /* allocation failure? */ + if (err != 0) + return (err); } - err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); + err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); if (err == 0 && ill->ill_move_ipif != NULL) { ipif = ill->ill_move_ipif; ill->ill_move_ipif = NULL; @@ -15970,6 +13864,53 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* + * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. + * The identical set of IREs need to be removed in ill_delete_ires(). + */ +int +ill_add_ires(ill_t *ill) +{ + ire_t *ire; + in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; + in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); + + if (ill->ill_ire_multicast != NULL) + return (0); + + /* + * provide some dummy ire_addr for creating the ire. + */ + if (ill->ill_isv6) { + ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, + ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); + } else { + ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, + ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); + } + if (ire == NULL) + return (ENOMEM); + + ill->ill_ire_multicast = ire; + return (0); +} + +void +ill_delete_ires(ill_t *ill) +{ + if (ill->ill_ire_multicast != NULL) { + /* + * BIND/ATTACH completed; Release the ref for ill_ire_multicast + * which was taken without any th_tracing enabled. + * We also mark it as condemned (note that it was never added) + * so that caching conn's can move off of it. + */ + ire_make_condemned(ill->ill_ire_multicast); + ire_refrele_notr(ill->ill_ire_multicast); + ill->ill_ire_multicast = NULL; + } +} + +/* * Perform a bind for the physical device. 
* When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. @@ -15978,30 +13919,26 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - areq_t *areq; - mblk_t *areq_mp = NULL; mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; conn_t *connp; boolean_t success; - uint16_t sap_addr; + int err; + + DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(mp != NULL); - /* Create a resolver cookie for ARP */ - if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { - areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); - if (areq_mp == NULL) - return (ENOMEM); + /* + * Make sure we have an IRE_MULTICAST in case we immediately + * start receiving packets. + */ + err = ill_add_ires(ill); + if (err != 0) + goto bad; - freemsg(ill->ill_resolver_mp); - ill->ill_resolver_mp = areq_mp; - areq = (areq_t *)areq_mp->b_rptr; - sap_addr = ill->ill_sap; - bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); - } bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), DL_BIND_REQ); if (bind_mp == NULL) @@ -16067,46 +14004,39 @@ bad: return (ENOMEM); } +/* Add room for tcp+ip headers */ uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; /* * DLPI and ARP is up. - * Create all the IREs associated with an interface bring up multicast. + * Create all the IREs associated with an interface. Bring up multicast. * Set the interface flag and finish other initialization - * that potentially had to be differed to after DL_BIND_ACK. + * that potentially had to be deferred to after DL_BIND_ACK. 
*/ int ipif_up_done(ipif_t *ipif) { - ire_t *ire_array[20]; - ire_t **irep = ire_array; - ire_t **irep1; - ipaddr_t net_mask = 0; - ipaddr_t subnet_mask, route_mask; - ill_t *ill = ipif->ipif_ill; - queue_t *stq; - ipif_t *src_ipif; - ipif_t *tmp_ipif; - boolean_t flush_ire_cache = B_TRUE; - int err = 0; - ire_t **ipif_saved_irep = NULL; - int ipif_saved_ire_cnt; - int cnt; - boolean_t src_ipif_held = B_FALSE; + ill_t *ill = ipif->ipif_ill; + int err = 0; boolean_t loopback = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + boolean_t update_src_selection = B_TRUE; + ipif_t *tmp_ipif; ip1dbg(("ipif_up_done(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", + ill_t *, ill, ipif_t *, ipif); + /* Check if this is a loopback interface */ if (ipif->ipif_ill->ill_wq == NULL) loopback = B_TRUE; ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + /* * If all other interfaces for this ill are down or DEPRECATED, - * or otherwise unsuitable for source address selection, remove - * any IRE_CACHE entries for this ill to make sure source + * or otherwise unsuitable for source address selection, + * reset the src generation numbers to make sure source * address selection gets to take this new ipif into account. * No need to hold ill_lock while traversing the ipif list since * we are writer @@ -16119,31 +14049,16 @@ ipif_up_done(ipif_t *ipif) (tmp_ipif == ipif)) continue; /* first useable pre-existing interface */ - flush_ire_cache = B_FALSE; + update_src_selection = B_FALSE; break; } - if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); + if (update_src_selection) + ip_update_source_selection(ill->ill_ipst); - /* - * Figure out which way the send-to queue should go. Only - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK - * should show up here. 
- */ - switch (ill->ill_net_type) { - case IRE_IF_RESOLVER: - stq = ill->ill_rq; - break; - case IRE_IF_NORESOLVER: - case IRE_LOOPBACK: - stq = ill->ill_wq; - break; - default: - return (EINVAL); - } + if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { + nce_t *loop_nce = NULL; + uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); - if (IS_LOOPBACK(ill)) { /* * lo0:1 and subsequent ipifs were marked IRE_LOCAL in * ipif_lookup_on_name(), but in the case of zones we can have @@ -16155,29 +14070,130 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOOPBACK; else ipif->ipif_ire_type = IRE_LOCAL; + if (ill->ill_net_type != IRE_LOOPBACK) + flags |= NCE_F_PUBLISH; + + /* add unicast nce for the local addr */ + err = nce_lookup_then_add_v4(ill, NULL, + ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, + ND_REACHABLE, &loop_nce); + /* A shared-IP zone sees EEXIST for lo0:N */ + if (err == 0 || err == EEXIST) { + ipif->ipif_added_nce = 1; + loop_nce->nce_ipif_cnt++; + nce_refrele(loop_nce); + err = 0; + } else { + ASSERT(loop_nce == NULL); + return (err); + } } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || - ((ipif->ipif_flags & IPIF_DEPRECATED) && - !(ipif->ipif_flags & IPIF_NOFAILOVER))) { + /* Create all the IREs associated with this interface */ + err = ipif_add_ires_v4(ipif, loopback); + if (err != 0) { /* - * Can't use our source address. Select a different - * source address for the IRE_INTERFACE and IRE_LOCAL + * see comments about return value from + * ip_addr_availability_check() in ipif_add_ires_v4(). */ - src_ipif = ipif_select_source(ipif->ipif_ill, - ipif->ipif_subnet, ipif->ipif_zoneid); - if (src_ipif == NULL) - src_ipif = ipif; /* Last resort */ - else - src_ipif_held = B_TRUE; - } else { - src_ipif = ipif; + if (err != EADDRINUSE) { + (void) ipif_arp_down(ipif); + } else { + /* + * Make IPMP aware of the deleted ipif so that + * the needed ipmp cleanup (e.g., of ipif_bound_ill) + * can be completed. 
Note that we do not want to + * destroy the nce that was created on the ipmp_ill + * for the active copy of the duplicate address in + * use. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + err = EADDRNOTAVAIL; + } + return (err); } - /* Create all the IREs associated with this interface */ + if (ill->ill_ipif_up_count == 1 && !loopback) { + /* Recover any additional IREs entries for this ill */ + (void) ill_recover_saved_ire(ill); + } + + if (ill->ill_need_recover_multicast) { + /* + * Need to recover all multicast memberships in the driver. + * This had to be deferred until we had attached. The same + * code exists in ipif_up_done_v6() to recover IPv6 + * memberships. + * + * Note that it would be preferable to unconditionally do the + * ill_recover_multicast() in ill_dl_up(), but we cannot do + * that since ill_join_allmulti() depends on ill_dl_up being + * set, and it is not set until we receive a DL_BIND_ACK after + * having called ill_dl_up(). + */ + ill_recover_multicast(ill); + } + + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + + /* + * If this is an IPMP interface, we may now be able to + * establish ARP entries. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + if (!loopback && !update_src_selection && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) + ip_update_source_selection(ill->ill_ipst); + + if (!loopback && ipif->ipif_addr_ready) { + /* Broadcast an address mask reply. */ + ipif_mask_reply(ipif); + } + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); + + /* + * This had to be deferred until we had bound. Tell routing sockets and + * others that this interface is up if it looks like the address has + * been validated. 
Otherwise, if it isn't ready yet, wait for + * duplicate address detection to do its thing. + */ + if (ipif->ipif_addr_ready) + ipif_up_notify(ipif); + return (0); +} + +/* + * Add the IREs associated with the ipif. + * Those MUST be explicitly removed in ipif_delete_ires_v4. + */ +static int +ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *ire_array[20]; + ire_t **irep = ire_array; + ire_t **irep1; + ipaddr_t net_mask = 0; + ipaddr_t subnet_mask, route_mask; + int err; + ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ + if ((ipif->ipif_lcl_addr != INADDR_ANY) && !(ipif->ipif_flags & IPIF_NOLOCAL)) { - /* * If we're on a labeled system then make sure that zone- * private addresses have proper remote host database entries. @@ -16191,38 +14207,34 @@ ipif_up_done(ipif_t *ipif) err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); if (err != 0) { - ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); + ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); return (err); } /* If the interface address is set, create the local IRE. */ - ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", - (void *)ipif, - ipif->ipif_ire_type, - ntohl(ipif->ipif_lcl_addr))); - *irep++ = ire_create( + ire_local = ire_create( (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ NULL, /* no gateway */ - &ip_loopback_mtuplus, /* max frag size */ - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? - RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? 
+ RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); + ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" + " for 0x%x\n", (void *)ipif, (void *)ire_local, + ipif->ipif_ire_type, + ntohl(ipif->ipif_lcl_addr))); + if (ire_local == NULL) { + ip1dbg(("ipif_up_done: NULL ire_local\n")); + err = ENOMEM; + goto bad; + } } else { ip1dbg(( - "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", + "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", ipif->ipif_ire_type, ntohl(ipif->ipif_lcl_addr), (uint_t)ipif->ipif_flags)); @@ -16249,7 +14261,7 @@ ipif_up_done(ipif_t *ipif) } /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ - if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && ipif->ipif_subnet != INADDR_ANY) { /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ @@ -16259,7 +14271,7 @@ ipif_up_done(ipif_t *ipif) route_mask = subnet_mask; } - ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " + ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " "creating if IRE ill_net_type 0x%x for 0x%x\n", (void *)ipif, (void *)ill, ill->ill_net_type, @@ -16267,20 +14279,12 @@ ipif_up_done(ipif_t *ipif) *irep++ = ire_create( (uchar_t *)&ipif->ipif_subnet, /* dest address */ (uchar_t *)&route_mask, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, - NULL, /* no recv queue */ - stq, /* send-to queue */ + (uchar_t *)&ipif->ipif_lcl_addr, /* gateway */ ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, - &ire_uinfo_null, - NULL, + ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE: 0) | RTF_KERNEL, NULL, ipst); } @@ -16288,11 +14292,10 @@ ipif_up_done(ipif_t *ipif) /* * Create any necessary broadcast IREs. 
*/ - if (ipif->ipif_flags & IPIF_BROADCAST) + if ((ipif->ipif_flags & IPIF_BROADCAST) && + !(ipif->ipif_flags & IPIF_NOXMIT)) irep = ipif_create_bcast_ires(ipif, irep); - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* If an earlier ire_create failed, get out now */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -16324,14 +14327,9 @@ ipif_up_done(ipif_t *ipif) * ipif. So we don't want to delete it (otherwise the other ipif * would be unable to send packets). * ip_addr_availability_check() identifies this case for us and - * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL + * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL * which is the expected error code. */ - if (err == EADDRINUSE) { - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - err = EADDRNOTAVAIL; - } ill->ill_ipif_up_count--; ipif->ipif_flags &= ~IPIF_UP; goto bad; @@ -16341,19 +14339,33 @@ ipif_up_done(ipif_t *ipif) * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. */ + if (ire_local != NULL) { + ire_local = ire_add(ire_local); +#ifdef DEBUG + if (ire_local != NULL) { + ire_refhold_notr(ire_local); + ire_refrele(ire_local); + } +#endif + } + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ire_local != NULL) + ipif->ipif_ire_local = ire_local; + rw_exit(&ipst->ips_ill_g_lock); + ire_local = NULL; + for (irep1 = irep; irep1 > ire_array; ) { irep1--; - ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); - /* - * refheld by ire_add. refele towards the end of the func - */ - (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); + ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); + /* refheld by ire_add. 
*/ + *irep1 = ire_add(*irep1); + if (*irep1 != NULL) { + ire_refrele(*irep1); + *irep1 = NULL; + } } - /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ - ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; - ipif_saved_irep = ipif_recover_ire(ipif); - if (!loopback) { /* * If the broadcast address has been set, make sure it makes @@ -16364,9 +14376,9 @@ ipif_up_done(ipif_t *ipif) (ipif->ipif_flags & IPIF_BROADCAST)) { ire_t *ire; - ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, - IRE_BROADCAST, ipif, ALL_ZONES, - NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); + ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, + IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); if (ire == NULL) { /* @@ -16383,176 +14395,113 @@ ipif_up_done(ipif_t *ipif) } } - - if (ill->ill_need_recover_multicast) { - /* - * Need to recover all multicast memberships in the driver. - * This had to be deferred until we had attached. The same - * code exists in ipif_up_done_v6() to recover IPv6 - * memberships. - * - * Note that it would be preferable to unconditionally do the - * ill_recover_multicast() in ill_dl_up(), but we cannot do - * that since ill_join_allmulti() depends on ill_dl_up being - * set, and it is not set until we receive a DL_BIND_ACK after - * having called ill_dl_up(). - */ - ill_recover_multicast(ill); - } - - if (ill->ill_ipif_up_count == 1) { - /* - * Since the interface is now up, it may now be active. - */ - if (IS_UNDER_IPMP(ill)) - ipmp_ill_refresh_active(ill); - - /* - * If this is an IPMP interface, we may now be able to - * establish ARP entries. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_refresh_arpent(ill->ill_grp); - } - - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - - /* - * See if anybody else would benefit from our new ipif. 
- */ - if (!loopback && - !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - ill_update_source_selection(ill); - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - cnt = ipif_saved_ire_cnt; - for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { - if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - if (!loopback && ipif->ipif_addr_ready) { - /* Broadcast an address mask reply. */ - ipif_mask_reply(ipif); - } - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - if (src_ipif_held) - ipif_refrele(src_ipif); - - /* - * This had to be deferred until we had bound. Tell routing sockets and - * others that this interface is up if it looks like the address has - * been validated. Otherwise, if it isn't ready yet, wait for - * duplicate address detection to do its thing. - */ - if (ipif->ipif_addr_ready) - ipif_up_notify(ipif); return (0); bad: - ip1dbg(("ipif_up_done: FAILED \n")); - + ip1dbg(("ipif_add_ires: FAILED \n")); + if (ire_local != NULL) + ire_delete(ire_local); while (irep > ire_array) { irep--; - if (*irep != NULL) + if (*irep != NULL) { ire_delete(*irep); + } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - if (src_ipif_held) - ipif_refrele(src_ipif); - - ipif_resolver_down(ipif); return (err); } -/* - * Turn off the ARP with the ILLF_NOARP flag. 
- */ -static int -ill_arp_off(ill_t *ill) +/* Remove all the IREs created by ipif_add_ires_v4 */ +void +ipif_delete_ires_v4(ipif_t *ipif) { - mblk_t *arp_off_mp = NULL; - mblk_t *arp_on_mp = NULL; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipaddr_t net_mask = 0; + ipaddr_t subnet_mask, route_mask; + int match_args; + ire_t *ire; + boolean_t loopback; - ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); + /* Check if this is a loopback interface */ + loopback = (ipif->ipif_ill->ill_wq == NULL); - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK | + MATCH_IRE_ZONEONLY; - /* - * If the on message is still around we've already done - * an arp_off without doing an arp_on thus there is no - * work needed. - */ - if (ill->ill_arp_on_mp != NULL) - return (0); + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if ((ire = ipif->ipif_ire_local) != NULL) { + ipif->ipif_ire_local = NULL; + rw_exit(&ipst->ips_ill_g_lock); + /* + * Move count to ipif so we don't loose the count due to + * a down/up dance. 
+ */ + atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); - /* - * Allocate an ARP on message (to be saved) and an ARP off message - */ - arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); - if (!arp_off_mp) - return (ENOMEM); + ire_delete(ire); + ire_refrele_notr(ire); + } else { + rw_exit(&ipst->ips_ill_g_lock); + } + + match_args |= MATCH_IRE_GW; - arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); - if (!arp_on_mp) - goto failed; + if ((ipif->ipif_lcl_addr != INADDR_ANY) && + !(ipif->ipif_flags & IPIF_NOLOCAL)) { + net_mask = ip_net_mask(ipif->ipif_lcl_addr); + } else { + net_mask = htonl(IN_CLASSA_NET); /* fallback */ + } - ASSERT(ill->ill_arp_on_mp == NULL); - ill->ill_arp_on_mp = arp_on_mp; + subnet_mask = ipif->ipif_net_mask; - /* Send an AR_INTERFACE_OFF request */ - putnext(ill->ill_rq, arp_off_mp); - return (0); -failed: + /* + * If mask was not specified, use natural netmask of + * interface address. Also, store this mask back into the + * ipif struct. + */ + if (subnet_mask == 0) + subnet_mask = net_mask; - if (arp_off_mp) - freemsg(arp_off_mp); - return (ENOMEM); -} + /* Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; -/* - * Turn on ARP by turning off the ILLF_NOARP flag. 
- */ -static int -ill_arp_on(ill_t *ill) -{ - mblk_t *mp; + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && + ipif->ipif_subnet != INADDR_ANY) { + /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ - ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + route_mask = IP_HOST_MASK; + } else { + route_mask = subnet_mask; + } - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + ire = ire_ftable_lookup_v4( + ipif->ipif_subnet, /* dest address */ + route_mask, /* mask */ + ipif->ipif_lcl_addr, /* gateway */ + ill->ill_net_type, /* IF_[NO]RESOLVER */ + ill, + ipif->ipif_zoneid, + NULL, + match_args, + 0, + ipst, + NULL); + ASSERT(ire != NULL); + ire_delete(ire); + ire_refrele(ire); + } - ASSERT(IAM_WRITER_ILL(ill)); /* - * Send an AR_INTERFACE_ON request if we have already done - * an arp_off (which allocated the message). + * Create any necessary broadcast IREs. */ - if (ill->ill_arp_on_mp != NULL) { - mp = ill->ill_arp_on_mp; - ill->ill_arp_on_mp = NULL; - putnext(ill->ill_rq, mp); - } - return (0); + if ((ipif->ipif_flags & IPIF_BROADCAST) && + !(ipif->ipif_flags & IPIF_NOXMIT)) + ipif_delete_bcast_ires(ipif); } /* @@ -16561,49 +14510,72 @@ ill_arp_on(ill_t *ill) * this selection is done regardless of the destination. 
*/ boolean_t -ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) +ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, + ip_stack_t *ipst) { - uint_t ifindex; - ipif_t *ipif = NULL; - ill_t *uill; - boolean_t isv6; - ip_stack_t *ipst = ill->ill_ipst; + ipif_t *ipif = NULL; + ill_t *uill; - ASSERT(ill != NULL); + ASSERT(ifindex != 0); - isv6 = ill->ill_isv6; - ifindex = ill->ill_usesrc_ifindex; - if (ifindex != 0) { - uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, - NULL, ipst); - if (uill == NULL) - return (B_FALSE); - mutex_enter(&uill->ill_lock); - for (ipif = uill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP)) - continue; - if (ipif->ipif_zoneid != zoneid) - continue; - if ((isv6 && - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || - (ipif->ipif_lcl_addr == INADDR_ANY)) - continue; - mutex_exit(&uill->ill_lock); - ill_refrele(uill); - return (B_TRUE); - } + uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (uill == NULL) + return (B_FALSE); + + mutex_enter(&uill->ill_lock); + for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (IPIF_IS_CONDEMNED(ipif)) + continue; + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP)) + continue; + if (ipif->ipif_zoneid != zoneid) + continue; + if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : + ipif->ipif_lcl_addr == INADDR_ANY) + continue; mutex_exit(&uill->ill_lock); ill_refrele(uill); + return (B_TRUE); } + mutex_exit(&uill->ill_lock); + ill_refrele(uill); return (B_FALSE); } /* + * Find an ipif with a good local address on the ill+zoneid. 
+ */ +ipif_t * +ipif_good_addr(ill_t *ill, zoneid_t zoneid) +{ + ipif_t *ipif; + + mutex_enter(&ill->ill_lock); + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (IPIF_IS_CONDEMNED(ipif)) + continue; + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP)) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) + continue; + if (ill->ill_isv6 ? + IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : + ipif->ipif_lcl_addr == INADDR_ANY) + continue; + ipif_refhold_locked(ipif); + mutex_exit(&ill->ill_lock); + return (ipif); + } + mutex_exit(&ill->ill_lock); + return (NULL); +} + +/* * IP source address type, sorted from worst to best. For a given type, * always prefer IP addresses on the same subnet. All-zones addresses are * suboptimal because they pose problems with unlabeled destinations. @@ -16615,7 +14587,8 @@ typedef enum { IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ IPIF_DIFFNET, /* normal and different subnet */ - IPIF_SAMENET /* normal and same subnet */ + IPIF_SAMENET, /* normal and same subnet */ + IPIF_LOCALADDR /* local loopback */ } ipif_type_t; /* @@ -16629,7 +14602,8 @@ typedef enum { * This only occurs when there is no valid source address for the ill. 
*/ ipif_t * -ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) +ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, + boolean_t allow_usesrc, boolean_t *notreadyp) { ill_t *usill = NULL; ill_t *ipmp_ill = NULL; @@ -16639,9 +14613,9 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) ip_stack_t *ipst = ill->ill_ipst; boolean_t samenet; - if (ill->ill_usesrc_ifindex != 0) { + if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); + B_FALSE, ipst); if (usill != NULL) ill = usill; /* Select source from usesrc ILL */ else @@ -16705,14 +14679,22 @@ retry: if ((next_ipif = ipif->ipif_next) == NULL) next_ipif = ill->ill_ipif; - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; /* Always skip NOLOCAL and ANYCAST interfaces */ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) continue; - if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + if (!(ipif->ipif_flags & IPIF_UP)) continue; - if (ipif->ipif_zoneid != zoneid && + + if (!ipif->ipif_addr_ready) { + if (notreadyp != NULL) + *notreadyp = B_TRUE; + continue; + } + + if (zoneid != ALL_ZONES && + ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; @@ -16749,7 +14731,9 @@ retry: samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif->ipif_flags & IPIF_DEPRECATED) { + if (ipif->ipif_lcl_addr == dst) { + type = IPIF_LOCALADDR; + } else if (ipif->ipif_flags & IPIF_DEPRECATED) { type = samenet ? 
IPIF_SAMENET_DEPRECATED : IPIF_DIFFNET_DEPRECATED; } else if (ipif->ipif_zoneid == ALL_ZONES) { @@ -16762,14 +14746,14 @@ retry: if (type > best_type) { best_type = type; best_ipif = ipif; - if (best_type == IPIF_SAMENET) + if (best_type == IPIF_LOCALADDR) break; /* can't get better */ } } while ((ipif = next_ipif) != start_ipif); if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); - if (!IPIF_CAN_LOOKUP(ipif)) { + if (IPIF_IS_CONDEMNED(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } @@ -16783,7 +14767,7 @@ retry: */ if (IS_IPMP(ill) && ipif != NULL) { next_ipif = ipif->ipif_next; - if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) ill->ill_src_ipif = next_ipif; else ill->ill_src_ipif = NULL; @@ -16803,14 +14787,14 @@ retry: if (ipif == NULL) { char buf1[INET6_ADDRSTRLEN]; - ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", + ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", ill->ill_name, inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); } else { char buf1[INET6_ADDRSTRLEN]; char buf2[INET6_ADDRSTRLEN]; - ip1dbg(("ipif_select_source(%s, %s) -> %s\n", + ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", ipif->ipif_ill->ill_name, inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), inet_ntop(AF_INET, &ipif->ipif_lcl_addr, @@ -16821,172 +14805,80 @@ retry: } /* - * If old_ipif is not NULL, see if ipif was derived from old - * ipif and if so, recreate the interface route by re-doing - * source address selection. This happens when ipif_down -> - * ipif_update_other_ipifs calls us. + * Pick a source address based on the destination ill and an optional setsrc + * address. + * The result is stored in srcp. If generation is set, then put the source + * generation number there before we look for the source address (to avoid + * missing changes in the set of source addresses. + * If flagsp is set, then us it to pass back ipif_flags. 
* - * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when ipif_up_done calls us. + * If the caller wants to cache the returned source address and detect when + * that might be stale, the caller should pass in a generation argument, + * which the caller can later compare against ips_src_generation + * + * The precedence order for selecting an IPv4 source address is: + * - RTF_SETSRC on the offlink ire always wins. + * - If usrsrc is set, swap the ill to be the usesrc one. + * - If IPMP is used on the ill, select a random address from the most + * preferred ones below: + * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES + * 2. Not deprecated, not ALL_ZONES + * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES + * 4. Not deprecated, ALL_ZONES + * 5. If onlink destination, same subnet and deprecated + * 6. Deprecated. + * + * We have lower preference for ALL_ZONES IP addresses, + * as they pose problems with unlabeled destinations. + * + * Note that when multiple IP addresses match e.g., #1 we pick + * the first one if IPMP is not in use. With IPMP we randomize. */ -static void -ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) +int +ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, + ipaddr_t multicast_ifaddr, + zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, + uint32_t *generation, uint64_t *flagsp) { - ire_t *ire; - ire_t *ipif_ire; - queue_t *stq; - ipif_t *nipif; - ill_t *ill; - boolean_t need_rele = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); - ASSERT(IAM_WRITER_IPIF(ipif)); + ipif_t *ipif; + boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ - ill = ipif->ipif_ill; - if (!(ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - /* - * Can't possibly have borrowed the source - * from old_ipif. 
- */ - return; - } + if (flagsp != NULL) + *flagsp = 0; /* - * Is there any work to be done? No work if the address - * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( - * ipif_select_source() does not borrow addresses from - * NOLOCAL and ANYCAST interfaces). + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. */ - if ((old_ipif != NULL) && - ((old_ipif->ipif_lcl_addr == INADDR_ANY) || - (old_ipif->ipif_ill->ill_wq == NULL) || - (old_ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST)))) { - return; + if (generation != NULL) { + *generation = ipst->ips_src_generation; } - /* - * Perform the same checks as when creating the - * IRE_INTERFACE in ipif_up_done. - */ - if (!(ipif->ipif_flags & IPIF_UP)) - return; - - if ((ipif->ipif_flags & IPIF_NOXMIT) || - (ipif->ipif_subnet == INADDR_ANY)) - return; - - ipif_ire = ipif_to_ire(ipif); - if (ipif_ire == NULL) - return; - - /* - * We know that ipif uses some other source for its - * IRE_INTERFACE. Is it using the source of this - * old_ipif? - */ - if (old_ipif != NULL && - old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { - ire_refrele(ipif_ire); - return; - } - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" - " src %s\n", AF_INET, &ipif_ire->ire_src_addr); - } - - stq = ipif_ire->ire_stq; - - /* - * Can't use our source address. Select a different - * source address for the IRE_INTERFACE. 
- */ - nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); - if (nipif == NULL) { - /* Last resort - all ipif's have IPIF_NOLOCAL */ - nipif = ipif; - } else { - need_rele = B_TRUE; + if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { + *srcp = multicast_ifaddr; + return (0); } - ire = ire_create( - (uchar_t *)&ipif->ipif_subnet, /* dest pref */ - (uchar_t *)&ipif->ipif_net_mask, /* mask */ - (uchar_t *)&nipif->ipif_src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ - ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire != NULL) { - ire_t *ret_ire; - int error; - - /* - * We don't need ipif_ire anymore. We need to delete - * before we add so that ire_add does not detect - * duplicates. - */ - ire_delete(ipif_ire); - ret_ire = ire; - error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(ire == ret_ire); - /* Held in ire_add */ - ire_refrele(ret_ire); + /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */ + if (setsrc != INADDR_ANY) { + *srcp = setsrc; + return (0); } - /* - * Either we are falling through from above or could not - * allocate a replacement. - */ - ire_refrele(ipif_ire); - if (need_rele) - ipif_refrele(nipif); -} - -/* - * This old_ipif is going away. - * - * Determine if any other ipif's are using our address as - * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or - * IPIF_DEPRECATED). - * Find the IRE_INTERFACE for such ipifs and recreate them - * to use an different source address following the rules in - * ipif_up_done. 
- */ -static void -ipif_update_other_ipifs(ipif_t *old_ipif) -{ - ipif_t *ipif; - ill_t *ill; - char buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_IPIF(old_ipif)); - - ill = old_ipif->ipif_ill; - - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif == old_ipif) - continue; - ipif_recreate_interface_routes(old_ipif, ipif); + ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready); + if (ipif == NULL) { + if (notready) + return (ENETDOWN); + else + return (EADDRNOTAVAIL); } + *srcp = ipif->ipif_lcl_addr; + if (flagsp != NULL) + *flagsp = ipif->ipif_flags; + ipif_refrele(ipif); + return (0); } /* ARGSUSED */ @@ -17049,51 +14941,12 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* - * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the - * minimum (but complete) set exist. This is necessary when adding or - * removing an interface to/from an IPMP group, since interfaces in an - * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever - * its test address subnets overlap with IPMP data addresses). It's also - * used to refresh the IRE_BROADCAST entries associated with the IPMP - * interface when the nominated broadcast interface changes. - */ -void -ill_refresh_bcast(ill_t *ill) -{ - ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ - ire_t **irep; - ipif_t *ipif; - - ASSERT(!ill->ill_isv6); - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * Remove any old broadcast IREs. - */ - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, - ill_broadcast_delete, ill, ill); - - /* - * Create new ones for any ipifs that are up and broadcast-capable. 
- */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != - (IPIF_UP|IPIF_BROADCAST)) - continue; - - irep = ipif_create_bcast_ires(ipif, ire_array); - while (irep-- > ire_array) { - (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); - if (*irep != NULL) - ire_refrele(*irep); - } - } -} - -/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in - * `irep'. Returns a pointer to the next free `irep' entry (just like - * ire_check_and_create_bcast()). + * `irep'. Returns a pointer to the next free `irep' entry + * A mirror exists in ipif_delete_bcast_ires(). + * + * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is + * done in ire_add. */ static ire_t ** ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) @@ -17101,18 +14954,20 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) ipaddr_t addr; ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); ipaddr_t subnetmask = ipif->ipif_net_mask; - int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; + ill_t *ill = ipif->ipif_ill; + zoneid_t zoneid = ipif->ipif_zoneid; ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); ASSERT(ipif->ipif_flags & IPIF_BROADCAST); + ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); if (ipif->ipif_lcl_addr == INADDR_ANY || (ipif->ipif_flags & IPIF_NOLOCAL)) netmask = htonl(IN_CLASSA_NET); /* fallback */ - irep = ire_check_and_create_bcast(ipif, 0, irep, flags); - irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); + irep = ire_create_bcast(ill, 0, zoneid, irep); + irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); /* * For backward compatibility, we create net broadcast IREs based on @@ -17125,9 +14980,8 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) */ if (netmask < subnetmask) { addr = netmask & ipif->ipif_subnet; - irep = ire_check_and_create_bcast(ipif, addr, irep, flags); - irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, - flags); + 
irep = ire_create_bcast(ill, addr, zoneid, irep); + irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); } /* @@ -17138,282 +14992,73 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) */ if (subnetmask != 0xFFFFFFFF) { addr = ipif->ipif_subnet; - irep = ire_check_and_create_bcast(ipif, addr, irep, flags); - irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, - irep, flags); + irep = ire_create_bcast(ill, addr, zoneid, irep); + irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); } return (irep); } /* - * Broadcast IRE info structure used in the functions below. Since we - * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. - */ -typedef struct bcast_ireinfo { - uchar_t bi_type; /* BCAST_* value from below */ - uchar_t bi_willdie:1, /* will this IRE be going away? */ - bi_needrep:1, /* do we need to replace it? */ - bi_haverep:1, /* have we replaced it? */ - bi_pad:5; - ipaddr_t bi_addr; /* IRE address */ - ipif_t *bi_backup; /* last-ditch ipif to replace it on */ -} bcast_ireinfo_t; - -enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; - -/* - * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and - * return B_TRUE if it should immediately be used to recreate the IRE. 
- */ -static boolean_t -ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) -{ - ipaddr_t addr; - - ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); - - switch (bireinfop->bi_type) { - case BCAST_NET: - addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); - if (addr != bireinfop->bi_addr) - return (B_FALSE); - break; - case BCAST_SUBNET: - if (ipif->ipif_subnet != bireinfop->bi_addr) - return (B_FALSE); - break; - } - - bireinfop->bi_needrep = 1; - if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { - if (bireinfop->bi_backup == NULL) - bireinfop->bi_backup = ipif; - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Create the broadcast IREs described by `bireinfop' on `ipif', and return - * them ala ire_check_and_create_bcast(). - */ -static ire_t ** -ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) -{ - ipaddr_t mask, addr; - - ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); - - addr = bireinfop->bi_addr; - irep = ire_create_bcast(ipif, addr, irep); - - switch (bireinfop->bi_type) { - case BCAST_NET: - mask = ip_net_mask(ipif->ipif_subnet); - irep = ire_create_bcast(ipif, addr | ~mask, irep); - break; - case BCAST_SUBNET: - mask = ipif->ipif_net_mask; - irep = ire_create_bcast(ipif, addr | ~mask, irep); - break; - } - - bireinfop->bi_haverep = 1; - return (irep); -} - -/* - * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' - * going away, and determine if any of the broadcast IREs (named by `bireinfop') - * that are going away are still needed. If so, have ipif_create_bcast() - * recreate them (except for the deprecated case, as explained below). 
- */ -static ire_t ** -ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, - ire_t **irep) -{ - int i; - ipif_t *ipif; - - ASSERT(!ill->ill_isv6); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - /* - * Skip this ipif if it's (a) the one being taken down, (b) - * not in the same zone, or (c) has no valid local address. - */ - if (ipif == test_ipif || - ipif->ipif_zoneid != test_ipif->ipif_zoneid || - ipif->ipif_subnet == 0 || - (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != - (IPIF_UP|IPIF_BROADCAST)) - continue; - - /* - * For each dying IRE that hasn't yet been replaced, see if - * `ipif' needs it and whether the IRE should be recreated on - * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() - * will return B_FALSE even if `ipif' needs the IRE on the - * hopes that we'll later find a needy non-deprecated ipif. - * However, the ipif is recorded in bi_backup for possible - * subsequent use by ipif_check_bcast_ires(). - */ - for (i = 0; i < BCAST_COUNT; i++) { - if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) - continue; - if (!ipif_consider_bcast(ipif, &bireinfo[i])) - continue; - irep = ipif_create_bcast(ipif, &bireinfo[i], irep); - } - - /* - * If we've replaced all of the broadcast IREs that are going - * to be taken down, we know we're done. - */ - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) - break; - } - if (i == BCAST_COUNT) - break; - } - return (irep); -} - -/* - * Check if `test_ipif' (which is going away) is associated with any existing - * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were - * using those broadcast IREs. If so, recreate the broadcast IREs on one or - * more of those other ipifs. (The old IREs will be deleted in ipif_down().) - * - * This is necessary because broadcast IREs are shared. 
In particular, a - * given ill has one set of all-zeroes and all-ones broadcast IREs (for every - * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, - * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP - * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the - * same zone, they will share the same set of broadcast IREs. - * - * Note: the upper bound of 12 IREs comes from the worst case of replacing all - * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, - * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). + * Mirror of ipif_create_bcast_ires() */ static void -ipif_check_bcast_ires(ipif_t *test_ipif) +ipif_delete_bcast_ires(ipif_t *ipif) { - ill_t *ill = test_ipif->ipif_ill; - ire_t *ire, *ire_array[12]; /* see note above */ - ire_t **irep1, **irep = &ire_array[0]; - uint_t i, willdie; - ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); - bcast_ireinfo_t bireinfo[BCAST_COUNT]; - - ASSERT(!test_ipif->ipif_isv6); - ASSERT(IAM_WRITER_IPIF(test_ipif)); - - /* - * No broadcast IREs for the LOOPBACK interface - * or others such as point to point and IPIF_NOXMIT. - */ - if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || - (test_ipif->ipif_flags & IPIF_NOXMIT)) - return; - - bzero(bireinfo, sizeof (bireinfo)); - bireinfo[0].bi_type = BCAST_ALLZEROES; - bireinfo[0].bi_addr = 0; - - bireinfo[1].bi_type = BCAST_ALLONES; - bireinfo[1].bi_addr = INADDR_BROADCAST; - - bireinfo[2].bi_type = BCAST_NET; - bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; - - if (test_ipif->ipif_net_mask != 0) - mask = test_ipif->ipif_net_mask; - bireinfo[3].bi_type = BCAST_SUBNET; - bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; - - /* - * Figure out what (if any) broadcast IREs will die as a result of - * `test_ipif' going away. If none will die, we're done. 
- */ - for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { - ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, - test_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); - if (ire != NULL) { - willdie++; - bireinfo[i].bi_willdie = 1; - ire_refrele(ire); - } - } - - if (willdie == 0) - return; - - /* - * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. Note that all interfaces in an - * IPMP illgrp share the same broadcast IREs, and thus the entire - * illgrp must be walked, starting with the IPMP meta-interface (so - * that broadcast IREs end up on it whenever possible). - */ - if (IS_UNDER_IPMP(ill)) - ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); - - irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + ipaddr_t addr; + ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); + ipaddr_t subnetmask = ipif->ipif_net_mask; + ill_t *ill = ipif->ipif_ill; + zoneid_t zoneid = ipif->ipif_zoneid; + ire_t *ire; - if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { - ipmp_illgrp_t *illg = ill->ill_grp; + ASSERT(ipif->ipif_flags & IPIF_BROADCAST); + ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); - ill = list_head(&illg->ig_if); - for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_willdie && - !bireinfo[i].bi_haverep) - break; - } - if (i == BCAST_COUNT) - break; + if (ipif->ipif_lcl_addr == INADDR_ANY || + (ipif->ipif_flags & IPIF_NOLOCAL)) + netmask = htonl(IN_CLASSA_NET); /* fallback */ - irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); - } - } + ire = ire_lookup_bcast(ill, 0, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); /* - * Scan through the set of broadcast IREs and see if there are any - * that we need to replace that have not yet been replaced. 
If so, - * replace them using the appropriate backup ipif. + * For backward compatibility, we create net broadcast IREs based on + * the old "IP address class system", since some old machines only + * respond to these class derived net broadcast. However, we must not + * create these net broadcast IREs if the subnetmask is shorter than + * the IP address class based derived netmask. Otherwise, we may + * create a net broadcast address which is the same as an IP address + * on the subnet -- and then TCP will refuse to talk to that address. */ - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) - irep = ipif_create_bcast(bireinfo[i].bi_backup, - &bireinfo[i], irep); + if (netmask < subnetmask) { + addr = netmask & ipif->ipif_subnet; + ire = ire_lookup_bcast(ill, addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); } /* - * If we can't create all of them, don't add any of them. (Code in - * ip_wput_ire() and ire_to_ill() assumes that we always have a - * non-loopback copy and loopback copy for a given address.) + * Don't create IRE_BROADCAST IREs for the interface if the subnetmask + * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already + * created. Creating these broadcast IREs will only create confusion + * as `addr' will be the same as the IP address. 
*/ - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (*irep1 == NULL) { - ip0dbg(("ipif_check_bcast_ires: can't create " - "IRE_BROADCAST, memory allocation failure\n")); - while (irep > ire_array) { - irep--; - if (*irep != NULL) - ire_delete(*irep); - } - return; - } - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) - ire_refrele(*irep1); /* Held in ire_add */ + if (subnetmask != 0xFFFFFFFF) { + addr = ipif->ipif_subnet; + ire = ire_lookup_bcast(ill, addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); } } @@ -17423,7 +15068,7 @@ ipif_check_bcast_ires(ipif_t *test_ipif) * Set IFF_IPV* and ill_isv6 prior to doing the lookup * since ipif_lookup_on_name uses the _isv6 flags when matching. * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. + * ipx_pending_mp and the ioctl will complete in ip_rput. * * Can operate on either a module or a driver queue. * Returns an error if not a module queue. @@ -17485,7 +15130,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * We start off as IFF_IPV4 in ipif_allocate and become * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. * The only flags that we read from user space are IFF_IPV4, - * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST. + * IFF_IPV6, and IFF_BROADCAST. * * This ill has not been inserted into the global list. 
* So we are still single threaded and don't need any lock @@ -17502,22 +15147,13 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } new_flags = - lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST); + lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " "IFF_IPV4 or IFF_IPV6\n")); return (EINVAL); } - /* - * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. - */ - if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) && - !(ipif->ipif_isv6)) { - ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on " - "IPv6 interface\n")); - return (EINVAL); - } /* * We always start off as IPv4, so only need to check for IPv6. @@ -17532,11 +15168,6 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, else ipif->ipif_flags &= ~IPIF_BROADCAST; - if ((new_flags & IFF_XRESOLV) != 0) - ill->ill_flags |= ILLF_XRESOLV; - else - ill->ill_flags &= ~ILLF_XRESOLV; - /* We started off as V4. 
*/ if (ill->ill_flags & ILLF_IPV6) { ill->ill_phyint->phyint_illv6 = ill; @@ -17566,23 +15197,17 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, */ ipif_t * ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) + ip_stack_t *ipst) { ill_t *ill; ipif_t *ipif = NULL; - ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || - (q != NULL && mp != NULL && func != NULL && err != NULL)); - - if (err != NULL) - *err = 0; - - ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); + ill = ill_lookup_on_ifindex(index, isv6, ipst); if (ill != NULL) { mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || + if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || zoneid == ipif->ipif_zoneid || ipif->ipif_zoneid == ALL_ZONES)) { ipif_refhold_locked(ipif); @@ -17591,8 +15216,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, } mutex_exit(&ill->ill_lock); ill_refrele(ill); - if (ipif == NULL && err != NULL) - *err = ENXIO; } return (ipif); } @@ -17673,6 +15296,8 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ILL_OTHER(ill)) ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); return (0); } @@ -17764,7 +15389,7 @@ ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } @@ -17801,6 +15426,9 @@ ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, /* Update sctp list */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); + if (need_up) { 
/* * Now bring the interface back up. If this @@ -17825,7 +15453,6 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, zone_t *zptr; zone_status_t status; - ASSERT(ipif->ipif_id != 0); ASSERT(ipip->ipi_cmd_type == LIF_CMD); if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) zoneid = GLOBAL_ZONEID; @@ -17863,7 +15490,7 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EINVAL); } - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, B_TRUE)); @@ -17943,6 +15570,16 @@ ill_prev_usesrc(ill_t *uill) * Release all members of the usesrc group. This routine is called * from ill_delete when the interface being unplumbed is the * group head. + * + * This silently clears the usesrc that ifconfig setup. + * An alternative would be to keep that ifindex, and drop packets on the floor + * since no source address can be selected. + * Even if we keep the current semantics, don't need a lock and a linked list. + * Can walk all the ills checking if they have a ill_usesrc_ifindex matching + * the one that is being removed. Issue is how we return the usesrc users + * (SIOCGLIFSRCOF). We want to be able to find the ills which have an + * ill_usesrc_ifindex matching a target ill. We could also do that with an + * ill walk, but the walker would need to insert in the ioctl response. 
*/ static void ill_disband_usesrc_group(ill_t *uill) @@ -18023,8 +15660,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { struct lifreq *lifr = (struct lifreq *)ifreq; - boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, - ill_flag_changed = B_FALSE; + boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; @@ -18035,7 +15671,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(q->q_next == NULL); ASSERT(CONN_Q(q)); - isv6 = (Q_TO_CONN(q))->conn_af_isv6; + isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -18048,10 +15684,9 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, reset_flg = B_TRUE; } - usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, - ip_process_ioctl, &err, ipst); + usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); if (usesrc_ill == NULL) { - return (err); + return (ENXIO); } ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, @@ -18101,31 +15736,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, usesrc_ill->ill_isv6)); /* - * The next step ensures that no new ires will be created referencing - * the client ill, until the ILL_CHANGING flag is cleared. Then - * we go through an ire walk deleting all ire caches that reference - * the client ill. New ires referencing the client ill that are added - * to the ire table before the ILL_CHANGING flag is set, will be - * cleaned up by the ire walk below. Attempt to add new ires referencing - * the client ill while the ILL_CHANGING flag is set will be failed - * during the ire_add in ire_atomic_start. ire_atomic_start atomically - * checks (under the ill_g_usesrc_lock) that the ire being added - * is not stale, i.e the ire_stq and ire_ipif are consistent and - * belong to the same usesrc group. 
- */ - mutex_enter(&usesrc_cli_ill->ill_lock); - usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; - mutex_exit(&usesrc_cli_ill->ill_lock); - ill_flag_changed = B_TRUE; - - if (ipif->ipif_isv6) - ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, - ALL_ZONES, ipst); - else - ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, - ALL_ZONES, ipst); - - /* * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next * and the ill_usesrc_ifindex fields */ @@ -18169,15 +15779,14 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, rw_exit(&ipst->ips_ill_g_usesrc_lock); done: - if (ill_flag_changed) { - mutex_enter(&usesrc_cli_ill->ill_lock); - usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&usesrc_cli_ill->ill_lock); - } if (ipsq != NULL) ipsq_exit(ipsq); /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ ill_refrele(usesrc_ill); + + /* Let conn_ixa caching know that source address selection changed */ + ip_update_source_selection(ipst); + return (err); } @@ -18384,7 +15993,6 @@ ill_phyint_reinit(ill_t *ill) * Now that the phyint's ifindex has been assigned, complete the * remaining */ - ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; if (ill->ill_isv6) { ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = @@ -18449,6 +16057,8 @@ ip_ifname_notify(ill_t *ill, queue_t *q) lifr->lifr_ppa = ill->ill_ppa; lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); + DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", + char *, "SIOCSLIFNAME", ill_t *, ill); putnext(q, mp1); } @@ -18503,23 +16113,6 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) */ err = ill_dl_phys(ill, ipif, mp, q); - /* - * If there is no IRE expiration timer running, get one started. - * igmp and mld timers will be triggered by the first multicast - */ - if (ipst->ips_ip_ire_expire_id == 0) { - /* - * acquire the lock and check again. 
- */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - if (ipst->ips_ip_ire_expire_id == 0) { - ipst->ips_ip_ire_expire_id = timeout( - ip_trash_timer_expire, ipst, - MSEC_TO_TICK(ipst->ips_ip_timer_interval)); - } - mutex_exit(&ipst->ips_ip_trash_timer_lock); - } - if (ill->ill_isv6) { mutex_enter(&ipst->ips_mld_slowtimeout_lock); if (ipst->ips_mld_slowtimeout_id == 0) { @@ -18545,7 +16138,7 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) * Common routine for ppa and ifname setting. Should be called exclusive. * * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. + * ipx_pending_mp and the ioctl will complete in ip_rput. * * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return * the new name and new ppa in lifr_name and lifr_ppa respectively. @@ -18576,6 +16169,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); ASSERT(ill->ill_ppa == UINT_MAX); + ill->ill_defend_start = ill->ill_defend_count = 0; /* The ppa is sent down by ifconfig or is chosen */ if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { return (EINVAL); @@ -18630,18 +16224,18 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) if (ill->ill_flags & ILLF_IPV6) { ill->ill_isv6 = B_TRUE; + ill_set_inputfn(ill); if (ill->ill_rq != NULL) { ill->ill_rq->q_qinfo = &iprinitv6; - ill->ill_wq->q_qinfo = &ipwinitv6; } /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ ipif->ipif_v6lcl_addr = ipv6_all_zeros; - ipif->ipif_v6src_addr = ipv6_all_zeros; ipif->ipif_v6subnet = ipv6_all_zeros; ipif->ipif_v6net_mask = ipv6_all_zeros; ipif->ipif_v6brd_addr = ipv6_all_zeros; ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; + ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; /* * point-to-point or Non-mulicast capable * interfaces won't do NUD unless explicitly @@ -18670,8 +16264,9 @@ 
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) ill->ill_flags |= ILLF_ROUTER; } else if (ill->ill_flags & ILLF_IPV4) { ill->ill_isv6 = B_FALSE; + ill_set_inputfn(ill); + ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); @@ -18783,6 +16378,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) * restore previous values */ ill->ill_isv6 = B_FALSE; + ill_set_inputfn(ill); } return (error); } @@ -18810,95 +16406,11 @@ ipif_init(ip_stack_t *ipst) } /* - * Lookup the ipif corresponding to the onlink destination address. For - * point-to-point interfaces, it matches with remote endpoint destination - * address. For point-to-multipoint interfaces it only tries to match the - * destination with the interface's subnet address. The longest, most specific - * match is found to take care of such rare network configurations like - - * le0: 129.146.1.1/16 - * le1: 129.146.2.2/24 - * - * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are - * supported on underlying interfaces in an IPMP group, underlying interfaces - * are ignored when looking up a match. (If we didn't ignore them, we'd - * risk using a test address as a source for outgoing traffic.) 
- */ -ipif_t * -ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) -{ - ipif_t *ipif, *best_ipif; - ill_t *ill; - ill_walk_context_t ctx; - - ASSERT(zoneid != ALL_ZONES); - best_ipif = NULL; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_V4(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) - continue; - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Point-to-point case. Look for exact match with - * destination address. - */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) { - if (ipif->ipif_pp_dst_addr == addr) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - if (best_ipif != NULL) - ipif_refrele(best_ipif); - return (ipif); - } - } else if (ipif->ipif_subnet == (addr & - ipif->ipif_net_mask)) { - /* - * Point-to-multipoint case. Looping through to - * find the most specific match. If there are - * multiple best match ipif's then prefer ipif's - * that are UP. If there is only one best match - * ipif and it is DOWN we must still return it. - */ - if ((best_ipif == NULL) || - (ipif->ipif_net_mask > - best_ipif->ipif_net_mask) || - ((ipif->ipif_net_mask == - best_ipif->ipif_net_mask) && - ((ipif->ipif_flags & IPIF_UP) && - (!(best_ipif->ipif_flags & IPIF_UP))))) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - if (best_ipif != NULL) - ipif_refrele(best_ipif); - best_ipif = ipif; - rw_enter(&ipst->ips_ill_g_lock, - RW_READER); - mutex_enter(&ill->ill_lock); - } - } - } - mutex_exit(&ill->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - return (best_ipif); -} - -/* * Save enough information so that we can recreate the IRE if * the interface goes down and then up. 
*/ -static void -ipif_save_ire(ipif_t *ipif, ire_t *ire) +void +ill_save_ire(ill_t *ill, ire_t *ire) { mblk_t *save_mp; @@ -18910,115 +16422,148 @@ ipif_save_ire(ipif_t *ipif, ire_t *ire) ifrt = (ifrt_t *)save_mp->b_rptr; bzero(ifrt, sizeof (ifrt_t)); ifrt->ifrt_type = ire->ire_type; - ifrt->ifrt_addr = ire->ire_addr; - ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; - ifrt->ifrt_src_addr = ire->ire_src_addr; - ifrt->ifrt_mask = ire->ire_mask; + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(!ill->ill_isv6); + ifrt->ifrt_addr = ire->ire_addr; + ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; + ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; + ifrt->ifrt_mask = ire->ire_mask; + } else { + ASSERT(ill->ill_isv6); + ifrt->ifrt_v6addr = ire->ire_addr_v6; + /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ + mutex_enter(&ire->ire_lock); + ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; + ifrt->ifrt_v6mask = ire->ire_mask_v6; + } ifrt->ifrt_flags = ire->ire_flags; - ifrt->ifrt_max_frag = ire->ire_max_frag; - mutex_enter(&ipif->ipif_saved_ire_lock); - save_mp->b_cont = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = save_mp; - ipif->ipif_saved_ire_cnt++; - mutex_exit(&ipif->ipif_saved_ire_lock); + ifrt->ifrt_zoneid = ire->ire_zoneid; + mutex_enter(&ill->ill_saved_ire_lock); + save_mp->b_cont = ill->ill_saved_ire_mp; + ill->ill_saved_ire_mp = save_mp; + ill->ill_saved_ire_cnt++; + mutex_exit(&ill->ill_saved_ire_lock); } } -static void -ipif_remove_ire(ipif_t *ipif, ire_t *ire) +/* + * Remove one entry from ill_saved_ire_mp. 
+ */ +void +ill_remove_saved_ire(ill_t *ill, ire_t *ire) { mblk_t **mpp; mblk_t *mp; ifrt_t *ifrt; - /* Remove from ipif_saved_ire_mp list if it is there */ - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; + /* Remove from ill_saved_ire_mp list if it is there */ + mutex_enter(&ill->ill_saved_ire_lock); + for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; mpp = &(*mpp)->b_cont) { + in6_addr_t gw_addr_v6; + /* - * On a given ipif, the triple of address, gateway and - * mask is unique for each saved IRE (in the case of - * ordinary interface routes, the gateway address is - * all-zeroes). + * On a given ill, the tuple of address, gateway, mask, + * ire_type, and zoneid is unique for each saved IRE. */ mp = *mpp; ifrt = (ifrt_t *)mp->b_rptr; - if (ifrt->ifrt_addr == ire->ire_addr && + /* ire_gateway_addr_v6 can change - need lock */ + mutex_enter(&ire->ire_lock); + gw_addr_v6 = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + + if (ifrt->ifrt_zoneid != ire->ire_zoneid || + ifrt->ifrt_type != ire->ire_type) + continue; + + if (ill->ill_isv6 ? + (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, + &ire->ire_addr_v6) && + IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, + &gw_addr_v6) && + IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, + &ire->ire_mask_v6)) : + (ifrt->ifrt_addr == ire->ire_addr && ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && - ifrt->ifrt_mask == ire->ire_mask) { + ifrt->ifrt_mask == ire->ire_mask)) { *mpp = mp->b_cont; - ipif->ipif_saved_ire_cnt--; + ill->ill_saved_ire_cnt--; freeb(mp); break; } } - mutex_exit(&ipif->ipif_saved_ire_lock); + mutex_exit(&ill->ill_saved_ire_lock); } /* * IP multirouting broadcast routes handling * Append CGTP broadcast IREs to regular ones created * at ifconfig time. + * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both + * the destination and the gateway are broadcast addresses. 
+ * The caller has verified that the destination is an IRE_BROADCAST and that + * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then + * we create a MULTIRT IRE_BROADCAST. + * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything + * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. */ static void -ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) +ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) { ire_t *ire_prim; ASSERT(ire != NULL); - ASSERT(ire_dst != NULL); - ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, + IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire_prim != NULL) { /* * We are in the special case of broadcasts for * CGTP. We add an IRE_BROADCAST that holds * the RTF_MULTIRT flag, the destination - * address of ire_dst and the low level + * address and the low level * info of ire_prim. In other words, CGTP * broadcast is added to the redundant ipif. 
*/ - ipif_t *ipif_prim; + ill_t *ill_prim; ire_t *bcast_ire; - ipif_prim = ire_prim->ire_ipif; + ill_prim = ire_prim->ire_ill; - ip2dbg(("ip_cgtp_filter_bcast_add: " - "ire_dst %p, ire_prim %p, ipif_prim %p\n", - (void *)ire_dst, (void *)ire_prim, - (void *)ipif_prim)); + ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", + (void *)ire_prim, (void *)ill_prim)); bcast_ire = ire_create( (uchar_t *)&ire->ire_addr, (uchar_t *)&ip_g_all_ones, - (uchar_t *)&ire_dst->ire_src_addr, (uchar_t *)&ire->ire_gateway_addr, - &ipif_prim->ipif_mtu, - NULL, - ipif_prim->ipif_rq, - ipif_prim->ipif_wq, IRE_BROADCAST, - ipif_prim, - 0, - 0, - 0, - ire->ire_flags, - &ire_uinfo_null, - NULL, + ill_prim, + GLOBAL_ZONEID, /* CGTP is only for the global zone */ + ire->ire_flags | RTF_KERNEL, NULL, ipst); + /* + * Here we assume that ire_add does head insertion so that + * the added IRE_BROADCAST comes before the existing IRE_HOST. + */ if (bcast_ire != NULL) { - - if (ire_add(&bcast_ire, NULL, NULL, NULL, - B_FALSE) == 0) { + if (ire->ire_flags & RTF_SETSRC) { + bcast_ire->ire_setsrc_addr = + ire->ire_setsrc_addr; + } + bcast_ire = ire_add(bcast_ire); + if (bcast_ire != NULL) { ip2dbg(("ip_cgtp_filter_bcast_add: " "added bcast_ire %p\n", (void *)bcast_ire)); - ipif_save_ire(bcast_ire->ire_ipif, - bcast_ire); + ill_save_ire(ill_prim, bcast_ire); ire_refrele(bcast_ire); } } @@ -19028,430 +16573,52 @@ ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) /* * IP multirouting broadcast routes handling - * Remove the broadcast ire + * Remove the broadcast ire. + * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both + * the destination and the gateway are broadcast addresses. + * The caller has only verified that RTF_MULTIRT was set. We check + * that the destination is broadcast and that the gateway is a broadcast + * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 
*/ static void ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) { - ire_t *ire_dst; - ASSERT(ire != NULL); - ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire_dst != NULL) { + + if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { ire_t *ire_prim; - ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, + IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, + ipst, NULL); if (ire_prim != NULL) { - ipif_t *ipif_prim; + ill_t *ill_prim; ire_t *bcast_ire; - ipif_prim = ire_prim->ire_ipif; + ill_prim = ire_prim->ire_ill; ip2dbg(("ip_cgtp_filter_bcast_delete: " - "ire_dst %p, ire_prim %p, ipif_prim %p\n", - (void *)ire_dst, (void *)ire_prim, - (void *)ipif_prim)); - - bcast_ire = ire_ctable_lookup(ire->ire_addr, - ire->ire_gateway_addr, - IRE_BROADCAST, - ipif_prim, ALL_ZONES, - NULL, - MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | - MATCH_IRE_MASK, ipst); + "ire_prim %p, ill_prim %p\n", + (void *)ire_prim, (void *)ill_prim)); + + bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, + ire->ire_gateway_addr, IRE_BROADCAST, + ill_prim, ALL_ZONES, NULL, + MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | + MATCH_IRE_MASK, 0, ipst, NULL); if (bcast_ire != NULL) { ip2dbg(("ip_cgtp_filter_bcast_delete: " "looked up bcast_ire %p\n", (void *)bcast_ire)); - ipif_remove_ire(bcast_ire->ire_ipif, + ill_remove_saved_ire(bcast_ire->ire_ill, bcast_ire); ire_delete(bcast_ire); ire_refrele(bcast_ire); } ire_refrele(ire_prim); } - ire_refrele(ire_dst); - } -} - -/* - * IPsec hardware acceleration capabilities related functions. - */ - -/* - * Free a per-ill IPsec capabilities structure. 
- */ -static void -ill_ipsec_capab_free(ill_ipsec_capab_t *capab) -{ - if (capab->auth_hw_algs != NULL) - kmem_free(capab->auth_hw_algs, capab->algs_size); - if (capab->encr_hw_algs != NULL) - kmem_free(capab->encr_hw_algs, capab->algs_size); - if (capab->encr_algparm != NULL) - kmem_free(capab->encr_algparm, capab->encr_algparm_size); - kmem_free(capab, sizeof (ill_ipsec_capab_t)); -} - -/* - * Allocate a new per-ill IPsec capabilities structure. This structure - * is specific to an IPsec protocol (AH or ESP). It is implemented as - * an array which specifies, for each algorithm, whether this algorithm - * is supported by the ill or not. - */ -static ill_ipsec_capab_t * -ill_ipsec_capab_alloc(void) -{ - ill_ipsec_capab_t *capab; - uint_t nelems; - - capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); - if (capab == NULL) - return (NULL); - - /* we need one bit per algorithm */ - nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); - capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); - - /* allocate memory to store algorithm flags */ - capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); - if (capab->encr_hw_algs == NULL) - goto nomem; - capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); - if (capab->auth_hw_algs == NULL) - goto nomem; - /* - * Leave encr_algparm NULL for now since we won't need it half - * the time - */ - return (capab); - -nomem: - ill_ipsec_capab_free(capab); - return (NULL); -} - -/* - * Resize capability array. Since we're exclusive, this is OK. 
- */ -static boolean_t -ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) -{ - ipsec_capab_algparm_t *nalp, *oalp; - uint32_t olen, nlen; - - oalp = capab->encr_algparm; - olen = capab->encr_algparm_size; - - if (oalp != NULL) { - if (algid < capab->encr_algparm_end) - return (B_TRUE); - } - - nlen = (algid + 1) * sizeof (*nalp); - nalp = kmem_zalloc(nlen, KM_NOSLEEP); - if (nalp == NULL) - return (B_FALSE); - - if (oalp != NULL) { - bcopy(oalp, nalp, olen); - kmem_free(oalp, olen); - } - capab->encr_algparm = nalp; - capab->encr_algparm_size = nlen; - capab->encr_algparm_end = algid + 1; - - return (B_TRUE); -} - -/* - * Compare the capabilities of the specified ill with the protocol - * and algorithms specified by the SA passed as argument. - * If they match, returns B_TRUE, B_FALSE if they do not match. - * - * The ill can be passed as a pointer to it, or by specifying its index - * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). - * - * Called by ipsec_out_is_accelerated() do decide whether an outbound - * packet is eligible for hardware acceleration, and by - * ill_ipsec_capab_send_all() to decide whether a SA must be sent down - * to a particular ill. - */ -boolean_t -ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, - ipsa_t *sa, netstack_t *ns) -{ - boolean_t sa_isv6; - uint_t algid; - struct ill_ipsec_capab_s *cpp; - boolean_t need_refrele = B_FALSE; - ip_stack_t *ipst = ns->netstack_ip; - - if (ill == NULL) { - ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, - NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); - return (B_FALSE); - } - need_refrele = B_TRUE; - } - - /* - * Use the address length specified by the SA to determine - * if it corresponds to a IPv6 address, and fail the matching - * if the isv6 flag passed as argument does not match. - * Note: this check is used for SADB capability checking before - * sending SA information to an ill. 
- */ - sa_isv6 = (sa->ipsa_addrfam == AF_INET6); - if (sa_isv6 != ill_isv6) - /* protocol mismatch */ - goto done; - - /* - * Check if the ill supports the protocol, algorithm(s) and - * key size(s) specified by the SA, and get the pointers to - * the algorithms supported by the ill. - */ - switch (sa->ipsa_type) { - - case SADB_SATYPE_ESP: - if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) - /* ill does not support ESP acceleration */ - goto done; - cpp = ill->ill_ipsec_capab_esp; - algid = sa->ipsa_auth_alg; - if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) - goto done; - algid = sa->ipsa_encr_alg; - if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) - goto done; - if (algid < cpp->encr_algparm_end) { - ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; - if (sa->ipsa_encrkeybits < alp->minkeylen) - goto done; - if (sa->ipsa_encrkeybits > alp->maxkeylen) - goto done; - } - break; - - case SADB_SATYPE_AH: - if (!(ill->ill_capabilities & ILL_CAPAB_AH)) - /* ill does not support AH acceleration */ - goto done; - if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, - ill->ill_ipsec_capab_ah->auth_hw_algs)) - goto done; - break; } - - if (need_refrele) - ill_refrele(ill); - return (B_TRUE); -done: - if (need_refrele) - ill_refrele(ill); - return (B_FALSE); -} - -/* - * Add a new ill to the list of IPsec capable ills. - * Called from ill_capability_ipsec_ack() when an ACK was received - * indicating that IPsec hardware processing was enabled for an ill. - * - * ill must point to the ill for which acceleration was enabled. - * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
- */ -static void -ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) -{ - ipsec_capab_ill_t **ills, *cur_ill, *new_ill; - uint_t sa_type; - uint_t ipproto; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || - (dl_cap == DL_CAPAB_IPSEC_ESP)); - - switch (dl_cap) { - case DL_CAPAB_IPSEC_AH: - sa_type = SADB_SATYPE_AH; - ills = &ipst->ips_ipsec_capab_ills_ah; - ipproto = IPPROTO_AH; - break; - case DL_CAPAB_IPSEC_ESP: - sa_type = SADB_SATYPE_ESP; - ills = &ipst->ips_ipsec_capab_ills_esp; - ipproto = IPPROTO_ESP; - break; - } - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); - - /* - * Add ill index to list of hardware accelerators. If - * already in list, do nothing. - */ - for (cur_ill = *ills; cur_ill != NULL && - (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || - cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) - ; - - if (cur_ill == NULL) { - /* if this is a new entry for this ill */ - new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); - if (new_ill == NULL) { - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - - new_ill->ill_index = ill->ill_phyint->phyint_ifindex; - new_ill->ill_isv6 = ill->ill_isv6; - new_ill->next = *ills; - *ills = new_ill; - } else if (!sadb_resync) { - /* not resync'ing SADB and an entry exists for this ill */ - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - - if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) - /* - * IPsec module for protocol loaded, initiate dump - * of the SADB to this ill. - */ - sadb_ill_download(ill, sa_type); -} - -/* - * Remove an ill from the list of IPsec capable ills. - */ -static void -ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) -{ - ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || - dl_cap == DL_CAPAB_IPSEC_ESP); - - ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? 
&ipst->ips_ipsec_capab_ills_ah : - &ipst->ips_ipsec_capab_ills_esp; - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); - - prev_ill = NULL; - for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != - ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != - ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) - ; - if (cur_ill == NULL) { - /* entry not found */ - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - if (prev_ill == NULL) { - /* entry at front of list */ - *ills = NULL; - } else { - prev_ill->next = cur_ill->next; - } - kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); - rw_exit(&ipst->ips_ipsec_capab_ills_lock); -} - -/* - * Called by SADB to send a DL_CONTROL_REQ message to every ill - * supporting the specified IPsec protocol acceleration. - * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. - * We free the mblk and, if sa is non-null, release the held referece. - */ -void -ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, - netstack_t *ns) -{ - ipsec_capab_ill_t *ici, *cur_ici; - ill_t *ill; - mblk_t *nmp, *mp_ship_list = NULL, *next_mp; - ip_stack_t *ipst = ns->netstack_ip; - - ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah : - ipst->ips_ipsec_capab_ills_esp; - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); - - for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { - ill = ill_lookup_on_ifindex(cur_ici->ill_index, - cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); - - /* - * Handle the case where the ill goes away while the SADB is - * attempting to send messages. If it's going away, it's - * nuking its shadow SADB, so we don't care.. - */ - - if (ill == NULL) - continue; - - if (sa != NULL) { - /* - * Make sure capabilities match before - * sending SA to ill. 
- */ - if (!ipsec_capab_match(ill, cur_ici->ill_index, - cur_ici->ill_isv6, sa, ipst->ips_netstack)) { - ill_refrele(ill); - continue; - } - - mutex_enter(&sa->ipsa_lock); - sa->ipsa_flags |= IPSA_F_HW; - mutex_exit(&sa->ipsa_lock); - } - - /* - * Copy template message, and add it to the front - * of the mblk ship list. We want to avoid holding - * the ipsec_capab_ills_lock while sending the - * message to the ills. - * - * The b_next and b_prev are temporarily used - * to build a list of mblks to be sent down, and to - * save the ill to which they must be sent. - */ - nmp = copymsg(mp); - if (nmp == NULL) { - ill_refrele(ill); - continue; - } - ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); - nmp->b_next = mp_ship_list; - mp_ship_list = nmp; - nmp->b_prev = (mblk_t *)ill; - } - - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - - for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { - /* restore the mblk to a sane state */ - next_mp = nmp->b_next; - nmp->b_next = NULL; - ill = (ill_t *)nmp->b_prev; - nmp->b_prev = NULL; - - ill_dlpi_send(ill, nmp); - ill_refrele(ill); - } - - if (sa != NULL) - IPSA_REFRELE(sa); - freemsg(mp); } /* @@ -19531,71 +16698,79 @@ ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) addr[0] &= ~0x2; /* set local bit */ } -/* ARGSUSED */ -static boolean_t -ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) +/* + * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. + */ +static void +ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) { - /* - * Multicast address mappings used over Ethernet/802.X. - * This address is used as a base for mappings. - */ - static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, - 0x00, 0x00, 0x00}; + phyint_t *phyi = ill->ill_phyint; /* - * Extract low order 32 bits from IPv6 multicast address. - * Or that into the link layer address, starting from the - * second byte. 
+ * Check PHYI_MULTI_BCAST and length of physical + * address to determine if we use the mapping or the + * broadcast address. */ - *hw_start = 2; - v6_extract_mask->s6_addr32[0] = 0; - v6_extract_mask->s6_addr32[1] = 0; - v6_extract_mask->s6_addr32[2] = 0; - v6_extract_mask->s6_addr32[3] = 0xffffffffU; - bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); - return (B_TRUE); + if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || + ill->ill_phys_addr_length != ETHERADDRL) { + ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); + return; + } + m_physaddr[0] = 0x33; + m_physaddr[1] = 0x33; + m_physaddr[2] = m_ip6addr[12]; + m_physaddr[3] = m_ip6addr[13]; + m_physaddr[4] = m_ip6addr[14]; + m_physaddr[5] = m_ip6addr[15]; } /* - * Indicate by return value whether multicast is supported. If not, - * this code should not touch/change any parameters. + * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. */ -/* ARGSUSED */ -static boolean_t -ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { + phyint_t *phyi = ill->ill_phyint; + /* - * Multicast address mappings used over Ethernet/802.X. - * This address is used as a base for mappings. + * Check PHYI_MULTI_BCAST and length of physical + * address to determine if we use the mapping or the + * broadcast address. 
*/ - static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, - 0x00, 0x00, 0x00 }; - - if (phys_length != ETHERADDRL) - return (B_FALSE); - - *extract_mask = htonl(0x007fffff); - *hw_start = 2; - bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); - return (B_TRUE); + if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || + ill->ill_phys_addr_length != ETHERADDRL) { + ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); + return; + } + m_physaddr[0] = 0x01; + m_physaddr[1] = 0x00; + m_physaddr[2] = 0x5e; + m_physaddr[3] = m_ipaddr[1] & 0x7f; + m_physaddr[4] = m_ipaddr[2]; + m_physaddr[5] = m_ipaddr[3]; } /* ARGSUSED */ -static boolean_t -ip_nodef_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - return (B_FALSE); -} + /* + * for the MULTI_BCAST case and other cases when we want to + * use the link-layer broadcast address for multicast. + */ + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; -/* ARGSUSED */ -static boolean_t -ip_nodef_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) -{ - return (B_FALSE); + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset + ill->ill_sap_length; + } + + bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); } /* @@ -19624,6 +16799,7 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) } /* + * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. * Note on mapping from multicast IP addresses to IPoIB multicast link * addresses. IPoIB multicast link addresses are based on IBA link addresses. * The format of an IPoIB multicast address is: @@ -19637,72 +16813,70 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) * network interface. 
They can be ascertained from the broadcast address. * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. */ - -static boolean_t -ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) +static void +ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - /* - * Base IPoIB IPv6 multicast address used for mappings. - * Does not contain the IBA scope/Pkey values. - */ - static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, - 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, + static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, + 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; + + bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); /* - * Extract low order 80 bits from IPv6 multicast address. - * Or that into the link layer address, starting from the - * sixth byte. + * RFC 4391: IPv4 MGID is 28-bit long. */ - *hw_start = 6; - bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); + m_physaddr[16] = m_ipaddr[0] & 0x0f; + m_physaddr[17] = m_ipaddr[1]; + m_physaddr[18] = m_ipaddr[2]; + m_physaddr[19] = m_ipaddr[3]; + + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + } /* * Now fill in the IBA scope/Pkey values from the broadcast address. 
*/ - *(maddr + 5) = *(bphys_addr + 5); - *(maddr + 8) = *(bphys_addr + 8); - *(maddr + 9) = *(bphys_addr + 9); - - v6_extract_mask->s6_addr32[0] = 0; - v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); - v6_extract_mask->s6_addr32[2] = 0xffffffffU; - v6_extract_mask->s6_addr32[3] = 0xffffffffU; - return (B_TRUE); + m_physaddr[5] = bphys_addr[5]; + m_physaddr[8] = bphys_addr[8]; + m_physaddr[9] = bphys_addr[9]; } -static boolean_t -ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - /* - * Base IPoIB IPv4 multicast address used for mappings. - * Does not contain the IBA scope/Pkey values. - */ static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, - 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; - if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) - return (B_FALSE); + bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); /* - * Extract low order 28 bits from IPv4 multicast address. - * Or that into the link layer address, starting from the - * sixteenth byte. + * RFC 4391: IPv4 MGID is 80-bit long. */ - *extract_mask = htonl(0x0fffffff); - *hw_start = 16; - bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); + bcopy(&m_ipaddr[6], &m_physaddr[10], 10); + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + } /* * Now fill in the IBA scope/Pkey values from the broadcast address. 
*/ - *(maddr + 5) = *(bphys_addr + 5); - *(maddr + 8) = *(bphys_addr + 8); - *(maddr + 9) = *(bphys_addr + 9); - return (B_TRUE); + m_physaddr[5] = bphys_addr[5]; + m_physaddr[8] = bphys_addr[8]; + m_physaddr[9] = bphys_addr[9]; } /* @@ -19758,56 +16932,34 @@ ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) } /* - * Returns B_TRUE if an ipif is present in the given zone, matching some flags - * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. - * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with - * the link-local address is preferred. + * Lookup an ill and verify that the zoneid has an ipif on that ill. + * Returns an held ill, or NULL. */ -boolean_t -ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) +ill_t * +ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, + ip_stack_t *ipst) { + ill_t *ill; ipif_t *ipif; - ipif_t *maybe_ipif = NULL; - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - if (ipifp != NULL) - *ipifp = NULL; - return (B_FALSE); - } + ill = ill_lookup_on_ifindex(index, isv6, ipst); + if (ill == NULL) + return (NULL); + mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; - if ((ipif->ipif_flags & flags) != flags) - continue; - if (ipifp == NULL) { - mutex_exit(&ill->ill_lock); - ASSERT(maybe_ipif == NULL); - return (B_TRUE); - } - if (!ill->ill_isv6 || - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - *ipifp = ipif; - return (B_TRUE); - } - if (maybe_ipif == NULL) - maybe_ipif = ipif; - } - if (ipifp != NULL) { - if (maybe_ipif != NULL) - ipif_refhold_locked(maybe_ipif); - *ipifp = maybe_ipif; + 
mutex_exit(&ill->ill_lock); + return (ill); } mutex_exit(&ill->ill_lock); - return (maybe_ipif != NULL); + ill_refrele(ill); + return (NULL); } /* @@ -19822,8 +16974,7 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); if (ill == NULL) return (NULL); @@ -19849,19 +17000,52 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, } /* - * Flush the fastpath by deleting any nce's that are waiting for the fastpath, - * There is one exceptions IRE_BROADCAST are difficult to recreate, - * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush() - * for details. + * Set ill_inputfn based on the current know state. + * This needs to be called when any of the factors taken into + * account changes. */ void -ill_fastpath_flush(ill_t *ill) +ill_set_inputfn(ill_t *ill) { - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; - nce_fastpath_list_dispatch(ill, NULL, NULL); - ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4), - ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE); + if (ill->ill_isv6) { + if (is_system_labeled()) + ill->ill_inputfn = ill_input_full_v6; + else + ill->ill_inputfn = ill_input_short_v6; + } else { + if (is_system_labeled()) + ill->ill_inputfn = ill_input_full_v4; + else if (ill->ill_dhcpinit != 0) + ill->ill_inputfn = ill_input_full_v4; + else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head + != NULL) + ill->ill_inputfn = ill_input_full_v4; + else if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) + ill->ill_inputfn = ill_input_full_v4; + else + ill->ill_inputfn = ill_input_short_v4; + } +} + +/* + * Re-evaluate ill_inputfn for all the IPv4 ills. + * Used when RSVP and CGTP comes and goes. 
+ */ +void +ill_set_inputfn_all(ip_stack_t *ipst) +{ + ill_walk_context_t ctx; + ill_t *ill; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V4(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) + ill_set_inputfn(ill); + + rw_exit(&ipst->ips_ill_g_lock); } /* @@ -19897,6 +17081,10 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) } ipsq_current_start(ipsq, ill->ill_ipif, 0); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); /* * If we can quiesce the ill, then set the address. If not, then @@ -19923,8 +17111,8 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * are passed (linked by b_cont), since we sometimes need to save two distinct * copies in the ill_t, and our context doesn't permit sleeping or allocation * failure (we'll free the other copy if it's not needed). Since the ill_t - * is quiesced, we know any stale IREs with the old address information have - * already been removed, so we don't need to call ill_fastpath_flush(). + * is quiesced, we know any stale nce's with the old address information have + * already been removed, so we don't need to call nce_flush(). 
*/ /* ARGSUSED */ static void @@ -19934,6 +17122,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) mblk_t *addrmp2 = unlinkb(addrmp); dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; uint_t addrlen, addroff; + int status; ASSERT(IAM_WRITER_IPSQ(ipsq)); @@ -19962,7 +17151,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) ill->ill_phys_addr = addrmp->b_rptr + addroff; ill->ill_phys_addr_mp = addrmp; ill->ill_phys_addr_length = addrlen; - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + if (ill->ill_isv6) ill_set_ndmp(ill, addrmp2, addroff, addrlen); else freemsg(addrmp2); @@ -19978,10 +17167,15 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) /* * If there are ipifs to bring up, ill_up_ipifs() will return * EINPROGRESS, and ipsq_current_finish() will be called by - * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is + * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is * brought up. */ - if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) + status = ill_up_ipifs(ill, q, addrmp); + mutex_enter(&ill->ill_lock); + if (ill->ill_dl_up) + ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; + mutex_exit(&ill->ill_lock); + if (status != EINPROGRESS) ipsq_current_finish(ipsq); } @@ -20009,6 +17203,11 @@ ill_replumb(ill_t *ill, mblk_t *mp) ipsq_current_start(ipsq, ill->ill_ipif, 0); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); + /* * If we can quiesce the ill, then continue. If not, then * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
@@ -20034,14 +17233,32 @@ static void ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { ill_t *ill = q->q_ptr; + int err; + conn_t *connp = NULL; ASSERT(IAM_WRITER_IPSQ(ipsq)); - - ill_down_ipifs_tail(ill); - freemsg(ill->ill_replumb_mp); ill->ill_replumb_mp = copyb(mp); + if (ill->ill_replumb_mp == NULL) { + /* out of memory */ + ipsq_current_finish(ipsq); + return; + } + + mutex_enter(&ill->ill_lock); + ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, + ill->ill_rq, ill->ill_replumb_mp, 0); + mutex_exit(&ill->ill_lock); + + if (!ill->ill_up_ipifs) { + /* already closing */ + ipsq_current_finish(ipsq); + return; + } + ill->ill_replumbing = 1; + err = ill_down_ipifs_tail(ill); + /* * Successfully quiesced and brought down the interface, now we send * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the @@ -20055,15 +17272,23 @@ ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) ill_dlpi_send(ill, mp); /* - * If there are ipifs to bring up, ill_up_ipifs() will return - * EINPROGRESS, and ipsq_current_finish() will be called by - * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is - * brought up. + * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP + * streams have to be unbound. When all the DLPI exchanges are done, + * ipsq_current_finish() will be called by arp_bringup_done(). The + * remainder of ipif bringup via ill_up_ipifs() will also be done in + * arp_bringup_done(). 
*/ - if (ill->ill_replumb_mp == NULL || - ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) { - ipsq_current_finish(ipsq); + ASSERT(ill->ill_replumb_mp != NULL); + if (err == EINPROGRESS) + return; + else + ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(connp == NULL); + if (err == 0 && ill->ill_replumb_mp != NULL && + ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { + return; } + ipsq_current_finish(ipsq); } /* @@ -20342,6 +17567,338 @@ fail: "information for %s (ENOMEM)\n", str, ill->ill_name)); } +static int +ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) +{ + int err = 0; + const in_addr_t *addr = NULL; + nce_t *nce = NULL; + ill_t *ill = ipif->ipif_ill; + ill_t *bound_ill; + boolean_t added_ipif = B_FALSE; + uint16_t state; + uint16_t flags; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_lcl_addr != INADDR_ANY) { + addr = &ipif->ipif_lcl_addr; + } + + if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { + if (res_act != Res_act_initial) + return (EINVAL); + } + + if (addr != NULL) { + ipmp_illgrp_t *illg = ill->ill_grp; + + /* add unicast nce for the local addr */ + + if (IS_IPMP(ill)) { + /* + * If we're here via ipif_up(), then the ipif + * won't be bound yet -- add it to the group, + * which will bind it if possible. (We would + * add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via + * ipmp_ill_bind_ipif(), then the ipif has + * already been added to the group and we + * just need to use the binding. + */ + if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { + bound_ill = ipmp_illgrp_add_ipif(illg, ipif); + if (bound_ill == NULL) { + /* + * We couldn't bind the ipif to an ill + * yet, so we have nothing to publish. + * Mark the address as ready and return. 
+ */ + ipif->ipif_addr_ready = 1; + return (0); + } + added_ipif = B_TRUE; + } + } else { + bound_ill = ill; + } + + flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | + NCE_F_NONUD); + /* + * If this is an initial bring-up (or the ipif was never + * completely brought up), do DAD. Otherwise, we're here + * because IPMP has rebound an address to this ill: send + * unsolicited advertisements (ARP announcements) to + * inform others. + */ + if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { + state = ND_UNCHANGED; /* compute in nce_add_common() */ + } else { + state = ND_REACHABLE; + flags |= NCE_F_UNSOL_ADV; + } + +retry: + err = nce_lookup_then_add_v4(ill, + bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, + addr, flags, state, &nce); + + /* + * note that we may encounter EEXIST if we are moving + * the nce as a result of a rebind operation. + */ + switch (err) { + case 0: + ipif->ipif_added_nce = 1; + nce->nce_ipif_cnt++; + break; + case EEXIST: + ip1dbg(("ipif_arp_up: NCE already exists for %s\n", + ill->ill_name)); + if (!NCE_MYADDR(nce->nce_common)) { + /* + * A leftover nce from before this address + * existed + */ + ncec_delete(nce->nce_common); + nce_refrele(nce); + nce = NULL; + goto retry; + } + if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { + nce_refrele(nce); + nce = NULL; + ip1dbg(("ipif_arp_up: NCE already exists " + "for %s:%u\n", ill->ill_name, + ipif->ipif_id)); + goto arp_up_done; + } + /* + * Duplicate local addresses are permissible for + * IPIF_POINTOPOINT interfaces which will get marked + * IPIF_UNNUMBERED later in + * ip_addr_availability_check(). + * + * The nce_ipif_cnt field tracks the number of + * ipifs that have nce_addr as their local address. 
+ */ + ipif->ipif_addr_ready = 1; + ipif->ipif_added_nce = 1; + nce->nce_ipif_cnt++; + err = 0; + break; + default: + ASSERT(nce == NULL); + goto arp_up_done; + } + if (arp_no_defense) { + if ((ipif->ipif_flags & IPIF_UP) && + !ipif->ipif_addr_ready) + ipif_up_notify(ipif); + ipif->ipif_addr_ready = 1; + } + } else { + /* zero address. nothing to publish */ + ipif->ipif_addr_ready = 1; + } + if (nce != NULL) + nce_refrele(nce); +arp_up_done: + if (added_ipif && err != 0) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + return (err); +} + +int +ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) +{ + int err = 0; + ill_t *ill = ipif->ipif_ill; + boolean_t first_interface, wait_for_dlpi = B_FALSE; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", + ill_t *, ill, ipif_t *, ipif); + + /* + * need to bring up ARP or setup mcast mapping only + * when the first interface is coming UP. + */ + first_interface = (ill->ill_ipif_up_count == 0 && + ill->ill_ipif_dup_count == 0 && !was_dup); + + if (res_act == Res_act_initial && first_interface) { + /* + * Send ATTACH + BIND + */ + err = arp_ll_up(ill); + if (err != EINPROGRESS && err != 0) + return (err); + + /* + * Add NCE for local address. Start DAD. + * we'll wait to hear that DAD has finished + * before using the interface. + */ + if (err == EINPROGRESS) + wait_for_dlpi = B_TRUE; + } + + if (!wait_for_dlpi) + (void) ipif_arp_up_done_tail(ipif, res_act); + + return (!wait_for_dlpi ? 0 : EINPROGRESS); +} + +/* + * Finish processing of "arp_up" after all the DLPI message + * exchanges have completed between arp and the driver. 
+ */ +void +arp_bringup_done(ill_t *ill, int err) +{ + mblk_t *mp1; + ipif_t *ipif; + conn_t *connp = NULL; + ipsq_t *ipsq; + queue_t *q; + + ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); + + ASSERT(IAM_WRITER_ILL(ill)); + + ipsq = ill->ill_phyint->phyint_ipsq; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); + if (mp1 == NULL) /* bringup was aborted by the user */ + return; + + /* + * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * must have an associated conn_t. Otherwise, we're bringing this + * interface back up as part of handling an asynchronous event (e.g., + * physical address change). + */ + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + ASSERT(connp != NULL); + q = CONNP_TO_WQ(connp); + } else { + ASSERT(connp == NULL); + q = ill->ill_rq; + } + if (err == 0) { + if (ipif->ipif_isv6) { + if ((err = ipif_up_done_v6(ipif)) != 0) + ip0dbg(("arp_bringup_done: init failed\n")); + } else { + err = ipif_arp_up_done_tail(ipif, Res_act_initial); + if (err != 0 || (err = ipif_up_done(ipif)) != 0) + ip0dbg(("arp_bringup_done: init failed\n")); + } + } else { + ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); + } + + if ((err == 0) && (ill->ill_up_ipifs)) { + err = ill_up_ipifs(ill, q, mp1); + if (err == EINPROGRESS) + return; + } + + /* + * If we have a moved ipif to bring up, and everything has succeeded + * to this point, bring it up on the IPMP ill. Otherwise, leave it + * down -- the admin can try to bring it up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif, + ipif->ipif_ill->ill_name)); + ill->ill_move_ipif = NULL; + if (err == 0) { + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) + return; + } + } + + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 
+ * Otherwise, the operation will be stuck forever in the ipsq. + */ + ASSERT(err != EINPROGRESS); + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, + ill_t *, ill, ipif_t *, ipif); + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + } else { + ipsq_current_finish(ipsq); + } +} + +/* + * Finish processing of arp replumb after all the DLPI message + * exchanges have completed between arp and the driver. + */ +void +arp_replumb_done(ill_t *ill, int err) +{ + mblk_t *mp1; + ipif_t *ipif; + conn_t *connp = NULL; + ipsq_t *ipsq; + queue_t *q; + + ASSERT(IAM_WRITER_ILL(ill)); + + ipsq = ill->ill_phyint->phyint_ipsq; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); + if (mp1 == NULL) { + ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n", + ipsq->ipsq_xop->ipx_current_ioctl)); + /* bringup was aborted by the user */ + return; + } + /* + * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * must have an associated conn_t. Otherwise, we're bringing this + * interface back up as part of handling an asynchronous event (e.g., + * physical address change). + */ + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + ASSERT(connp != NULL); + q = CONNP_TO_WQ(connp); + } else { + ASSERT(connp == NULL); + q = ill->ill_rq; + } + if ((err == 0) && (ill->ill_up_ipifs)) { + err = ill_up_ipifs(ill, q, mp1); + if (err == EINPROGRESS) + return; + } + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. + * Otherwise, the operation will be stuck forever in the ipsq. 
+ */ + ASSERT(err != EINPROGRESS); + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + DTRACE_PROBE4(ipif__ioctl, char *, + "arp_replumb_done finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, + ill_t *, ill, ipif_t *, ipif); + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + } else { + ipsq_current_finish(ipsq); + } +} + void ipif_up_notify(ipif_t *ipif) { @@ -20610,3 +18167,48 @@ ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, done: return (ret); } + +/* Remove all cache entries for this logical interface */ +void +ipif_nce_down(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + nce_t *nce; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_added_nce) { + if (ipif->ipif_isv6) + nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); + else + nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); + if (nce != NULL) { + if (--nce->nce_ipif_cnt == 0) + ncec_delete(nce->nce_common); + ipif->ipif_added_nce = 0; + nce_refrele(nce); + } else { + /* + * nce may already be NULL because it was already + * flushed, e.g., due to a call to nce_flush + */ + ipif->ipif_added_nce = 0; + } + } + /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* + * Remove all other nces dependent on this ill when the last ipif + * is going away. + */ + if (ill->ill_ipif_up_count == 0) { + ncec_walk(ill, (pfi_t)ncec_delete_per_ill, + (uchar_t *)ill, ill->ill_ipst); + if (IS_UNDER_IPMP(ill)) + nce_flush(ill, B_TRUE); + } +} diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c new file mode 100644 index 0000000000..d47670f85d --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_input.c @@ -0,0 +1,3095 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/stropts.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strlog.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/modctl.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/priv.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/vtrace.h> +#include <sys/isa_defs.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include 
<inet/ip6_asp.h> +#include <inet/optcom.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <inet/ilb_ip.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <sys/ethernet.h> +#include <net/if_types.h> +#include <sys/cpuvar.h> + +#include <ipp/ipp.h> +#include <ipp/ipp_impl.h> +#include <ipp/ipgpc/ipgpc.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#include <rpc/pmap_prot.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static void ip_input_local_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); + +static void ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); +static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); + +#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4) + +/* + * Direct read side procedure capable of dealing with chains. GLDv3 based + * drivers call this function directly with mblk chains while STREAMS + * read side procedure ip_rput() calls this for single packet with ip_ring + * set to NULL to process one packet at a time. + * + * The ill will always be valid if this function is called directly from + * the driver. + * + * If ip_input() is called from GLDv3: + * + * - This must be a non-VLAN IP stream. + * - 'mp' is either an untagged or a special priority-tagged packet. 
+ * - Any VLAN tag that was in the MAC header has been stripped. + * + * If the IP header in packet is not 32-bit aligned, every message in the + * chain will be aligned before further operations. This is required on SPARC + * platform. + */ +void +ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip) +{ + (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL, + NULL); +} + +/* + * ip_accept_tcp() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. + * + * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4 + * will pass back any TCP packets matching the target sqp to + * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by + * ip_input_v4 and ip_fanout_v4 as normal. + * The TCP packets that match the target squeue are returned to the caller + * as a b_next chain after each packet has been prepend with an mblk + * from ip_recv_attr_to_mblk. + */ +mblk_t * +ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp, + last, cnt)); +} + +/* + * Used by ip_input and ip_accept_tcp + * The last three arguments are only used by ip_accept_tcp, and mhip is + * only used by ip_input. 
+ */ +mblk_t * +ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip, squeue_t *target_sqp, + mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ipha_t *ipha; + ip_recv_attr_t iras; /* Receive attributes */ + rtc_t rtc; + iaflags_t chain_flags = 0; /* Fixed for chain */ + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + + ASSERT(mp_chain != NULL); + ASSERT(ill != NULL); + + /* These ones do not change as we loop over packets */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_sqp = NULL; + iras.ira_ring = ip_ring; + /* For ECMP and outbound transmit ring selection */ + iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); + + iras.ira_target_sqp = target_sqp; + iras.ira_target_sqp_mp = NULL; + if (target_sqp != NULL) + chain_flags |= IRAF_TARGET_SQP; + + /* + * We try to have a mhip pointer when possible, but + * it might be NULL in some cases. In those cases we + * have to assume unicast. + */ + iras.ira_mhip = mhip; + iras.ira_flags = 0; + if (mhip != NULL) { + switch (mhip->mhi_dsttype) { + case MAC_ADDRTYPE_MULTICAST : + chain_flags |= IRAF_L2DST_MULTICAST; + break; + case MAC_ADDRTYPE_BROADCAST : + chain_flags |= IRAF_L2DST_BROADCAST; + break; + } + } + + /* + * Initialize the one-element route cache. + * + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_route_recursive for each of the succeeding + * packets in a packet chain. + */ + rtc.rtc_ire = NULL; + rtc.rtc_ipaddr = INADDR_ANY; + + /* Loop over b_next */ + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + mp_chain = mp->b_next; + mp->b_next = NULL; + + ASSERT(DB_TYPE(mp) == M_DATA); + + + /* + * if db_ref > 1 then copymsg and free original. 
Packet + * may be changed and we do not want the other entity + * who has a reference to this message to trip over the + * changes. This is a blind change because trying to + * catch all places that might change the packet is too + * difficult. + * + * This corresponds to the fast path case, where we have + * a chain of M_DATA mblks. We check the db_ref count + * of only the 1st data block in the mblk chain. There + * doesn't seem to be a reason why a device driver would + * send up data with varying db_ref counts in the mblk + * chain. In any case the Fast path is a private + * interface, and our drivers don't do such a thing. + * Given the above assumption, there is no need to walk + * down the entire mblk chain (which could have a + * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). + */ + if (DB_REF(mp) > 1) { + if ((mp = ip_fix_dbref(mp, &iras)) == NULL) { + /* mhip might point into 1st packet in chain */ + iras.ira_mhip = NULL; + continue; + } + } + + /* + * IP header ptr not aligned? 
+ * OR IP header not complete in first mblk + */ + ipha = (ipha_t *)mp->b_rptr; + if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { + mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH, + &iras); + if (mp == NULL) { + /* mhip might point into 1st packet in chain */ + iras.ira_mhip = NULL; + continue; + } + ipha = (ipha_t *)mp->b_rptr; + } + + /* Protect against a mix of Ethertypes and IP versions */ + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Check for Martian addrs; we have to explicitly + * test for for zero dst since this is also used as + * an indication that the rtc is not used. + */ + if (ipha->ipha_dst == INADDR_ANY) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Keep L2SRC from a previous packet in chain since mhip + * might point into an earlier packet in the chain. + * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast + * source check in forwarding path. + */ + chain_flags |= (iras.ira_flags & + (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC)); + + iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM | + IRAF_VERIFY_ULP_CKSUM | chain_flags; + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ + + /* + * We must count all incoming packets, even if they end + * up being dropped later on. Defer counting bytes until + * we have the whole IP header in first mblk. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + iras.ira_pktlen = ntohs(ipha->ipha_length); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + iras.ira_pktlen); + + /* + * Call one of: + * ill_input_full_v4 + * ill_input_short_v4 + * The former is used in unusual cases. See ill_set_inputfn(). + */ + (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); + + /* Any references to clean up? No hold on ira_ill */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); + + if (iras.ira_target_sqp_mp != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + + /* Found one packet to accept */ + mp = iras.ira_target_sqp_mp; + iras.ira_target_sqp_mp = NULL; + ASSERT(ip_recv_attr_is_mblk(mp)); + + if (atail != NULL) + atail->b_next = mp; + else + ahead = mp; + atail = mp; + acnt++; + mp = NULL; + } + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + } + /* Any remaining references to the route cache? 
*/ + if (rtc.rtc_ire != NULL) { + ASSERT(rtc.rtc_ipaddr != INADDR_ANY); + ire_refrele(rtc.rtc_ire); + } + + if (ahead != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); +} + +/* + * This input function is used when + * - is_system_labeled() + * - CGTP filtering + * - DHCP unicast before we have an IP address configured + * - there is an listener for IPPROTO_RSVP + */ +void +ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + int cgtp_flt_pkt; + + ASSERT(ira->ira_tsl == NULL); + + /* + * Attach any necessary label information to + * this packet + */ + if (is_system_labeled()) { + ira->ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. + */ + if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* Note that ira_tsl can be NULL here. */ + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * Invoke the CGTP (multirouting) filtering module to process + * the incoming packet. Packets identified as duplicates + * must be discarded. Filtering is active only if the + * the ip_cgtp_filter ndd variable is non-zero. + */ + cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; + if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) { + netstackid_t stackid; + + stackid = ipst->ips_netstack->netstack_stackid; + /* + * CGTP and IPMP are mutually exclusive so + * phyint_ifindex is fine here. 
+ */ + cgtp_flt_pkt = + ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, + ill->ill_phyint->phyint_ifindex, mp); + if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { + ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); + freemsg(mp); + return; + } + } + + /* + * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP + * server to unicast DHCP packets to a DHCP client using the + * IP address it is offering to the client. This can be + * disabled through the "broadcast bit", but not all DHCP + * servers honor that bit. Therefore, to interoperate with as + * many DHCP servers as possible, the DHCP client allows the + * server to unicast, but we treat those packets as broadcast + * here. Note that we don't rewrite the packet itself since + * (a) that would mess up the checksums and (b) the DHCP + * client conn is bound to INADDR_ANY so ip_fanout_udp() will + * hand it the packet regardless. + */ + if (ill->ill_dhcpinit != 0 && + ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION && + ipha->ipha_protocol == IPPROTO_UDP) { + udpha_t *udpha; + + ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill); + freemsg(mp); + return; + } + /* Reload since pullupmsg() can change b_rptr. */ + udpha = (udpha_t *)&ipha[1]; + + if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { + DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, + mblk_t *, mp); + /* + * This assumes that we deliver to all conns for + * multicast and broadcast packets. + */ + nexthop = INADDR_BROADCAST; + ira->ira_flags |= IRAF_DHCP_UNICAST; + } + } + + /* + * If rsvpd is running, let RSVP daemon handle its processing + * and forwarding of RSVP multicast/unicast packets. + * If rsvpd is not running but mrouted is running, RSVP + * multicast packets are forwarded as multicast traffic + * and RSVP unicast packets are forwarded by unicast router. 
+ * If neither rsvpd nor mrouted is running, RSVP multicast + * packets are not forwarded, but the unicast packets are + * forwarded like unicast traffic. + */ + if (ipha->ipha_protocol == IPPROTO_RSVP && + ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { + /* RSVP packet and rsvpd running. Treat as ours */ + ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop))); + /* + * We use a multicast address to get the packet to + * ire_recv_multicast_v4. There will not be a membership + * check since we set IRAF_RSVP + */ + nexthop = htonl(INADDR_UNSPEC_GROUP); + ira->ira_flags |= IRAF_RSVP; + } + + ill_input_short_v4(mp, ipha, &nexthop, ira, rtc); +} + +/* + * This is the tail-end of the full receive side packet handling. + * It can be used directly when the configuration is simple. + */ +void +ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ire_t *ire; + uint_t opt_len; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t pkt_len; + ssize_t len; + ipha_t *ipha = (ipha_t *)iph_arg; + ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; + ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; +#define rptr ((uchar_t *)ipha) + + ASSERT(DB_TYPE(mp) == M_DATA); + + /* + * The following test for loopback is faster than + * IP_LOOPBACK_ADDR(), because it avoids any bitwise + * operations. 
+ * Note that these addresses are always in network byte order + */ + if (((*(uchar_t *)&ipha->ipha_dst) == 127) || + ((*(uchar_t *)&ipha->ipha_src) == 127)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + + len = mp->b_wptr - rptr; + pkt_len = ira->ira_pktlen; + + /* multiple mblk or too short */ + len -= pkt_len; + if (len != 0) { + mp = ip_check_length(mp, rptr, len, pkt_len, + IP_SIMPLE_HDR_LENGTH, ira); + if (mp == NULL) + return; + ipha = (ipha_t *)mp->b_rptr; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 0); + + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. + */ + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) { + int ll_multicast = 0; + int error; + ipaddr_t orig_dst = ipha->ipha_dst; + + if (ira->ira_flags & IRAF_L2DST_MULTICAST) + ll_multicast = HPE_MULTICAST; + else if (ira->ira_flags & IRAF_L2DST_BROADCAST) + ll_multicast = HPE_BROADCAST; + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, ll_multicast, ipst, error); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + if (mp == NULL) + return; + /* The length could have changed */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + pkt_len = ira->ira_pktlen; + + /* + * In case the destination changed we override any previous + * change to nexthop. 
+ */ + if (orig_dst != ipha->ipha_dst) + nexthop = ipha->ipha_dst; + if (nexthop == INADDR_ANY) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t dzone; + + /* + * On the inbound path the src zone will be unknown as + * this packet has come from the wire. + */ + dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); + } + + /* + * If there is a good HW IP header checksum we clear the need + * look at the IP header checksum. + */ + if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && + ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { + /* Header checksum was ok. Clear the flag */ + DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; + ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; + } + + /* + * Here we check to see if we machine is setup as + * L3 loadbalancer and if the incoming packet is for a VIP + * + * Check the following: + * - there is at least a rule + * - protocol of the packet is supported + */ + if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { + ipaddr_t lb_dst; + int lb_ret; + + /* For convenience, we pull up the mblk. */ + if (mp->b_cont != NULL) { + if (pullupmsg(mp, -1) == 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", + mp, ill); + freemsg(mp); + return; + } + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * We just drop all fragments going to any VIP, at + * least for now.... 
+ */ + if (ntohs(ipha->ipha_fragment_offset_and_flags) & + (IPH_MF | IPH_OFFSET)) { + if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) { + goto after_ilb; + } + + ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); + ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB fragment", mp, ill); + freemsg(mp); + return; + } + lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol, + (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst); + + if (lb_ret == ILB_DROPPED) { + /* Is this the right counter to increase? */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB_DROPPED", mp, ill); + freemsg(mp); + return; + } + if (lb_ret == ILB_BALANCED) { + /* Set the dst to that of the chosen server */ + nexthop = lb_dst; + DB_CKSUMFLAGS(mp) = 0; + } + } + +after_ilb: + opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; + ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH; + if (opt_len != 0) { + int error = 0; + + ira->ira_ip_hdr_length += (opt_len << 2); + ira->ira_flags |= IRAF_IPV4_OPTIONS; + + /* IP Options present! Validate the length. */ + mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira); + if (mp == NULL) + return; + + /* Might have changed */ + ipha = (ipha_t *)mp->b_rptr; + + /* Verify IP header checksum before parsing the options */ + if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && + ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; + + /* + * Go off to ip_input_options which returns the next hop + * destination address, which may have been affected + * by source routing. + */ + IP_STAT(ipst, ip_opt); + + nexthop = ip_input_options(ipha, nexthop, mp, ira, &error); + if (error != 0) { + /* + * An ICMP error has been sent and the packet has + * been dropped. 
+ */ + return; + } + } + /* Can not use route cache with TX since the labels can differ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (CLASSD(nexthop)) { + ire = ire_multicast(ill); + } else { + /* Match destination and label */ + ire = ire_route_recursive_v4(nexthop, 0, NULL, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), + ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + } + /* Update the route cache so we do the ire_refrele */ + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ipaddr = nexthop; + } else if (nexthop == rtc->rtc_ipaddr) { + /* Use the route cache */ + ASSERT(rtc->rtc_ire != NULL); + ire = rtc->rtc_ire; + } else { + /* Update the route cache */ + if (CLASSD(nexthop)) { + ire = ire_multicast(ill); + } else { + /* Just match the destination */ + ire = ire_route_recursive_dstonly_v4(nexthop, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, + ipst); + } + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ipaddr = nexthop; + } + + ire->ire_ib_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_recv_local_v4 - for IRE_LOCAL + * ire_recv_loopback_v4 - for IRE_LOOPBACK + * ire_recv_multirt_v4 - if RTF_MULTIRT + * ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE + * ire_recv_multicast_v4 - for IRE_MULTICAST + * ire_recv_broadcast_v4 - for IRE_BROADCAST + * ire_recv_noaccept_v4 - for ire_noaccept ones + * ire_recv_forward_v4 - for the rest. 
+ */ + (*ire->ire_recvfn)(ire, mp, ipha, ira); +} +#undef rptr + +/* + * ire_recvfn for IREs that need forwarding + */ +void +ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ill_t *dst_ill; + nce_t *nce; + ipaddr_t src = ipha->ipha_src; + uint32_t added_tx_len; + uint32_t mtu, iremtu; + + if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("l2 multicast not forwarded", mp, ill); + freemsg(mp); + return; + } + + if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + return; + } + + /* + * Either ire_nce_capable or ire_dep_parent would be set for the IRE + * when it is found by ire_route_recursive, but that some other thread + * could have changed the routes with the effect of clearing + * ire_dep_parent. In that case we'd end up dropping the packet, or + * finding a new nce below. + * Get, allocate, or update the nce. + * We get a refhold on ire_nce_cache as a result of this to avoid races + * where ire_nce_cache is deleted. + * + * This ensures that we don't forward if the interface is down since + * ipif_down removes all the nces. 
+ */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL) { + /* Not yet set up - try to set one up */ + mutex_exit(&ire->ire_lock); + (void) ire_revalidate_nce(ire); + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL) { + mutex_exit(&ire->ire_lock); + /* The ire_dep_parent chain went bad, or no memory */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("No ire_dep_parent", mp, ill); + freemsg(mp); + return; + } + } + nce_refhold(nce); + mutex_exit(&ire->ire_lock); + + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE); + nce_refrele(nce); + if (nce1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("No nce", mp, ill); + freemsg(mp); + return; + } + nce = nce1; + } + dst_ill = nce->nce_ill; + + /* + * Unless we are forwarding, drop the packet. + * We have to let source routed packets through if they go out + * the same interface i.e., they are 'ping -l' packets. + */ + if (!(dst_ill->ill_flags & ILLF_ROUTER) && + !(ip_source_routed(ipha, ipst) && dst_ill == ill)) { + if (ip_source_routed(ipha, ipst)) { + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + nce_refrele(nce); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + + if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) { + ipaddr_t dst = ipha->ipha_dst; + + ire->ire_ib_pkt_count--; + /* + * Should only use IREs that are visible from the + * global zone for forwarding. + * Take a source route into account the same way as ip_input + * did. 
+ */ + if (ira->ira_flags & IRAF_IPV4_OPTIONS) { + int error = 0; + + dst = ip_input_options(ipha, dst, mp, ira, &error); + ASSERT(error == 0); /* ip_input checked */ + } + ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID, + ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst, + NULL, NULL, NULL); + ire->ire_ib_pkt_count++; + (*ire->ire_recvfn)(ire, mp, ipha, ira); + ire_refrele(ire); + nce_refrele(nce); + return; + } + + /* + * ipIfStatsHCInForwDatagrams should only be increment if there + * will be an attempt to forward the packet, which is why we + * increment after the above condition has been checked. + */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + + /* Initiate Read side IPPF processing */ + if (IPP_ENABLED(IPP_FWD_IN, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_FWD_IN, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred " + "during IPPF processing\n")); + nce_refrele(nce); + return; + } + } + + DTRACE_PROBE4(ip4__forwarding__start, + ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp); + + if (HOOKS4_INTERESTED_FORWARDING(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip4_forwarding_event, + ipst->ips_ipv4firewall_forwarding, + ill, dst_ill, ipha, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); + + if (mp == NULL) { + nce_refrele(nce); + return; + } + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_input. + */ + + /* Might have changed */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + } + + /* Packet is being forwarded. Turning off hwcksum flag. 
*/ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Martian Address Filtering [RFC 1812, Section 5.3.7] + * The loopback address check for both src and dst has already + * been checked in ip_input + * In the future one can envision adding RPF checks using number 3. + * If we already checked the same source address we can skip this. + */ + if (!(ira->ira_flags & IRAF_VERIFIED_SRC) || + src != ira->ira_verified_src) { + switch (ipst->ips_src_check) { + case 0: + break; + case 2: + if (ip_type_v4(src, ipst) == IRE_BROADCAST) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + /* FALLTHRU */ + + case 1: + if (CLASSD(src)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + break; + } + /* Remember for next packet */ + ira->ira_flags |= IRAF_VERIFIED_SRC; + ira->ira_verified_src = src; + } + + /* + * Check if packet is going out the same link on which it arrived. + * Means we might need to send a redirect. + */ + if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) { + ip_send_potential_redirect_v4(mp, ipha, ire, ira); + } + + added_tx_len = 0; + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + mblk_t *mp1; + uint32_t old_pkt_len = ira->ira_pktlen; + + /* + * Check if it can be forwarded and add/remove + * CIPSO options as needed. + */ + if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("tsol_ip_forward", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + /* + * Size may have changed. Remember amount added in case + * IP needs to send an ICMP too big. 
+ */ + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + if (ira->ira_pktlen > old_pkt_len) + added_tx_len = ira->ira_pktlen - old_pkt_len; + + /* Options can have been added or removed */ + if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH) + ira->ira_flags |= IRAF_IPV4_OPTIONS; + else + ira->ira_flags &= ~IRAF_IPV4_OPTIONS; + } + + mtu = dst_ill->ill_mtu; + if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu) + mtu = iremtu; + ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len); + nce_refrele(nce); +} + +/* + * Used for sending out unicast and multicast packets that are + * forwarded. + */ +void +ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha, + ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len) +{ + ill_t *dst_ill = nce->nce_ill; + uint32_t pkt_len; + uint32_t sum; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + iaflags_t ixaflags; + + if (ipha->ipha_ttl <= 1) { + /* Perhaps the checksum was bad */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill); + icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira); + return; + } + ipha->ipha_ttl--; + /* Adjust the checksum to reflect the ttl decrement. 
*/ + sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; + ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); + + /* Check if there are options to update */ + if (iraflags & IRAF_IPV4_OPTIONS) { + ASSERT(ipha->ipha_version_and_hdr_length != + IP_SIMPLE_HDR_VERSION); + ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM)); + + if (!ip_forward_options(mp, ipha, dst_ill, ira)) { + /* ipIfStatsForwProhibits and ip_drop_input done */ + return; + } + + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + + /* Initiate Write side IPPF processing before any fragmentation */ + if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \ + " during IPPF processing\n")); + return; + } + } + + pkt_len = ira->ira_pktlen; + + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); + + ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL; + + if (pkt_len > mtu) { + /* + * It needs fragging on its way out. If we haven't + * verified the header checksum yet we do it now since + * are going to put a surely good checksum in the + * outgoing header, we have to make sure that it + * was good coming in. + */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) { + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill); + if (iraflags & IRAF_SYSTEM_LABELED) { + /* + * Remove any CIPSO option added by + * tsol_ip_forward, and make sure we report + * a path MTU so that there + * is room to add such a CIPSO option for future + * packets. 
+ */ + mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len, + AF_INET); + } + + icmp_frag_needed(mp, mtu, ira); + return; + } + + (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu, + ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL); + return; + } + + ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); + if (iraflags & IRAF_LOOPBACK_COPY) { + /* + * IXAF_NO_LOOP_ZONEID is not set hence 7th arg + * is don't care + */ + (void) ip_postfrag_loopcheck(mp, nce, + ixaflags | IXAF_LOOPBACK_COPY, + pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL); + } else { + (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint, + GLOBAL_ZONEID, 0, NULL); + } +} + +/* + * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE, + * which is what ire_route_recursive returns when there is no matching ire. + * Send ICMP unreachable unless blackhole. + */ +void +ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + + /* Would we have forwarded this packet if we had a route? */ + if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("l2 multicast not forwarded", mp, ill); + freemsg(mp); + return; + } + + if (!(ill->ill_flags & ILLF_ROUTER)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + return; + } + /* + * If we had a route this could have been forwarded. Count as such. + * + * ipIfStatsHCInForwDatagrams should only be increment if there + * will be an attempt to forward the packet, which is why we + * increment after the above condition has been checked. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); + + ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST, + ipst); + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill); + freemsg(mp); + } else { + ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill); + + if (ip_source_routed(ipha, ipst)) { + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + } else { + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira); + } + } +} + +/* + * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for + * VRRP when in noaccept mode. + * We silently drop the packet. ARP handles packets even if noaccept is set. + */ +/* ARGSUSED */ +void +ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill); + freemsg(mp); +} + +/* + * ire_recvfn for IRE_BROADCAST. + */ +void +ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ill_t *dst_ill = ire->ire_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *alt_ire; + nce_t *nce; + ipaddr_t ipha_dst; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); + + /* Tag for higher-level protocols */ + ira->ira_flags |= IRAF_BROADCAST; + + /* + * Whether local or directed broadcast forwarding: don't allow + * for TCP. + */ + if (ipha->ipha_protocol == IPPROTO_TCP) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + + /* + * So that we don't end up with dups, only one ill an IPMP group is + * nominated to receive broadcast traffic. + * If we have no cast_ill we are liberal and accept everything. 
+ */ + if (IS_UNDER_IPMP(ill)) { + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + /* No MIB since this is normal operation */ + ip_drop_input("not nom_cast", mp, ill); + freemsg(mp); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + + ira->ira_ruifindex = ill_get_upper_ifindex(ill); + } + + /* + * After reassembly and IPsec we will need to duplicate the + * broadcast packet for all matching zones on the ill. + */ + ira->ira_zoneid = ALL_ZONES; + + /* + * Check for directed broadcast i.e. ire->ire_ill is different than + * the incoming ill. + * The same broadcast address can be assigned to multiple interfaces + * so have to check explicitly for that case by looking up the alt_ire + */ + if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) { + /* Reassemble on the ill on which the packet arrived */ + ip_input_local_v4(ire, mp, ipha, ira); + /* Restore */ + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + return; + } + + /* Is there an IRE_BROADCAST on the incoming ill? */ + ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST : + ipha->ipha_dst); + alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill, + ALL_ZONES, ira->ira_tsl, + MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL); + if (alt_ire != NULL) { + /* Not a directed broadcast */ + /* + * In the special case of multirouted broadcast + * packets, we unconditionally need to "gateway" + * them to the appropriate interface here so that reassembly + * works. We know that the IRE_BROADCAST on cgtp0 doesn't + * have RTF_MULTIRT set so we look for such an IRE in the + * bucket. 
+ */ + if (alt_ire->ire_flags & RTF_MULTIRT) { + irb_t *irb; + ire_t *ire1; + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; + ire1 = ire1->ire_next) { + if (IRE_IS_CONDEMNED(ire1)) + continue; + if (!(ire1->ire_type & IRE_BROADCAST) || + (ire1->ire_flags & RTF_MULTIRT)) + continue; + ill = ire1->ire_ill; + ill_refhold(ill); + break; + } + irb_refrele(irb); + if (ire1 != NULL) { + ill_t *orig_ill = ira->ira_ill; + + ire_refrele(alt_ire); + /* Reassemble on the new ill */ + ira->ira_ill = ill; + ip_input_local_v4(ire, mp, ipha, ira); + ill_refrele(ill); + /* Restore */ + ira->ira_ill = orig_ill; + ira->ira_ruifindex = + orig_ill->ill_phyint->phyint_ifindex; + return; + } + } + ire_refrele(alt_ire); + /* Reassemble on the ill on which the packet arrived */ + ip_input_local_v4(ire, mp, ipha, ira); + goto done; + } + + /* + * This is a directed broadcast + * + * If directed broadcast is allowed, then forward the packet out + * the destination interface with IXAF_LOOPBACK_COPY set. That will + * result in ip_input() receiving a copy of the packet on the + * appropriate ill. (We could optimize this to avoid the extra trip + * via ip_input(), but since directed broadcasts are normally disabled + * it doesn't make sense to optimize it.) + */ + if (!ipst->ips_ip_g_forward_directed_bcast || + (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) { + ip_drop_input("directed broadcast not allowed", mp, ill); + freemsg(mp); + goto done; + } + if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + goto done; + } + + /* + * Clear the indication that this may have hardware + * checksum as we are not using it for forwarding. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one. 
+ */ + ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + + /* + * We use ip_forward_xmit to do any fragmentation. + * and loopback copy on the outbound interface. + * + * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side. + */ + ira->ira_flags |= IRAF_LOOPBACK_COPY; + + nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST); + if (nce == NULL) { + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, dst_ill); + freemsg(mp); + goto done; + } + + ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0); + nce_refrele(nce); +done: + /* Restore */ + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; +} + +/* + * ire_recvfn for IRE_MULTICAST. + */ +void +ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(ire->ire_ill == ira->ira_ill); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen); + + /* RSVP hook */ + if (ira->ira_flags & IRAF_RSVP) + goto forus; + + /* Tag for higher-level protocols */ + ira->ira_flags |= IRAF_MULTICAST; + + /* + * So that we don't end up with dups, only one ill an IPMP group is + * nominated to receive multicast traffic. + * If we have no cast_ill we are liberal and accept everything. 
+ */ + if (IS_UNDER_IPMP(ill)) { + ip_stack_t *ipst = ill->ill_ipst; + + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + ip_drop_input("not on cast ill", mp, ill); + freemsg(mp); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + /* + * We switch to the upper ill so that mrouter and hasmembers + * can operate on upper here and in ip_input_multicast. + */ + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill != NULL) { + ASSERT(ill != ira->ira_ill); + ASSERT(ire->ire_ill == ira->ira_ill); + ira->ira_ill = ill; + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + } else { + ill = ira->ira_ill; + } + } + + /* + * Check if we are a multicast router - send ip_mforward a copy of + * the packet. + * Due to mroute_decap tunnels we consider forwarding packets even if + * mrouted has not joined the allmulti group on this interface. + */ + if (ipst->ips_ip_g_mrouter) { + int retval; + + /* + * Clear the indication that this may have hardware + * checksum as we are not using it for forwarding. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * ip_mforward helps us make these distinctions: If received + * on tunnel and not IGMP, then drop. + * If IGMP packet, then don't check membership + * If received on a phyint and IGMP or PIM, then + * don't check membership + */ + retval = ip_mforward(mp, ira); + /* ip_mforward updates mib variables if needed */ + + switch (retval) { + case 0: + /* + * pkt is okay and arrived on phyint. + * + * If we are running as a multicast router + * we need to see all IGMP and/or PIM packets. 
 */
			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
			    (ipha->ipha_protocol == IPPROTO_PIM)) {
				goto forus;
			}
			break;
		case -1:
			/* pkt is mal-formed, toss it */
			freemsg(mp);
			goto done;
		case 1:
			/*
			 * pkt is okay and arrived on a tunnel
			 *
			 * If we are running a multicast router
			 * we need to see all igmp packets.
			 */
			if (ipha->ipha_protocol == IPPROTO_IGMP) {
				goto forus;
			}
			ip_drop_input("Multicast on tunnel ignored", mp, ill);
			freemsg(mp);
			goto done;
		}
	}

	/*
	 * Check if we have members on this ill. This is not necessary for
	 * correctness because even if the NIC/GLD had a leaky filter, we
	 * filter before passing to each conn_t.
	 */
	if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
		/*
		 * Nobody interested
		 *
		 * This might just be caused by the fact that
		 * multiple IP Multicast addresses map to the same
		 * link layer multicast - no need to increment counter!
		 */
		ip_drop_input("Multicast with no members", mp, ill);
		freemsg(mp);
		goto done;
	}
forus:
	ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
	    ntohl(ipha->ipha_dst)));

	/*
	 * After reassembly and IPsec we will need to duplicate the
	 * multicast packet for all matching zones on the ill.
	 */
	ira->ira_zoneid = ALL_ZONES;

	/* Reassemble on the ill on which the packet arrived */
	ip_input_local_v4(ire, mp, ipha, ira);
done:
	/*
	 * If we switched to the IPMP upper ill above, drop that hold and
	 * restore ira_ill/ira_ruifindex to the ill the packet arrived on.
	 */
	if (ill != ire->ire_ill) {
		ill_refrele(ill);
		ira->ira_ill = ire->ire_ill;
		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
	}
}

/*
 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
 * Drop packets since we don't forward out multirt routes.
 * Counted against ipIfStatsInNoRoutes since the route cannot be used.
 */
/* ARGSUSED */
void
ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ill_t	*ill = ira->ira_ill;

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
	ip_drop_input("Not forwarding out MULTIRT", mp, ill);
	freemsg(mp);
}

/*
 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
 * has rewritten the packet to have a loopback destination address (We
 * filter out packet with a loopback destination from arriving over the wire).
 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
 */
void
ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ipha_t	*ipha = (ipha_t *)iph_arg;
	ill_t	*ill = ira->ira_ill;
	ill_t	*ire_ill = ire->ire_ill;

	ira->ira_zoneid = GLOBAL_ZONEID;

	/* Switch to the lo0 ill for further processing */
	if (ire_ill != ill) {
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted.
		 * No need to hold the ill since we have a hold on the ire
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = ire_ill;

		ip_input_local_v4(ire, mp, ipha, ira);

		/* Restore */
		ASSERT(ira->ira_ill == ire_ill);
		ira->ira_ill = ill;
		return;

	}
	ip_input_local_v4(ire, mp, ipha, ira);
}

/*
 * ire_recvfn for IRE_LOCAL.
 */
void
ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ipha_t	*ipha = (ipha_t *)iph_arg;
	ill_t	*ill = ira->ira_ill;
	ill_t	*ire_ill = ire->ire_ill;

	/* Make a note for DAD that this address is in use */
	ire->ire_last_used_time = lbolt;

	/* Only target the IRE_LOCAL with the right zoneid. */
	ira->ira_zoneid = ire->ire_zoneid;

	/*
	 * If the packet arrived on the wrong ill, we check that
	 * this is ok.
	 * If it is, then we ensure that we do the reassembly on
	 * the ill on which the address is hosted. We keep ira_rill as
	 * the one on which the packet arrived, so that IP_PKTINFO and
	 * friends can report this.
+ */ + if (ire_ill != ill) { + ire_t *new_ire; + + new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); + if (new_ire == NULL) { + /* Drop packet */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsInForwProhibits", mp, ill); + freemsg(mp); + return; + } + /* + * Update ira_ill to be the ILL on which the IP address + * is hosted. No need to hold the ill since we have a + * hold on the ire. Note that we do the switch even if + * new_ire == ire (for IPMP, ire would be the one corresponding + * to the IPMP ill). + */ + ASSERT(ira->ira_ill == ira->ira_rill); + ira->ira_ill = new_ire->ire_ill; + + /* ira_ruifindex tracks the upper for ira_rill */ + if (IS_UNDER_IPMP(ill)) + ira->ira_ruifindex = ill_get_upper_ifindex(ill); + + ip_input_local_v4(new_ire, mp, ipha, ira); + + /* Restore */ + ASSERT(ira->ira_ill == new_ire->ire_ill); + ira->ira_ill = ill; + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + + if (new_ire != ire) + ire_refrele(new_ire); + return; + } + + ip_input_local_v4(ire, mp, ipha, ira); +} + +/* + * Common function for packets arriving for the host. Handles + * checksum verification, reassembly checks, etc. + */ +static void +ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + + /* + * Verify IP header checksum. If the packet was AH or ESP then + * this flag has already been cleared. Likewise if the packet + * had a hardware checksum. + */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + + if (iraflags & IRAF_IPV4_OPTIONS) { + if (!ip_input_local_options(mp, ipha, ira)) { + /* Error has been sent and mp consumed */ + return; + } + } + + /* + * Is packet part of fragmented IP packet? 
+ * We compare against defined values in network byte order + */ + if (ipha->ipha_fragment_offset_and_flags & + (IPH_MF_HTONS | IPH_OFFSET_HTONS)) { + /* + * Make sure we have ira_l2src before we loose the original + * mblk + */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp = ip_input_fragment(mp, ipha, ira); + if (mp == NULL) + return; + /* Completed reassembly */ + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * For broadcast and multicast we need some extra work before + * we call ip_fanout_v4(), since in the case of shared-IP zones + * we need to pretend that a packet arrived for each zoneid. + */ + if (iraflags & IRAF_MULTIBROADCAST) { + if (iraflags & IRAF_BROADCAST) + ip_input_broadcast_v4(ire, mp, ipha, ira); + else + ip_input_multicast_v4(ire, mp, ipha, ira); + return; + } + ip_fanout_v4(mp, ipha, ira); +} + + +/* + * Handle multiple zones which match the same broadcast address + * and ill by delivering a packet to each of them. + * Walk the bucket and look for different ire_zoneid but otherwise + * the same IRE (same ill/addr/mask/type). + * Note that ire_add() tracks IREs that are identical in all + * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by + * increasing ire_identical_cnt. Thus we don't need to be concerned + * about those. + */ +static void +ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + irb_t *irb; + ire_t *ire1; + mblk_t *mp1; + ipha_t *ipha1; + + irb = ire->ire_bucket; + + /* + * If we don't have more than one shared-IP zone, or if + * there can't be more than one IRE_BROADCAST for this + * IP address, then just set the zoneid and proceed. 
+ */ + if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) { + ira->ira_zoneid = ire->ire_zoneid; + + ip_fanout_v4(mp, ipha, ira); + return; + } + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* We do the main IRE after the end of the loop */ + if (ire1 == ire) + continue; + + /* + * Only IREs for the same IP address should be in the same + * bucket. + * But could have IRE_HOSTs in the case of CGTP. + */ + ASSERT(ire1->ire_addr == ire->ire_addr); + if (!(ire1->ire_type & IRE_BROADCAST)) + continue; + + if (IRE_IS_CONDEMNED(ire1)) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver to one zone */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + continue; + } + ira->ira_zoneid = ire1->ire_zoneid; + ipha1 = (ipha_t *)mp1->b_rptr; + ip_fanout_v4(mp1, ipha1, ira); + } + irb_refrele(irb); + /* Do the main ire */ + ira->ira_zoneid = ire->ire_zoneid; + ip_fanout_v4(mp, ipha, ira); +} + +/* + * Handle multiple zones which want to receive the same multicast packets + * on this ill by delivering a packet to each of them. + * + * Note that for packets delivered to transports we could instead do this + * as part of the fanout code, but since we need to handle icmp_inbound + * it is simpler to have multicast work the same as broadcast. + * + * The ip_fanout matching for multicast matches based on ilm independent of + * zoneid since the zoneid restriction is applied when joining a multicast + * group. 
+ */ +/* ARGSUSED */ +static void +ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + zoneid_t zoneid; + mblk_t *mp1; + ipha_t *ipha1; + + /* ire_recv_multicast has switched to the upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + + /* + * If we don't have more than one shared-IP zone, or if + * there are no members in anything but the global zone, + * then just set the zoneid and proceed. + */ + if (ns->netstack_numzones == 1 || + !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, + GLOBAL_ZONEID)) { + ira->ira_zoneid = GLOBAL_ZONEID; + + /* If sender didn't want this zone to receive it, drop */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) { + ip_drop_input("Multicast but wrong zoneid", mp, ill); + freemsg(mp); + return; + } + ip_fanout_v4(mp, ipha, ira); + return; + } + + /* + * Here we loop over all zoneids that have members in the group + * and deliver a packet to ip_fanout for each zoneid. + * + * First find any members in the lowest numeric zoneid by looking for + * first zoneid larger than -1 (ALL_ZONES). + * We terminate the loop when we receive -1 (ALL_ZONES). + */ + zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES); + for (; zoneid != ALL_ZONES; + zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) { + /* + * Avoid an extra copymsg/freemsg by skipping global zone here + * and doing that at the end. 
+ */ + if (zoneid == GLOBAL_ZONEID) + continue; + + ira->ira_zoneid = zoneid; + + /* If sender didn't want this zone to receive it, skip */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver to one zone */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + continue; + } + ipha1 = (ipha_t *)mp1->b_rptr; + ip_fanout_v4(mp1, ipha1, ira); + } + + /* Do the main ire */ + ira->ira_zoneid = GLOBAL_ZONEID; + /* If sender didn't want this zone to receive it, drop */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) { + ip_drop_input("Multicast but wrong zoneid", mp, ill); + freemsg(mp); + } else { + ip_fanout_v4(mp, ipha, ira); + } +} + + +/* + * Determine the zoneid and IRAF_TX_* flags if trusted extensions + * is in use. Updates ira_zoneid and ira_flags as a result. + */ +static void +ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol, + uint_t ip_hdr_length, ip_recv_attr_t *ira) +{ + uint16_t *up; + uint16_t lport; + zoneid_t zoneid; + + ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED); + + /* + * If the packet is unlabeled we might allow read-down + * for MAC_EXEMPT. Below we clear this if it is a multi-level + * port (MLP). + * Note that ira_tsl can be NULL here. + */ + if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED) + ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE; + + if (ira->ira_zoneid != ALL_ZONES) + return; + + ira->ira_flags |= IRAF_TX_SHARED_ADDR; + + up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_UDP: + /* Caller ensures this */ + ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr); + + /* + * Only these transports support MLP. + * We know their destination port numbers is in + * the same place in the header. 
 */
		lport = up[1];

		/*
		 * No need to handle exclusive-stack zones
		 * since ALL_ZONES only applies to the shared IP instance.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES. In that case, we assume it's SLP, and
		 * search for the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a
		 * connection in it. Otherwise, we look for a
		 * MAC-exempt connection in any zone whose label
		 * dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_attr_to_zoneid(ira);
		else
			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
		break;
	default:
		/* Handle shared address for other protocols */
		zoneid = tsol_attr_to_zoneid(ira);
		break;
	}
	ira->ira_zoneid = zoneid;
}

/*
 * Increment checksum failure statistics.
 *
 * protocol	- ULP whose checksum failed (TCP, UDP, or ICMP)
 * hck_flags	- hardware-checksum flags from the mblk; selects which of the
 *		  full-hw / partial-hw / software failure counters to bump
 * ill		- interface against which the MIB counters are charged
 */
static void
ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	switch (protocol) {
	case IPPROTO_TCP:
		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);

		if (hck_flags & HCK_FULLCKSUM)
			IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
		else
			IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
		break;
	case IPPROTO_UDP:
		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
		if (hck_flags & HCK_FULLCKSUM)
			IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
		else
			IP_STAT(ipst, ip_udp_in_sw_cksum_err);
		break;
	case IPPROTO_ICMP:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
		break;
	default:
		/* Callers only pass TCP/UDP/ICMP; anything else is a bug. */
		ASSERT(0);
		break;
	}
}

/* Calculate the IPv4 pseudo-header checksum */
uint32_t
ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
{
	uint_t		ulp_len;
	uint32_t	cksum;
	uint8_t		protocol = ira->ira_protocol;
	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;

#define	iphs
((uint16_t *)ipha) + + switch (protocol) { + case IPPROTO_TCP: + ulp_len = ira->ira_pktlen - ip_hdr_length; + + /* Protocol and length */ + cksum = htons(ulp_len) + IP_TCP_CSUM_COMP; + /* IP addresses */ + cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); + + /* Protocol and length */ + cksum = udpha->uha_length + IP_UDP_CSUM_COMP; + /* IP addresses */ + cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; + break; + } + + default: + cksum = 0; + break; + } +#undef iphs + return (cksum); +} + + +/* + * Software verification of the ULP checksums. + * Returns B_TRUE if ok. + * Increments statistics of failed. + */ +static boolean_t +ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + uint32_t cksum; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + IP_STAT(ipst, ip_in_sw_cksum); + + ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP); + + cksum = ip_input_cksum_pseudo_v4(ipha, ira); + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + if (cksum == 0) + return (B_TRUE); + + ip_input_cksum_err_v4(protocol, 0, ira->ira_ill); + return (B_FALSE); +} + +/* There are drivers that can't do partial checksum with IP options */ +int eri_cksum_workaround = 1; + +/* + * Verify the ULP checksums. + * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum + * algorithm. + * Increments statistics if failed. 
+ */ +static boolean_t +ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_rill; + uint16_t hck_flags; + uint32_t cksum; + mblk_t *mp1; + int32_t len; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + + switch (protocol) { + case IPPROTO_TCP: + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); + if (udpha->uha_checksum == 0) { + /* Packet doesn't have a UDP checksum */ + return (B_TRUE); + } + break; + } + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t pktsum; + + sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length); +#ifdef DEBUG + if (skip_sctp_cksum) + return (B_TRUE); +#endif + pktsum = sctph->sh_chksum; + sctph->sh_chksum = 0; + cksum = sctp_cksum(mp, ip_hdr_length); + sctph->sh_chksum = pktsum; + if (cksum == pktsum) + return (B_TRUE); + + /* + * Defer until later whether a bad checksum is ok + * in order to allow RAW sockets to use Adler checksum + * with SCTP. + */ + ira->ira_flags |= IRAF_SCTP_CSUM_ERR; + return (B_TRUE); + } + + default: + /* No ULP checksum to verify. */ + return (B_TRUE); + } + /* + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload. + * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. + * Note: IRAF_NO_HW_CKSUM is not currently used. + */ + ASSERT(!IS_IPMP(ill)); + if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_input_sw_cksum_v4(mp, ipha, ira)); + } + + /* + * We apply this for all ULP protocols. Does the HW know to + * not set the flags for SCTP and other protocols. + */ + + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HCK_FULLCKSUM) { + /* + * Full checksum has been computed by the hardware + * and has been attached. 
If the driver wants us to + * verify the correctness of the attached value, in + * order to protect against faulty hardware, compare + * it against -0 (0xFFFF) to see if it's valid. + */ + if (hck_flags & HCK_FULLCKSUM_OK) + return (B_TRUE); + + cksum = DB_CKSUM16(mp); + if (cksum == 0xFFFF) + return (B_TRUE); + ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + + mp1 = mp->b_cont; + if ((hck_flags & HCK_PARTIALCKSUM) && + (mp1 == NULL || mp1->b_cont == NULL) && + ip_hdr_length >= DB_CKSUMSTART(mp) && + (!eri_cksum_workaround || ip_hdr_length == IP_SIMPLE_HDR_LENGTH) && + ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { + uint32_t adj; + uchar_t *cksum_start; + + cksum = ip_input_cksum_pseudo_v4(ipha, ira); + + cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp)); + + /* + * Partial checksum has been calculated by hardware + * and attached to the packet; in addition, any + * prepended extraneous data is even byte aligned, + * and there are at most two mblks associated with + * the packet. If any such data exists, we adjust + * the checksum; also take care any postpended data. + */ + IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); + /* + * One's complement subtract extraneous checksum + */ + cksum += DB_CKSUM16(mp); + if (adj >= cksum) + cksum = ~(adj - cksum) & 0xFFFF; + else + cksum -= adj; + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + if (!(~cksum & 0xFFFF)) + return (B_TRUE); + + ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + return (ip_input_sw_cksum_v4(mp, ipha, ira)); +} + + +/* + * Handle fanout of received packets. + * Unicast packets that are looped back (from ire_send_local_v4) and packets + * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. + * + * IPQoS Notes + * Before sending it to the client, invoke IPPF processing. 
Policy processing + * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. + */ +void +ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + uint8_t protocol = ipha->ipha_protocol; + conn_t *connp; +#define rptr ((uchar_t *)ipha) + uint_t ip_hdr_length; + uint_t min_ulp_header_length; + int offset; + ssize_t len; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length)); + + ip_hdr_length = ira->ira_ip_hdr_length; + ira->ira_protocol = protocol; + + /* + * Time for IPP once we've done reassembly and IPsec. + * We skip this for loopback packets since we don't do IPQoS + * on loopback. + */ + if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && + !(iraflags & IRAF_LOOPBACK) && + (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) { + /* + * Use the interface on which the packet arrived - not where + * the IP address is hosted. + */ + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return; + } + } + + /* Determine the minimum required size of the upper-layer header */ + /* Need to do this for at least the set of ULPs that TX handles. 
*/ + switch (protocol) { + case IPPROTO_TCP: + min_ulp_header_length = TCP_MIN_HEADER_LENGTH; + break; + case IPPROTO_SCTP: + min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; + break; + case IPPROTO_UDP: + min_ulp_header_length = UDPH_SIZE; + break; + case IPPROTO_ICMP: + min_ulp_header_length = ICMPH_SIZE; + break; + default: + min_ulp_header_length = 0; + break; + } + /* Make sure we have the min ULP header length */ + len = mp->b_wptr - rptr; + if (len < ip_hdr_length + min_ulp_header_length) { + if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + IP_STAT(ipst, ip_recv_pullup); + ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, + ira); + if (ipha == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + + /* + * If trusted extensions then determine the zoneid and TX specific + * ira_flags. + */ + if (iraflags & IRAF_SYSTEM_LABELED) { + /* This can update ira->ira_flags and ira->ira_zoneid */ + ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira); + iraflags = ira->ira_flags; + } + + + /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ + if (iraflags & IRAF_VERIFY_ULP_CKSUM) { + if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) { + /* Bad checksum. Stats are already incremented */ + ip_drop_input("Bad ULP checksum", mp, ill); + freemsg(mp); + return; + } + /* IRAF_SCTP_CSUM_ERR could have been set */ + iraflags = ira->ira_flags; + } + switch (protocol) { + case IPPROTO_TCP: + /* For TCP, discard broadcast and multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* First mblk contains IP+TCP headers per above check */ + ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); + + /* TCP options present? */ + offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4; + if (offset != 5) { + if (offset < 5) + goto discard; + + /* + * There must be TCP options. 
+ * Make sure we can grab them. + */ + offset <<= 2; + offset += ip_hdr_length; + if (len < offset) { + if (ira->ira_pktlen < offset) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input( + "ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + IP_STAT(ipst, ip_recv_pullup); + ipha = ip_pullup(mp, offset, ira); + if (ipha == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + } + + /* + * Pass up a squeue hint to tcp. + * If ira_sqp is already set (this is loopback) we leave it + * alone. + */ + if (ira->ira_sqp == NULL) { + ira->ira_sqp = ip_squeue_get(ira->ira_ring); + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + if (!IPCL_IS_TCP(connp)) { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + + /* + * We do different processing whether called from + * ip_accept_tcp and we match the target, don't match + * the target, and when we are called by 
ip_input. + */ + if (iraflags & IRAF_TARGET_SQP) { + if (ira->ira_target_sqp == connp->conn_sqp) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", + mp, ill); + freemsg(mp); + CONN_DEC_REF(connp); + } else { + SET_SQUEUE(attrmp, connp->conn_recv, + connp); + attrmp->b_cont = mp; + ASSERT(ira->ira_target_sqp_mp == NULL); + ira->ira_target_sqp_mp = attrmp; + /* + * Conn ref release when drained from + * the squeue. + */ + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, SQ_FILL, + SQTAG_IP_TCP_INPUT); + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, + connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + in6_addr_t map_src, map_dst; + uint32_t ports; /* Source and destination ports */ + sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; + + /* For SCTP, discard broadcast and multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* + * Since there is no SCTP h/w cksum support yet, just + * clear the flag. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* Length ensured above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); + sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); + + /* get the ports */ + ports = *(uint32_t *)&sctph->sh_sport; + + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); + if (iraflags & IRAF_SCTP_CSUM_ERR) { + /* + * No potential sctp checksum errors go to the Sun + * sctp stack however they might be Adler-32 summed + * packets a userland stack bound to a raw IP socket + * could reasonably use. Note though that Adler-32 is + * a long deprecated algorithm and customer sctp + * networks should eventually migrate to CRC-32 at + * which time this facility should be removed. 
+ */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, sctps); + if (connp == NULL) { + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + sctp_input(connp, ipha, NULL, mp, ira); + /* sctp_input does a rele of the sctp_t */ + return; + } + + case IPPROTO_UDP: + /* First mblk contains IP+UDP headers as checked above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); + + if (iraflags & IRAF_MULTIBROADCAST) { + uint16_t *up; /* Pointer to ports in ULP header */ + + up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); + ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira); + return; + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + no_udp_match: + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP]. + connf_head != NULL) { + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v4(mp, ipha, ira); + } else { + ip_fanout_send_icmp_v4(mp, + ICMP_DEST_UNREACHABLE, + ICMP_PORT_UNREACHABLE, ira); + } + return; + + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + goto no_udp_match; + } + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { + CONN_DEC_REF(connp); + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + ip_drop_input("udpIfStatsInOverflows", mp, ill); + freemsg(mp); + return; + } + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* + * Remove 0-spi if it's 0, or move everything behind + * the UDP header over it and forward to ESP via + * ip_fanout_v4(). + */ + if (connp->conn_udp->udp_nat_t_endpoint) { + if (iraflags & IRAF_IPSEC_SECURE) { + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_esp_nat_t_ipsec), + &ipss->ipsec_dropper); + CONN_DEC_REF(connp); + return; + } + + mp = zero_spi_check(mp, ira); + if (mp == NULL) { + /* + * Packet was consumed - probably sent to + * ip_fanout_v4. + */ + CONN_DEC_REF(connp); + return; + } + /* Else continue like a normal UDP packet. */ + ipha = (ipha_t *)mp->b_rptr; + protocol = ipha->ipha_protocol; + ira->ira_protocol = protocol; + } + /* Found a client; up it goes */ + IP_STAT(ipst, ip_udp_fannorm); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + default: + break; + } + + /* + * Clear hardware checksumming flag as it is currently only + * used by TCP and UDP. + */ + DB_CKSUMFLAGS(mp) = 0; + + switch (protocol) { + case IPPROTO_ICMP: + /* + * We need to accomodate icmp messages coming in clear + * until we get everything secure from the wire. If + * icmp_accept_clear_messages is zero we check with + * the global policy and act accordingly. If it is + * non-zero, we accept the message without any checks. 
+ * But *this does not mean* that this will be delivered + * to RAW socket clients. By accepting we might send + * replies back, change our MTU value etc., + * but delivery to the ULP/clients depends on their + * policy dispositions. + */ + if (ipst->ips_icmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); + ip_drop_input("tsol_can_accept_raw", mp, ill); + freemsg(mp); + return; + } + } + + /* + * ICMP header checksum, including checksum field, + * should be zero. + */ + if (IP_CSUM(mp, ip_hdr_length, 0)) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); + ip_drop_input("icmpInCksumErrs", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = icmp_inbound_v4(mp, ira); + if (mp == NULL) { + /* No need to pass to RAW sockets */ + return; + } + break; + + case IPPROTO_IGMP: + /* + * If we are not willing to accept IGMP packets in clear, + * then check with global policy. 
+ */ + if (ipst->ips_igmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_can_accept_raw(mp, ira, B_TRUE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* + * Validate checksum + */ + if (IP_CSUM(mp, ip_hdr_length, 0)) { + ++ipst->ips_igmpstat.igps_rcv_badsum; + ip_drop_input("igps_rcv_badsum", mp, ill); + freemsg(mp); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = igmp_input(mp, ira); + if (mp == NULL) { + /* Bad packet - discarded by igmp_input */ + return; + } + break; + case IPPROTO_PIM: + /* + * If we are not willing to accept PIM packets in clear, + * then check with global policy. + */ + if (ipst->ips_pim_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_can_accept_raw(mp, ira, B_TRUE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + + /* Checksum is verified in pim_input */ + mp = pim_input(mp, ira); + if (mp == NULL) { + /* Bad packet - discarded by pim_input */ + return; + } + break; + case IPPROTO_AH: + case IPPROTO_ESP: { + /* + * Fast path for AH/ESP. 
+ */ + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + IP_STAT(ipst, ipsec_proto_ahesp); + + if (!ipsec_loaded(ipss)) { + ip_proto_not_sup(mp, ira); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* select inbound SA and have IPsec process the pkt */ + if (protocol == IPPROTO_ESP) { + esph_t *esph; + boolean_t esp_in_udp_sa; + boolean_t esp_in_udp_packet; + + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); + + esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags & + IPSA_F_NATT) != 0); + esp_in_udp_packet = + (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0; + + /* + * The following is a fancy, but quick, way of saying: + * ESP-in-UDP SA and Raw ESP packet --> drop + * OR + * ESP SA and ESP-in-UDP packet --> drop + */ + if (esp_in_udp_sa != esp_in_udp_packet) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_esp_no_sa), + &ipss->ipsec_dropper); + return; + } + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, + ira); + } else { + ah_t *ah; + + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, + ira); + } + + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; + } + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return; + } + case IPPROTO_ENCAP: { + ipha_t *inner_ipha; + + /* + * Handle self-encapsulated packets (IP-in-IP where + * the inner addresses == the outer addresses). 
+ */ + if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) > + mp->b_wptr) { + if (ira->ira_pktlen < + ip_hdr_length + sizeof (ipha_t)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length + + sizeof (ipha_t) - mp->b_rptr, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length); + /* + * Check the sanity of the inner IP header. + */ + if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + if (inner_ipha->ipha_src != ipha->ipha_src || + inner_ipha->ipha_dst != ipha->ipha_dst) { + /* We fallthru to iptun fanout below */ + goto iptun; + } + + /* + * Self-encapsulated tunnel packet. Remove + * the outer IP header and fanout again. + * We also need to make sure that the inner + * header is pulled up until options. 
+ */ + mp->b_rptr = (uchar_t *)inner_ipha; + ipha = inner_ipha; + ip_hdr_length = IPH_HDR_LENGTH(ipha); + if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) { + if (ira->ira_pktlen < + (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + ipha = ip_pullup(mp, + (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + if (ip_hdr_length > sizeof (ipha_t)) { + /* We got options on the inner packet. */ + ipaddr_t dst = ipha->ipha_dst; + int error = 0; + + dst = ip_input_options(ipha, dst, mp, ira, &error); + if (error != 0) { + /* + * An ICMP error has been sent and the packet + * has been dropped. + */ + return; + } + if (dst != ipha->ipha_dst) { + /* + * Someone put a source-route in + * the inside header of a self- + * encapsulated packet. Drop it + * with extreme prejudice and let + * the sender know. + */ + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", + mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, + ira); + return; + } + } + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + /* + * This means that somebody is sending + * Self-encapsualted packets without AH/ESP. + * + * Send this packet to find a tunnel endpoint. + * if I can't find one, an ICMP + * PROTOCOL_UNREACHABLE will get sent. + */ + protocol = ipha->ipha_protocol; + ira->ira_protocol = protocol; + goto iptun; + } + + /* Update based on removed IP header */ + ira->ira_ip_hdr_length = ip_hdr_length; + ira->ira_pktlen = ntohs(ipha->ipha_length); + + if (ira->ira_flags & IRAF_IPSEC_DECAPS) { + /* + * This packet is self-encapsulated multiple + * times. We don't want to recurse infinitely. + * To keep it simple, drop the packet. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ira->ira_flags |= IRAF_IPSEC_DECAPS; + + ip_input_post_ipsec(mp, ira); + return; + } + + iptun: /* IPPROTO_ENCAPS that is not self-encapsulated */ + case IPPROTO_IPV6: + /* iptun will verify trusted label */ + connp = ipcl_classify_v4(mp, protocol, ip_hdr_length, + ira, ipst); + if (connp != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + /* FALLTHRU */ + default: + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + break; + } + + /* + * The above input functions may have returned the pulled up message. + * So ipha need to be reinitialized. + */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_protocol = protocol = ipha->ipha_protocol; + if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) { + /* + * No user-level listener for these packets packets. + * Check for IPPROTO_ENCAP... + */ + if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { + /* + * Check policy here, + * THEN ship off to ip_mroute_decap(). + * + * BTW, If I match a configured IP-in-IP + * tunnel above, this path will not be reached, and + * ip_mroute_decap will never be called. + */ + mp = ipsec_check_global_policy(mp, connp, + ipha, NULL, ira, ns); + if (mp != NULL) { + ip_mroute_decap(mp, ira); + } /* Else we already freed everything! */ + } else { + ip_proto_not_sup(mp, ira); + } + return; + } + + /* + * Handle fanout to raw sockets. 
There + * can be more than one stream bound to a particular + * protocol. When this is the case, each one gets a copy + * of any incoming packets. + */ + ASSERT(ira->ira_protocol == ipha->ipha_protocol); + ip_fanout_proto_v4(mp, ipha, ira); + return; + +discard: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); +#undef rptr +} diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 63a6863844..be0017cb62 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -60,9 +60,6 @@ #include <inet/ip_rts.h> #include <inet/nd.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -73,6 +70,11 @@ struct kmem_cache *rt_entry_cache; +typedef struct nce_clookup_s { + ipaddr_t ncecl_addr; + boolean_t ncecl_found; +} nce_clookup_t; + /* * Synchronization notes: * @@ -80,17 +82,17 @@ struct kmem_cache *rt_entry_cache; * * ire_next/ire_ptpn * - * - bucket lock of the respective tables (cache or forwarding tables). + * - bucket lock of the forwarding table in which is ire stored. * - * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask, - * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif, - * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr + * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask, + * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, + * ire_bucket * * - Set in ire_create_v4/v6 and never changes after that. Thus, * we don't need a lock whenever these fields are accessed. * * - ire_bucket and ire_masklen (also set in ire_create) is set in - * ire_add_v4/ire_add_v6 before inserting in the bucket and never + * ire_add before inserting in the bucket and never * changes after that. 
Thus we don't need a lock whenever these * fields are accessed. * @@ -102,7 +104,7 @@ struct kmem_cache *rt_entry_cache; * does not use any locks. ire_gateway_addr_v6 updates are not atomic * and hence any access to it uses ire_lock to get/set the right value. * - * ire_ident, ire_refcnt + * ire_refcnt, ire_identical_ref * * - Updated atomically using atomic_add_32 * @@ -111,40 +113,33 @@ struct kmem_cache *rt_entry_cache; * - Assumes that 32 bit writes are atomic. No locks. ire_lock is * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. * - * ire_max_frag, ire_frag_flag - * - * - ire_lock is used to set/read both of them together. - * - * ire_tire_mark + * ire_generation + * - Under ire_lock * - * - Set in ire_create and updated in ire_expire, which is called - * by only one function namely ip_trash_timer_expire. Thus only - * one function updates and examines the value. + * ire_nce_cache + * - Under ire_lock * - * ire_marks - * - bucket lock protects this. + * ire_dep_parent (To next IRE in recursive lookup chain) + * - Under ips_ire_dep_lock. Write held when modifying. Read held when + * walking. We also hold ire_lock when modifying to allow the data path + * to only acquire ire_lock. * - * ire_ll_hdr_length + * ire_dep_parent_generation (Generation number from ire_dep_parent) + * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock + * and ire_lock held when modifying) * - * - Place holder for returning the information to the upper layers - * when IRE_DB_REQ comes down. - * - * - * ipv6_ire_default_count is protected by the bucket lock of - * ip_forwarding_table_v6[0][0]. - * - * ipv6_ire_default_index is not protected as it is just a hint - * at which default gateway to use. There is nothing - * wrong in using the same gateway for two different connections. + * ire_dep_children (From parent to first child) + * ire_dep_sib_next (linked list of siblings) + * ire_dep_sib_ptpn (linked list of siblings) + * - Under ips_ire_dep_lock. 
Write held when modifying. Read held when + * walking. * * As we always hold the bucket locks in all the places while accessing * the above values, it is natural to use them for protecting them. * - * We have a separate cache table and forwarding table for IPv4 and IPv6. - * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an - * array of irb_t structures. The IPv6 forwarding table + * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t - * structure. ip_forwarding_table_v6 is allocated dynamically in + * structures. ip_forwarding_table_v6 is allocated dynamically in * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads * initializing the same bucket. Once a bucket is initialized, it is never * de-alloacted. This assumption enables us to access @@ -158,39 +153,37 @@ struct kmem_cache *rt_entry_cache; * a bucket and the ires residing in the bucket have a back pointer to * the bucket structure. It also has a reference count for the number * of threads walking the bucket - irb_refcnt which is bumped up - * using the macro IRB_REFHOLD macro. The flags irb_flags can be - * set to IRE_MARK_CONDEMNED indicating that there are some ires - * in this bucket that are marked with IRE_MARK_CONDEMNED and the + * using the irb_refhold function. The flags irb_marks can be + * set to IRB_MARK_CONDEMNED indicating that there are some ires + * in this bucket that are IRE_IS_CONDEMNED and the * last thread to leave the bucket should delete the ires. Usually - * this is done by the IRB_REFRELE macro which is used to decrement + * this is done by the irb_refrele function which is used to decrement * the reference count on a bucket. See comments above irb_t structure * definition in ip.h for further details. 
* - * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/ + * The ire_refhold/ire_refrele functions operate on the ire which increments/ * decrements the reference count, ire_refcnt, atomically on the ire. - * ire_refcnt is modified only using this macro. Operations on the IRE + * ire_refcnt is modified only using those functions. Operations on the IRE * could be described as follows : * * CREATE an ire with reference count initialized to 1. * * ADDITION of an ire holds the bucket lock, checks for duplicates - * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after + * and then adds the ire. ire_add returns the ire after * bumping up once more i.e the reference count is 2. This is to avoid * an extra lookup in the functions calling ire_add which wants to * work with the ire after adding. * - * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD - * macro. It is valid to bump up the referece count of the IRE, + * LOOKUP of an ire bumps up the reference count using ire_refhold + * function. It is valid to bump up the referece count of the IRE, * after the lookup has returned an ire. Following are the lookup * functions that return an HELD ire : * - * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6], - * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6], - * ipif_to_ire[_v6]. + * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6] * * DELETION of an ire holds the bucket lock, removes it from the list * and then decrements the reference count for having removed from the list - * by using the IRE_REFRELE macro. If some other thread has looked up + * by using the ire_refrele function. If some other thread has looked up * the ire, the reference count would have been bumped up and hence * this ire will not be freed once deleted. It will be freed once the * reference count drops to zero. 
@@ -198,27 +191,12 @@ struct kmem_cache *rt_entry_cache; * Add and Delete acquires the bucket lock as RW_WRITER, while all the * lookups acquire the bucket lock as RW_READER. * - * NOTE : The only functions that does the IRE_REFRELE when an ire is - * passed as an argument are : - * - * 1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the - * broadcast ires it looks up internally within - * the function. Currently, for simplicity it does - * not differentiate the one that is passed in and - * the ones it looks up internally. It always - * IRE_REFRELEs. - * 2) ire_send - * ire_send_v6 : As ire_send calls ip_wput_ire and other functions - * that take ire as an argument, it has to selectively - * IRE_REFRELE the ire. To maintain symmetry, - * ire_send_v6 does the same. - * - * Otherwise, the general rule is to do the IRE_REFRELE in the function + * The general rule is to do the ire_refrele in the function * that is passing the ire as an argument. * * In trying to locate ires the following points are to be noted. * - * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is + * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is * to be ignored when walking the ires using ire_next. * * Zones note: @@ -230,14 +208,6 @@ struct kmem_cache *rt_entry_cache; */ /* - * The minimum size of IRE cache table. It will be recalcuated in - * ip_ire_init(). - * Setable in /etc/system - */ -uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE; -uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; - -/* * The size of the forwarding table. We will make sure that it is a * power of 2 in ip_ire_init(). * Setable in /etc/system @@ -245,313 +215,213 @@ uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; struct kmem_cache *ire_cache; -static ire_t ire_null; - -/* - * The threshold number of IRE in a bucket when the IREs are - * cleaned up. 
This threshold is calculated later in ip_open() - * based on the speed of CPU and available memory. This default - * value is the maximum. - * - * We have two kinds of cached IRE, temporary and - * non-temporary. Temporary IREs are marked with - * IRE_MARK_TEMPORARY. They are IREs created for non - * TCP traffic and for forwarding purposes. All others - * are non-temporary IREs. We don't mark IRE created for - * TCP as temporary because TCP is stateful and there are - * info stored in the IRE which can be shared by other TCP - * connections to the same destination. For connected - * endpoint, we also don't want to mark the IRE used as - * temporary because the same IRE will be used frequently, - * otherwise, the app should not do a connect(). We change - * the marking at ip_bind_connected_*() if necessary. - * - * We want to keep the cache IRE hash bucket length reasonably - * short, otherwise IRE lookup functions will take "forever." - * We use the "crude" function that the IRE bucket - * length should be based on the CPU speed, which is 1 entry - * per x MHz, depending on the shift factor ip_ire_cpu_ratio - * (n). This means that with a 750MHz CPU, the max bucket - * length can be (750 >> n) entries. - * - * Note that this threshold is separate for temp and non-temp - * IREs. This means that the actual bucket length can be - * twice as that. And while we try to keep temporary IRE - * length at most at the threshold value, we do not attempt to - * make the length for non-temporary IREs fixed, for the - * reason stated above. Instead, we start trying to find - * "unused" non-temporary IREs when the bucket length reaches - * this threshold and clean them up. - * - * We also want to limit the amount of memory used by - * IREs. So if we are allowed to use ~3% of memory (M) - * for those IREs, each bucket should not have more than - * - * M / num of cache bucket / sizeof (ire_t) - * - * Again the above memory uses are separate for temp and - * non-temp cached IREs. 
- * - * We may also want the limit to be a function of the number - * of interfaces and number of CPUs. Doing the initialization - * in ip_open() means that every time an interface is plumbed, - * the max is re-calculated. Right now, we don't do anything - * different. In future, when we have more experience, we - * may want to change this behavior. - */ -uint32_t ip_ire_max_bucket_cnt = 10; /* Setable in /etc/system */ -uint32_t ip6_ire_max_bucket_cnt = 10; -uint32_t ip_ire_cleanup_cnt = 2; - -/* - * The minimum of the temporary IRE bucket count. We do not want - * the length of each bucket to be too short. This may hurt - * performance of some apps as the temporary IREs are removed too - * often. - */ -uint32_t ip_ire_min_bucket_cnt = 3; /* /etc/system - not used */ -uint32_t ip6_ire_min_bucket_cnt = 3; - -/* - * The ratio of memory consumed by IRE used for temporary to available - * memory. This is a shift factor, so 6 means the ratio 1 to 64. This - * value can be changed in /etc/system. 6 is a reasonable number. - */ -uint32_t ip_ire_mem_ratio = 6; /* /etc/system */ -/* The shift factor for CPU speed to calculate the max IRE bucket length. */ -uint32_t ip_ire_cpu_ratio = 7; /* /etc/system */ - -typedef struct nce_clookup_s { - ipaddr_t ncecl_addr; - boolean_t ncecl_found; -} nce_clookup_t; - -/* - * The maximum number of buckets in IRE cache table. In future, we may - * want to make it a dynamic hash table. For the moment, we fix the - * size and allocate the table in ip_ire_init() when IP is first loaded. - * We take into account the amount of memory a system has. - */ -#define IP_MAX_CACHE_TABLE_SIZE 4096 - -/* Setable in /etc/system */ -static uint32_t ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; -static uint32_t ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; +struct kmem_cache *ncec_cache; +struct kmem_cache *nce_cache; -/* Zero iulp_t for initialization. 
*/ -const iulp_t ire_uinfo_null = { 0 }; +static ire_t ire_null; -static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, - ipsq_func_t func, boolean_t); +static ire_t *ire_add_v4(ire_t *ire); static void ire_delete_v4(ire_t *ire); +static void ire_dep_invalidate_children(ire_t *child); static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, ip_stack_t *); static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, uchar_t vers, ill_t *ill); -static void ire_cache_cleanup(irb_t *irb, uint32_t threshold, - ire_t *ref_ire); -static void ip_nce_clookup_and_delete(nce_t *nce, void *arg); -static ire_t *ip4_ctable_lookup_impl(ire_ctable_args_t *margs); #ifdef DEBUG static void ire_trace_cleanup(const ire_t *); #endif /* - * To avoid bloating the code, we call this function instead of - * using the macro IRE_REFRELE. Use macro only in performance - * critical paths. - * - * Must not be called while holding any locks. Otherwise if this is - * the last reference to be released there is a chance of recursive mutex - * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying - * to restart an ioctl. The one exception is when the caller is sure that - * this is not the last reference to be released. Eg. if the caller is - * sure that the ire has not been deleted and won't be deleted. + * Following are the functions to increment/decrement the reference + * count of the IREs and IRBs (ire bucket). + * + * 1) We bump up the reference count of an IRE to make sure that + * it does not get deleted and freed while we are using it. + * Typically all the lookup functions hold the bucket lock, + * and look for the IRE. If it finds an IRE, it bumps up the + * reference count before dropping the lock. Sometimes we *may* want + * to bump up the reference count after we *looked* up i.e without + * holding the bucket lock. So, the ire_refhold function does not assert + * on the bucket lock being held. 
Any thread trying to delete from + * the hash bucket can still do so but cannot free the IRE if + * ire_refcnt is not 0. + * + * 2) We bump up the reference count on the bucket where the IRE resides + * (IRB), when we want to prevent the IREs getting deleted from a given + * hash bucket. This makes life easier for ire_walk type functions which + * wants to walk the IRE list, call a function, but needs to drop + * the bucket lock to prevent recursive rw_enters. While the + * lock is dropped, the list could be changed by other threads or + * the same thread could end up deleting the ire or the ire pointed by + * ire_next. ire_refholding the ire or ire_next is not sufficient as + * a delete will still remove the ire from the bucket while we have + * dropped the lock and hence the ire_next would be NULL. Thus, we + * need a mechanism to prevent deletions from a given bucket. + * + * To prevent deletions, we bump up the reference count on the + * bucket. If the bucket is held, ire_delete just marks both + * the ire and irb as CONDEMNED. When the + * reference count on the bucket drops to zero, all the CONDEMNED ires + * are deleted. We don't have to bump up the reference count on the + * bucket if we are walking the bucket and never have to drop the bucket + * lock. Note that irb_refhold does not prevent addition of new ires + * in the list. It is okay because addition of new ires will not cause + * ire_next to point to freed memory. We do irb_refhold only when + * all of the 3 conditions are true : + * + * 1) The code needs to walk the IRE bucket from start to end. + * 2) It may have to drop the bucket lock sometimes while doing (1) + * 3) It does not want any ires to be deleted meanwhile. + */ + +/* + * Bump up the reference count on the hash bucket - IRB to + * prevent ires from being deleted in this bucket. 
*/ void -ire_refrele(ire_t *ire) +irb_refhold(irb_t *irb) { - IRE_REFRELE(ire); + rw_enter(&irb->irb_lock, RW_WRITER); + irb->irb_refcnt++; + ASSERT(irb->irb_refcnt != 0); + rw_exit(&irb->irb_lock); } void -ire_refrele_notr(ire_t *ire) +irb_refhold_locked(irb_t *irb) { - IRE_REFRELE_NOTR(ire); + ASSERT(RW_WRITE_HELD(&irb->irb_lock)); + irb->irb_refcnt++; + ASSERT(irb->irb_refcnt != 0); } /* - * kmem_cache_alloc constructor for IRE in kma space. - * Note that when ire_mp is set the IRE is stored in that mblk and - * not in this cache. + * Note: when IRB_MARK_DYNAMIC is not set the irb_t + * is statically allocated, so that when the irb_refcnt goes to 0, + * we simply clean up the ire list and continue. */ -/* ARGSUSED */ -static int -ip_ire_constructor(void *buf, void *cdrarg, int kmflags) +void +irb_refrele(irb_t *irb) { - ire_t *ire = buf; + if (irb->irb_marks & IRB_MARK_DYNAMIC) { + irb_refrele_ftable(irb); + } else { + rw_enter(&irb->irb_lock, RW_WRITER); + ASSERT(irb->irb_refcnt != 0); + if (--irb->irb_refcnt == 0 && + (irb->irb_marks & IRB_MARK_CONDEMNED)) { + ire_t *ire_list; + + ire_list = ire_unlink(irb); + rw_exit(&irb->irb_lock); + ASSERT(ire_list != NULL); + ire_cleanup(ire_list); + } else { + rw_exit(&irb->irb_lock); + } + } +} - ire->ire_nce = NULL; - return (0); +/* + * Bump up the reference count on the IRE. We cannot assert that the + * bucket lock is being held as it is legal to bump up the reference + * count after the first lookup has returned the IRE without + * holding the lock. 
+ */ +void +ire_refhold(ire_t *ire) +{ + atomic_add_32(&(ire)->ire_refcnt, 1); + ASSERT((ire)->ire_refcnt != 0); +#ifdef DEBUG + ire_trace_ref(ire); +#endif } -/* ARGSUSED1 */ -static void -ip_ire_destructor(void *buf, void *cdrarg) +void +ire_refhold_notr(ire_t *ire) { - ire_t *ire = buf; + atomic_add_32(&(ire)->ire_refcnt, 1); + ASSERT((ire)->ire_refcnt != 0); +} - ASSERT(ire->ire_nce == NULL); +void +ire_refhold_locked(ire_t *ire) +{ +#ifdef DEBUG + ire_trace_ref(ire); +#endif + ire->ire_refcnt++; } /* - * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY - * IOCTL. It is used by TCP (or other ULPs) to supply revised information - * for an existing CACHED IRE. + * Release a ref on an IRE. + * + * Must not be called while holding any locks. Otherwise if this is + * the last reference to be released there is a chance of recursive mutex + * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying + * to restart an ioctl. The one exception is when the caller is sure that + * this is not the last reference to be released. Eg. if the caller is + * sure that the ire has not been deleted and won't be deleted. + * + * In architectures e.g sun4u, where atomic_add_32_nv is just + * a cas, we need to maintain the right memory barrier semantics + * as that of mutex_exit i.e all the loads and stores should complete + * before the cas is executed. membar_exit() does that here. */ -/* ARGSUSED */ -int -ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +void +ire_refrele(ire_t *ire) { - uchar_t *addr_ucp; - ipic_t *ipic; - ire_t *ire; - ipaddr_t addr; - in6_addr_t v6addr; - irb_t *irb; - zoneid_t zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - ASSERT(q->q_next == NULL); - zoneid = Q_TO_CONN(q)->conn_zoneid; - - /* - * Check privilege using the ioctl credential; if it is NULL - * then this is a kernel message and therefor privileged. 
- */ - if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) - return (EPERM); - - ipic = (ipic_t *)mp->b_rptr; - if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset, - ipic->ipic_addr_length))) { - return (EINVAL); - } - if (!OK_32PTR(addr_ucp)) - return (EINVAL); - switch (ipic->ipic_addr_length) { - case IP_ADDR_LEN: { - /* Extract the destination address. */ - addr = *(ipaddr_t *)addr_ucp; - /* Find the corresponding IRE. */ - ire = ire_cache_lookup(addr, zoneid, NULL, ipst); - break; - } - case IPV6_ADDR_LEN: { - /* Extract the destination address. */ - v6addr = *(in6_addr_t *)addr_ucp; - /* Find the corresponding IRE. */ - ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst); - break; - } - default: - return (EINVAL); - } - - if (ire == NULL) - return (ENOENT); - /* - * Update the round trip time estimate and/or the max frag size - * and/or the slow start threshold. - * - * We serialize multiple advises using ire_lock. - */ - mutex_enter(&ire->ire_lock); - if (ipic->ipic_rtt) { - /* - * If there is no old cached values, initialize them - * conservatively. Set them to be (1.5 * new value). - */ - if (ire->ire_uinfo.iulp_rtt != 0) { - ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt + - ipic->ipic_rtt) >> 1; - } else { - ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt + - (ipic->ipic_rtt >> 1); - } - if (ire->ire_uinfo.iulp_rtt_sd != 0) { - ire->ire_uinfo.iulp_rtt_sd = - (ire->ire_uinfo.iulp_rtt_sd + - ipic->ipic_rtt_sd) >> 1; - } else { - ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd + - (ipic->ipic_rtt_sd >> 1); - } - } - if (ipic->ipic_max_frag) - ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET); - if (ipic->ipic_ssthresh != 0) { - if (ire->ire_uinfo.iulp_ssthresh != 0) - ire->ire_uinfo.iulp_ssthresh = - (ipic->ipic_ssthresh + - ire->ire_uinfo.iulp_ssthresh) >> 1; - else - ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh; - } - /* - * Don't need the ire_lock below this. ire_type does not change - * after initialization. 
ire_marks is protected by irb_lock. - */ - mutex_exit(&ire->ire_lock); - - if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) { - /* - * Only increment the temporary IRE count if the original - * IRE is not already marked temporary. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) && - !(ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb->irb_tmp_ire_cnt++; - } - ire->ire_marks |= ipic->ipic_ire_marks; - rw_exit(&irb->irb_lock); - } +#ifdef DEBUG + ire_untrace_ref(ire); +#endif + ASSERT((ire)->ire_refcnt != 0); + membar_exit(); + if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) + ire_inactive(ire); +} - ire_refrele(ire); - return (0); +void +ire_refrele_notr(ire_t *ire) +{ + ASSERT((ire)->ire_refcnt != 0); + membar_exit(); + if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) + ire_inactive(ire); } /* * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] - * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE - * for a host that is not responding. This will force an attempt to - * establish a new route, if available, and flush out the ARP entry so - * it will re-resolve. Management processes may want to use the - * version that generates a reply. - * - * This function does not support IPv6 since Neighbor Unreachability Detection - * means that negative advise like this is useless. + * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is + * having problems reaching a particular destination. + * This will make IP consider alternate routes (e.g., when there are + * muliple default routes), and it will also make IP discard any (potentially) + * stale redirect. + * Management processes may want to use the version that generates a reply. 
+ * + * With the use of NUD like behavior for IPv4/ARP in addition to IPv6 + * this function shouldn't be necessary for IP to recover from a bad redirect, + * a bad default router (when there are multiple default routers), or + * a stale ND/ARP entry. But we retain it in any case. + * For instance, this is helpful when TCP suspects a failure before NUD does. */ -/* ARGSUSED */ int ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) { uchar_t *addr_ucp; - ipaddr_t addr; + uint_t ipversion; + sin_t *sin; + sin6_t *sin6; + ipaddr_t v4addr; + in6_addr_t v6addr; ire_t *ire; ipid_t *ipid; - boolean_t routing_sock_info = B_FALSE; /* Sent info? */ zoneid_t zoneid; - ire_t *gire = NULL; - ill_t *ill; - mblk_t *arp_mp; ip_stack_t *ipst; ASSERT(q->q_next == NULL); - zoneid = Q_TO_CONN(q)->conn_zoneid; + zoneid = IPCL_ZONEID(Q_TO_CONN(q)); ipst = CONNQ_TO_IPST(q); /* @@ -563,948 +433,192 @@ ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) ipid = (ipid_t *)mp->b_rptr; - /* Only actions on IRE_CACHEs are acceptable at present. */ - if (ipid->ipid_ire_type != IRE_CACHE) - return (EINVAL); - addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, ipid->ipid_addr_length); if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) return (EINVAL); switch (ipid->ipid_addr_length) { - case IP_ADDR_LEN: - /* addr_ucp points at IP addr */ - break; - case sizeof (sin_t): { - sin_t *sin; + case sizeof (sin_t): /* * got complete (sockaddr) address - increment addr_ucp to point * at the ip_addr field. */ sin = (sin_t *)addr_ucp; addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; + ipversion = IPV4_VERSION; + break; + case sizeof (sin6_t): + /* + * got complete (sockaddr) address - increment addr_ucp to point + * at the ip_addr field. + */ + sin6 = (sin6_t *)addr_ucp; + addr_ucp = (uchar_t *)&sin6->sin6_addr; + ipversion = IPV6_VERSION; break; - } default: return (EINVAL); } - /* Extract the destination address. */ - bcopy(addr_ucp, &addr, IP_ADDR_LEN); - - /* Try to find the CACHED IRE. 
*/ - ire = ire_cache_lookup(addr, zoneid, NULL, ipst); - - /* Nail it. */ - if (ire) { - /* Allow delete only on CACHE entries */ - if (ire->ire_type != IRE_CACHE) { - ire_refrele(ire); - return (EINVAL); - } - - /* - * Verify that the IRE has been around for a while. - * This is to protect against transport protocols - * that are too eager in sending delete messages. - */ - if (gethrestime_sec() < - ire->ire_create_time + ipst->ips_ip_ignore_delete_time) { - ire_refrele(ire); - return (EINVAL); - } - /* - * Now we have a potentially dead cache entry. We need - * to remove it. - * If this cache entry is generated from a - * default route (i.e., ire_cmask == 0), - * search the default list and mark it dead and some - * background process will try to activate it. - */ - if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) { - /* - * Make sure that we pick a different - * IRE_DEFAULT next time. - */ - ire_t *gw_ire; - irb_t *irb = NULL; - uint_t match_flags; - - match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE); - - gire = ire_ftable_lookup(ire->ire_addr, - ire->ire_cmask, 0, 0, - ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags, - ipst); - - ip3dbg(("ire_ftable_lookup() returned gire %p\n", - (void *)gire)); - - if (gire != NULL) { - irb = gire->ire_bucket; - - /* - * We grab it as writer just to serialize - * multiple threads trying to bump up - * irb_rr_origin - */ - rw_enter(&irb->irb_lock, RW_WRITER); - if ((gw_ire = irb->irb_rr_origin) == NULL) { - rw_exit(&irb->irb_lock); - goto done; - } - - DTRACE_PROBE1(ip__ire__del__origin, - (ire_t *), gw_ire); - - /* Skip past the potentially bad gateway */ - if (ire->ire_gateway_addr == - gw_ire->ire_gateway_addr) { - ire_t *next = gw_ire->ire_next; - - DTRACE_PROBE2(ip__ire__del, - (ire_t *), gw_ire, (irb_t *), irb); - IRE_FIND_NEXT_ORIGIN(next); - irb->irb_rr_origin = next; - } - rw_exit(&irb->irb_lock); - } - } -done: - if (gire != NULL) - IRE_REFRELE(gire); - /* report the bad route to routing sockets */ 
- ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr, - ire->ire_mask, ire->ire_src_addr, 0, 0, 0, - (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst); - routing_sock_info = B_TRUE; + if (ipversion == IPV4_VERSION) { + /* Extract the destination address. */ + bcopy(addr_ucp, &v4addr, IP_ADDR_LEN); - /* - * TCP is really telling us to start over completely, and it - * expects that we'll resend the ARP query. Tell ARP to - * discard the entry, if this is a local destination. - * - * But, if the ARP entry is permanent then it shouldn't be - * deleted, so we set ARED_F_PRESERVE_PERM. - */ - ill = ire->ire_stq->q_ptr; - if (ire->ire_gateway_addr == 0 && - (arp_mp = ill_ared_alloc(ill, addr)) != NULL) { - ared_t *ared = (ared_t *)arp_mp->b_rptr; - - ASSERT(ared->ared_cmd == AR_ENTRY_DELETE); - ared->ared_flags |= ARED_F_PRESERVE_PERM; - putnext(ill->ill_rq, arp_mp); - } + ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + /* Extract the destination address. */ + bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN); - ire_delete(ire); - ire_refrele(ire); + ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); } - /* - * Also look for an IRE_HOST type redirect ire and - * remove it if present. - */ - ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - /* Nail it. 
*/ if (ire != NULL) { - if (ire->ire_flags & RTF_DYNAMIC) { - if (!routing_sock_info) { - ip_rts_change(RTM_LOSING, ire->ire_addr, - ire->ire_gateway_addr, ire->ire_mask, - ire->ire_src_addr, 0, 0, 0, - (RTA_DST | RTA_GATEWAY | - RTA_NETMASK | RTA_IFA), - ipst); - } - ire_delete(ire); - } + if (ipversion == IPV4_VERSION) { + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); + } + (void) ire_no_good(ire); ire_refrele(ire); } return (0); } /* - * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed - * down from the Upper Level Protocol to request a copy of the IRE (to check - * its type or to extract information like round-trip time estimates or the - * MTU.) - * The address is assumed to be in the ire_addr field. If no IRE is found - * an IRE is returned with ire_type being zero. - * Note that the upper lavel protocol has to check for broadcast - * (IRE_BROADCAST) and multicast (CLASSD(addr)). - * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the - * end of the returned message. - * - * TCP sends down a message of this type with a connection request packet - * chained on. UDP and ICMP send it down to verify that a route exists for - * the destination address when they get connected. - */ -void -ip_ire_req(queue_t *q, mblk_t *mp) -{ - ire_t *inire; - ire_t *ire; - mblk_t *mp1; - ire_t *sire = NULL; - zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - ASSERT(q->q_next == NULL); - - if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) || - !OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } - inire = (ire_t *)mp->b_rptr; - /* - * Got it, now take our best shot at an IRE. 
- */ - if (inire->ire_ipversion == IPV6_VERSION) { - ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0, - NULL, &sire, zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - } else { - ASSERT(inire->ire_ipversion == IPV4_VERSION); - ire = ire_route_lookup(inire->ire_addr, 0, 0, 0, - NULL, &sire, zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - } - - /* - * We prevent returning IRES with source address INADDR_ANY - * as these were temporarily created for sending packets - * from endpoints that have conn_unspec_src set. - */ - if (ire == NULL || - (ire->ire_ipversion == IPV4_VERSION && - ire->ire_src_addr == INADDR_ANY) || - (ire->ire_ipversion == IPV6_VERSION && - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) { - inire->ire_type = 0; - } else { - bcopy(ire, inire, sizeof (ire_t)); - /* Copy the route metrics from the parent. */ - if (sire != NULL) { - bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo), - sizeof (iulp_t)); - } - - /* Pass the latest setting of the ip_path_mtu_discovery */ - inire->ire_frag_flag |= - (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; - } - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - mp->b_wptr = &mp->b_rptr[sizeof (ire_t)]; - mp->b_datap->db_type = IRE_DB_TYPE; - - /* Put the IRE_DB_TYPE mblk last in the chain */ - mp1 = mp->b_cont; - if (mp1 != NULL) { - mp->b_cont = NULL; - linkb(mp1, mp); - mp = mp1; - } - qreply(q, mp); -} - -/* - * Send a packet using the specified IRE. - * If ire_src_addr_v6 is all zero then discard the IRE after - * the packet has been sent. 
- */ -static void -ire_send(queue_t *q, mblk_t *pkt, ire_t *ire) -{ - mblk_t *ipsec_mp; - boolean_t is_secure; - uint_t ifindex; - ill_t *ill; - zoneid_t zoneid = ire->ire_zoneid; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ - ipsec_mp = pkt; - is_secure = (pkt->b_datap->db_type == M_CTL); - if (is_secure) { - ipsec_out_t *io; - - pkt = pkt->b_cont; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (io->ipsec_out_type == IPSEC_OUT) - zoneid = io->ipsec_out_zoneid; - } - - /* If the packet originated externally then */ - if (pkt->b_prev) { - ire_refrele(ire); - /* - * Extract the ifindex from b_prev (set in ip_rput_noire). - * Look up interface to see if it still exists (it could have - * been unplumbed by the time the reply came back from ARP) - */ - ifindex = (uint_t)(uintptr_t)pkt->b_prev; - ill = ill_lookup_on_ifindex(ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - pkt->b_prev = NULL; - pkt->b_next = NULL; - freemsg(ipsec_mp); - return; - } - q = ill->ill_rq; - pkt->b_prev = NULL; - /* - * This packet has not gone through IPSEC processing - * and hence we should not have any IPSEC message - * prepended. - */ - ASSERT(ipsec_mp == pkt); - put(q, pkt); - ill_refrele(ill); - } else if (pkt->b_next) { - /* Packets from multicast router */ - pkt->b_next = NULL; - /* - * We never get the IPSEC_OUT while forwarding the - * packet for multicast router. - */ - ASSERT(ipsec_mp == pkt); - ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL); - ire_refrele(ire); - } else { - /* Locally originated packets */ - boolean_t delete_ire = B_FALSE; - ipha_t *ipha = (ipha_t *)pkt->b_rptr; - - /* - * If this IRE shouldn't be kept in the table (because its - * source address is unspecified), hold a reference to it so - * we can delete it even after e.g. ip_wput_ire() has dropped - * its reference. 
- */ - if (!(ire->ire_marks & IRE_MARK_NOADD) && - ire->ire_src_addr == INADDR_ANY) { - delete_ire = B_TRUE; - IRE_REFHOLD(ire); - } - - /* - * If we were resolving a router we can not use the - * routers IRE for sending the packet (since it would - * violate the uniqness of the IP idents) thus we - * make another pass through ip_wput to create the IRE_CACHE - * for the destination. - * When IRE_MARK_NOADD is set, ire_add() is not called. - * Thus ip_wput() will never find a ire and result in an - * infinite loop. Thus we check whether IRE_MARK_NOADD is - * is set. This also implies that IRE_MARK_NOADD can only be - * used to send packets to directly connected hosts. - */ - if (ipha->ipha_dst != ire->ire_addr && - !(ire->ire_marks & IRE_MARK_NOADD)) { - ire_refrele(ire); /* Held in ire_add */ - if (CONN_Q(q)) { - (void) ip_output(Q_TO_CONN(q), ipsec_mp, q, - IRE_SEND); - } else { - (void) ip_output((void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } else { - if (is_secure) { - ipsec_out_t *oi; - ipha_t *ipha; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (oi->ipsec_out_proc_begin) { - /* - * This is the case where - * ip_wput_ipsec_out could not find - * the IRE and recreated a new one. - * As ip_wput_ipsec_out does ire - * lookups, ire_refrele for the extra - * bump in ire_add. - */ - ire_refrele(ire); - ip_wput_ipsec_out(q, ipsec_mp, ipha, - NULL, NULL); - } else { - /* - * IRE_REFRELE will be done in - * ip_wput_ire. - */ - ip_wput_ire(q, ipsec_mp, ire, NULL, - IRE_SEND, zoneid); - } - } else { - /* - * IRE_REFRELE will be done in ip_wput_ire. - */ - ip_wput_ire(q, ipsec_mp, ire, NULL, - IRE_SEND, zoneid); - } - } - /* - * Special code to support sending a single packet with - * conn_unspec_src using an IRE which has no source address. - * The IRE is deleted here after sending the packet to avoid - * having other code trip on it. But before we delete the - * ire, somebody could have looked up this ire. 
- * We prevent returning/using this IRE by the upper layers - * by making checks to NULL source address in other places - * like e.g ip_ire_append, ip_ire_req and ip_bind_connected. - * Though this does not completely prevent other threads - * from using this ire, this should not cause any problems. - */ - if (delete_ire) { - ip1dbg(("ire_send: delete IRE\n")); - ire_delete(ire); - ire_refrele(ire); /* Held above */ - } - } -} - -/* - * Send a packet using the specified IRE. - * If ire_src_addr_v6 is all zero then discard the IRE after - * the packet has been sent. - */ -static void -ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire) -{ - mblk_t *ipsec_mp; - boolean_t secure; - uint_t ifindex; - zoneid_t zoneid = ire->ire_zoneid; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ - if (pkt->b_datap->db_type == M_CTL) { - ipsec_out_t *io; - - ipsec_mp = pkt; - pkt = pkt->b_cont; - secure = B_TRUE; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (io->ipsec_out_type == IPSEC_OUT) - zoneid = io->ipsec_out_zoneid; - } else { - ipsec_mp = pkt; - secure = B_FALSE; - } - - /* If the packet originated externally then */ - if (pkt->b_prev) { - ill_t *ill; - /* - * Extract the ifindex from b_prev (set in ip_rput_data_v6). - * Look up interface to see if it still exists (it could have - * been unplumbed by the time the reply came back from the - * resolver). - */ - ifindex = (uint_t)(uintptr_t)pkt->b_prev; - ill = ill_lookup_on_ifindex(ifindex, B_TRUE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - pkt->b_prev = NULL; - pkt->b_next = NULL; - freemsg(ipsec_mp); - ire_refrele(ire); /* Held in ire_add */ - return; - } - q = ill->ill_rq; - pkt->b_prev = NULL; - /* - * This packet has not gone through IPSEC processing - * and hence we should not have any IPSEC message - * prepended. 
- */ - ASSERT(ipsec_mp == pkt); - put(q, pkt); - ill_refrele(ill); - } else if (pkt->b_next) { - /* Packets from multicast router */ - pkt->b_next = NULL; - /* - * We never get the IPSEC_OUT while forwarding the - * packet for multicast router. - */ - ASSERT(ipsec_mp == pkt); - /* - * XXX TODO IPv6. - */ - freemsg(pkt); -#ifdef XXX - ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL); -#endif - } else { - if (secure) { - ipsec_out_t *oi; - ip6_t *ip6h; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr; - if (oi->ipsec_out_proc_begin) { - /* - * This is the case where - * ip_wput_ipsec_out could not find - * the IRE and recreated a new one. - */ - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, - NULL, NULL); - } else { - if (CONN_Q(q)) { - (void) ip_output_v6(Q_TO_CONN(q), - ipsec_mp, q, IRE_SEND); - } else { - (void) ip_output_v6( - (void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } - } else { - /* - * Send packets through ip_output_v6 so that any - * ip6_info header can be processed again. - */ - if (CONN_Q(q)) { - (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q, - IRE_SEND); - } else { - (void) ip_output_v6((void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } - /* - * Special code to support sending a single packet with - * conn_unspec_src using an IRE which has no source address. - * The IRE is deleted here after sending the packet to avoid - * having other code trip on it. But before we delete the - * ire, somebody could have looked up this ire. - * We prevent returning/using this IRE by the upper layers - * by making checks to NULL source address in other places - * like e.g ip_ire_append_v6, ip_ire_req and - * ip_bind_connected_v6. Though, this does not completely - * prevent other threads from using this ire, this should - * not cause any problems. 
- */ - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { - ip1dbg(("ire_send_v6: delete IRE\n")); - ire_delete(ire); - } - } - ire_refrele(ire); /* Held in ire_add */ -} - -/* - * Make sure that IRE bucket does not get too long. - * This can cause lock up because ire_cache_lookup() - * may take "forever" to finish. - * - * We only remove a maximum of cnt IREs each time. This - * should keep the bucket length approximately constant, - * depending on cnt. This should be enough to defend - * against DoS attack based on creating temporary IREs - * (for forwarding and non-TCP traffic). - * - * We also pass in the address of the newly created IRE - * as we do not want to remove this straight after adding - * it. New IREs are normally added at the tail of the - * bucket. This means that we are removing the "oldest" - * temporary IREs added. Only if there are IREs with - * the same ire_addr, do we not add it at the tail. Refer - * to ire_add_v*(). It should be OK for our purpose. - * - * For non-temporary cached IREs, we make sure that they - * have not been used for some time (defined below), they - * are non-local destinations, and there is no one using - * them at the moment (refcnt == 1). - * - * The above means that the IRE bucket length may become - * very long, consisting of mostly non-temporary IREs. - * This can happen when the hash function does a bad job - * so that most TCP connections cluster to a specific bucket. - * This "hopefully" should never happen. It can also - * happen if most TCP connections have very long lives. - * Even with the minimal hash table size of 256, there - * has to be a lot of such connections to make the bucket - * length unreasonably long. This should probably not - * happen either. The third can when this can happen is - * when the machine is under attack, such as SYN flooding. - * TCP should already have the proper mechanism to protect - * that. So we should be safe. 
- * - * This function is called by ire_add_then_send() after - * a new IRE is added and the packet is sent. - * - * The idle cutoff interval is set to 60s. It can be - * changed using /etc/system. - */ -uint32_t ire_idle_cutoff_interval = 60000; - -static void -ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire) -{ - ire_t *ire; - clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000); - int cnt = ip_ire_cleanup_cnt; - - /* - * Try to remove cnt temporary IREs first. - */ - for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) { - if (ire == ref_ire) - continue; - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ASSERT(ire->ire_type == IRE_CACHE); - ire_delete(ire); - cnt--; - } - } - if (cnt == 0) - return; - - /* - * If we didn't satisfy our removal target from temporary IREs - * we see how many non-temporary IREs are currently in the bucket. - * If this quantity is above the threshold then we see if there are any - * candidates for removal. We are still limited to removing a maximum - * of cnt IREs. - */ - if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) { - for (ire = irb->irb_ire; cnt > 0 && ire != NULL; - ire = ire->ire_next) { - if (ire == ref_ire) - continue; - if (ire->ire_type != IRE_CACHE) - continue; - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if ((ire->ire_refcnt == 1) && - (lbolt - ire->ire_last_used_time > cut_off)) { - ire_delete(ire); - cnt--; - } - } - } -} - -/* - * ire_add_then_send is called when a new IRE has been created in order to - * route an outgoing packet. Typically, it is called from ip_wput when - * a response comes back down from a resolver. We add the IRE, and then - * possibly run the packet through ip_wput or ip_rput, as appropriate. - * However, we do not add the newly created IRE in the cache when - * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at - * ip_newroute_ipif(). 
The ires with IRE_MARK_NOADD are ire_refrele'd by - * ip_wput_ire() and get deleted. - * Multirouting support: the packet is silently discarded when the new IRE - * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the - * RTF_MULTIRT flag for the same destination address. - * In this case, we just want to register this additional ire without - * sending the packet, as it has already been replicated through - * existing multirt routes in ip_wput(). - */ -void -ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) -{ - irb_t *irb; - boolean_t drop = B_FALSE; - boolean_t mctl_present; - mblk_t *first_mp = NULL; - mblk_t *data_mp = NULL; - ire_t *dst_ire; - ipha_t *ipha; - ip6_t *ip6h; - ip_stack_t *ipst = ire->ire_ipst; - int ire_limit; - - if (mp != NULL) { - /* - * We first have to retrieve the destination address carried - * by the packet. - * We can't rely on ire as it can be related to a gateway. - * The destination address will help in determining if - * other RTF_MULTIRT ires are already registered. - * - * We first need to know where we are going : v4 or V6. - * the ire version is enough, as there is no risk that - * we resolve an IPv6 address with an IPv4 ire - * or vice versa. - */ - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - data_mp = mp; - mp = first_mp; - if (ire->ire_ipversion == IPV4_VERSION) { - ipha = (ipha_t *)data_mp->b_rptr; - dst_ire = ire_cache_lookup(ipha->ipha_dst, - ire->ire_zoneid, msg_getlabel(mp), ipst); - } else { - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ip6h = (ip6_t *)data_mp->b_rptr; - dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, - ire->ire_zoneid, msg_getlabel(mp), ipst); - } - if (dst_ire != NULL) { - if (dst_ire->ire_flags & RTF_MULTIRT) { - /* - * At least one resolved multirt route - * already exists for the destination, - * don't sent this packet: either drop it - * or complete the pending resolution, - * depending on the ire. 
- */ - drop = B_TRUE; - } - ip1dbg(("ire_add_then_send: dst_ire %p " - "[dst %08x, gw %08x], drop %d\n", - (void *)dst_ire, - (dst_ire->ire_ipversion == IPV4_VERSION) ? \ - ntohl(dst_ire->ire_addr) : \ - ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)), - (dst_ire->ire_ipversion == IPV4_VERSION) ? \ - ntohl(dst_ire->ire_gateway_addr) : \ - ntohl(V4_PART_OF_V6( - dst_ire->ire_gateway_addr_v6)), - drop)); - ire_refrele(dst_ire); - } - } - - if (!(ire->ire_marks & IRE_MARK_NOADD)) { - /* Regular packets with cache bound ires are here. */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - - if (ire == NULL) { - mp->b_prev = NULL; - mp->b_next = NULL; - MULTIRT_DEBUG_UNTAG(mp); - freemsg(mp); - return; - } - if (mp == NULL) { - ire_refrele(ire); /* Held in ire_add_v4/v6 */ - return; - } - } - if (drop) { - /* - * If we're adding an RTF_MULTIRT ire, the resolution - * is over: we just drop the packet. - */ - if (ire->ire_flags & RTF_MULTIRT) { - data_mp->b_prev = NULL; - data_mp->b_next = NULL; - MULTIRT_DEBUG_UNTAG(mp); - freemsg(mp); - } else { - /* - * Otherwise, we're adding the ire to a gateway - * for a multirt route. - * Invoke ip_newroute() to complete the resolution - * of the route. We will then come back here and - * finally drop this packet in the above code. - */ - if (ire->ire_ipversion == IPV4_VERSION) { - /* - * TODO: in order for CGTP to work in non-global - * zones, ip_newroute() must create the IRE - * cache in the zone indicated by - * ire->ire_zoneid. - */ - ip_newroute(q, mp, ipha->ipha_dst, - (CONN_Q(q) ? Q_TO_CONN(q) : NULL), - ire->ire_zoneid, ipst); - } else { - int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - /* - * If necessary, skip over the ip6i_t to find - * the header with the actual source address. 
- */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - if (MBLKL(data_mp) < minlen && - pullupmsg(data_mp, -1) == 0) { - ip1dbg(("ire_add_then_send: " - "cannot pullupmsg ip6i\n")); - if (mctl_present) - freeb(first_mp); - ire_refrele(ire); - return; - } - ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); - ip6h = (ip6_t *)(data_mp->b_rptr + - sizeof (ip6i_t)); - } - ip_newroute_v6(q, mp, &ip6h->ip6_dst, - &ip6h->ip6_src, NULL, ire->ire_zoneid, - ipst); - } - } - - ire_refrele(ire); /* As done by ire_send(). */ - return; - } - /* - * Need to remember ire_bucket here as ire_send*() may delete - * the ire so we cannot reference it after that. - */ - irb = ire->ire_bucket; - if (ire->ire_ipversion == IPV4_VERSION) { - ire_send(q, mp, ire); - ire_limit = ip_ire_max_bucket_cnt; - } else { - ire_send_v6(q, mp, ire); - ire_limit = ip6_ire_max_bucket_cnt; - } - - /* - * irb is NULL if the IRE was not added to the hash. This happens - * when IRE_MARK_NOADD is set and when IREs are returned from - * ire_update_srcif_v4(). - */ - if (irb != NULL) { - IRB_REFHOLD(irb); - if (irb->irb_ire_cnt > ire_limit) - ire_cache_cleanup(irb, ire_limit, ire); - IRB_REFRELE(irb); - } -} - -/* * Initialize the ire that is specific to IPv4 part and call * ire_init_common to finish it. + * Returns zero or errno. */ -ire_t * -ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr, - uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, - queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, - uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +int +ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway, + ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, + tsol_gc_t *gc, ip_stack_t *ipst) { - ASSERT(type != IRE_CACHE || stq != NULL); + int error; + /* * Reject IRE security attribute creation/initialization * if system is not running in Trusted mode. 
*/ - if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) - return (NULL); + if (gc != NULL && !is_system_labeled()) + return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); if (addr != NULL) bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); - if (src_addr != NULL) - bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN); - if (mask != NULL) { - bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); - ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); - } - if (gateway != NULL) { + if (gateway != NULL) bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); + + /* Make sure we don't have stray values in some fields */ + switch (type) { + case IRE_LOOPBACK: + bcopy(&ire->ire_addr, &ire->ire_gateway_addr, IP_ADDR_LEN); + /* FALLTHRU */ + case IRE_HOST: + case IRE_BROADCAST: + case IRE_LOCAL: + case IRE_IF_CLONE: + ire->ire_mask = IP_HOST_MASK; + ire->ire_masklen = IPV4_ABITS; + break; + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + if (mask != NULL) { + bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); + ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); + } + break; + case IRE_MULTICAST: + case IRE_NOROUTE: + ASSERT(mask == NULL); + break; + default: + ASSERT(0); + return (EINVAL); } - if (type == IRE_CACHE) - ire->ire_cmask = cmask; + error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION, + gc, ipst); + if (error != NULL) + return (error); - /* ire_init_common will free the mblks upon encountering any failure */ - if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif, - phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst)) - return (NULL); + /* Determine which function pointers to use */ + ire->ire_postfragfn = ip_xmit; /* Common case */ - return (ire); + switch (ire->ire_type) { + case IRE_LOCAL: + ire->ire_sendfn = ire_send_local_v4; + ire->ire_recvfn = ire_recv_local_v4; +#ifdef SO_VRRP + ASSERT(ire->ire_ill != NULL); + if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { + ire->ire_noaccept = 
B_TRUE; + ire->ire_recvfn = ire_recv_noaccept_v6; + } +#endif + break; + case IRE_LOOPBACK: + ire->ire_sendfn = ire_send_local_v4; + ire->ire_recvfn = ire_recv_loopback_v4; + break; + case IRE_BROADCAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_broadcast_v4; + ire->ire_recvfn = ire_recv_broadcast_v4; + break; + case IRE_MULTICAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_multicast_v4; + ire->ire_recvfn = ire_recv_multicast_v4; + break; + default: + /* + * For IRE_IF_ALL and IRE_OFFLINK we forward received + * packets by default. + */ + ire->ire_sendfn = ire_send_wire_v4; + ire->ire_recvfn = ire_recv_forward_v4; + break; + } + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire->ire_sendfn = ire_send_noroute_v4; + ire->ire_recvfn = ire_recv_noroute_v4; + } else if (ire->ire_flags & RTF_MULTIRT) { + ire->ire_postfragfn = ip_postfrag_multirt_v4; + ire->ire_sendfn = ire_send_multirt_v4; + /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */ + if (ire->ire_type != IRE_BROADCAST) + ire->ire_recvfn = ire_recv_multirt_v4; + } + ire->ire_nce_capable = ire_determine_nce_capable(ire); + return (0); } /* - * Similar to ire_create except that it is called only when - * we want to allocate ire as an mblk e.g. we have an external - * resolver ARP. 
+ * Determine ire_nce_capable */ -ire_t * -ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, - uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, - uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, - ip_stack_t *ipst) +boolean_t +ire_determine_nce_capable(ire_t *ire) { - ire_t *ire, *buf; - ire_t *ret_ire; - mblk_t *mp; - size_t bufsize; - frtn_t *frtnp; - ill_t *ill; + int max_masklen; - bufsize = sizeof (ire_t) + sizeof (frtn_t); - buf = kmem_alloc(bufsize, KM_NOSLEEP); - if (buf == NULL) { - ip1dbg(("ire_create_mp: alloc failed\n")); - return (NULL); - } - frtnp = (frtn_t *)(buf + 1); - frtnp->free_arg = (caddr_t)buf; - frtnp->free_func = ire_freemblk; + if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + (ire->ire_type & IRE_MULTICAST)) + return (B_TRUE); - /* - * Allocate the new IRE. The ire created will hold a ref on - * an nce_t after ire_nce_init, and this ref must either be - * (a) transferred to the ire_cache entry created when ire_add_v4 - * is called after successful arp resolution, or, - * (b) released, when arp resolution fails - * Case (b) is handled in ire_freemblk() which will be called - * when mp is freed as a result of failed arp. - */ - mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (mp == NULL) { - ip1dbg(("ire_create_mp: alloc failed\n")); - kmem_free(buf, bufsize); - return (NULL); - } - ire = (ire_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&ire[1]; + if (ire->ire_ipversion == IPV4_VERSION) + max_masklen = IPV4_ABITS; + else + max_masklen = IPV6_ABITS; - /* Start clean. 
*/ - *ire = ire_null; - ire->ire_mp = mp; - mp->b_datap->db_type = IRE_DB_TYPE; - ire->ire_marks |= IRE_MARK_UNCACHED; - - ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce, - rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc, - gcgrp, ipst); - - ill = (ill_t *)(stq->q_ptr); - if (ret_ire == NULL) { - /* ire_freemblk needs these set */ - ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; - ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ire->ire_ipst = ipst; - freeb(ire->ire_mp); - return (NULL); - } - ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; - ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ASSERT(ret_ire == ire); - ASSERT(ret_ire->ire_ipst == ipst); - /* - * ire_max_frag is normally zero here and is atomically set - * under the irebucket lock in ire_add_v[46] except for the - * case of IRE_MARK_NOADD. In that event the the ire_max_frag - * is non-zero here. - */ - ire->ire_max_frag = max_frag; - return (ire); + if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen) + return (B_TRUE); + return (B_FALSE); } /* @@ -1514,49 +628,43 @@ ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, * by this function. 
*/ ire_t * -ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, - uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, - ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, - uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway, + ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, + ip_stack_t *ipst) { ire_t *ire; - ire_t *ret_ire; + int error; ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { - ip1dbg(("ire_create: alloc failed\n")); + DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; - ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp, - src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags, - ulp_info, gc, gcgrp, ipst); - - if (ret_ire == NULL) { + error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags, + gc, ipst); + if (error != 0) { + DTRACE_PROBE2(ire__init, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } - ASSERT(ret_ire == ire); return (ire); } /* * Common to IPv4 and IPv6 + * Returns zero or errno. */ -boolean_t -ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, - queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle, - uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +int +ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid, + uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst) { - ire->ire_max_fragp = max_fragp; - ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? 
IPH_DF : 0; + int error; #ifdef DEBUG - if (ipif != NULL) { - if (ipif->ipif_isv6) + if (ill != NULL) { + if (ill->ill_isv6) ASSERT(ipversion == IPV6_VERSION); else ASSERT(ipversion == IPV4_VERSION); @@ -1565,223 +673,73 @@ ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, /* * Create/initialize IRE security attribute only in Trusted mode; - * if the passed in gc/gcgrp is non-NULL, we expect that the caller + * if the passed in gc is non-NULL, we expect that the caller * has held a reference to it and will release it when this routine * returns a failure, otherwise we own the reference. We do this * prior to initializing the rest IRE fields. - * - * Don't allocate ire_gw_secattr for the resolver case to prevent - * memory leak (in case of external resolution failure). We'll - * allocate it after a successful external resolution, in ire_add(). - * Note that ire->ire_mp != NULL here means this ire is headed - * to an external resolver. */ if (is_system_labeled()) { if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | - IRE_INTERFACE)) != 0) { + IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) { /* release references on behalf of caller */ if (gc != NULL) GC_REFRELE(gc); - if (gcgrp != NULL) - GCGRP_REFRELE(gcgrp); - } else if ((ire->ire_mp == NULL) && - tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { - return (B_FALSE); + } else { + error = tsol_ire_init_gwattr(ire, ipversion, gc); + if (error != 0) + return (error); } } - ire->ire_stq = stq; - ire->ire_rfq = rfq; ire->ire_type = type; ire->ire_flags = RTF_UP | flags; - ire->ire_ident = TICK_TO_MSEC(lbolt); - bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); - - ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; - ire->ire_last_used_time = lbolt; ire->ire_create_time = (uint32_t)gethrestime_sec(); + ire->ire_generation = IRE_GENERATION_INITIAL; /* - * If this IRE is an IRE_CACHE, inherit the handles from the - * parent IREs. 
For others in the forwarding table, assign appropriate - * new ones. + * The ill_ire_cnt isn't increased until + * the IRE is added to ensure that a walker will find + * all IREs that hold a reference on an ill. * - * The mutex protecting ire_handle is because ire_create is not always - * called as a writer. + * Note that ill_ire_multicast doesn't hold a ref on the ill since + * ire_add() is not called for the IRE_MULTICAST. */ - if (ire->ire_type & IRE_OFFSUBNET) { - mutex_enter(&ipst->ips_ire_handle_lock); - ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++; - mutex_exit(&ipst->ips_ire_handle_lock); - } else if (ire->ire_type & IRE_INTERFACE) { - mutex_enter(&ipst->ips_ire_handle_lock); - ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++; - mutex_exit(&ipst->ips_ire_handle_lock); - } else if (ire->ire_type == IRE_CACHE) { - ire->ire_phandle = phandle; - ire->ire_ihandle = ihandle; - } - ire->ire_ipif = ipif; - if (ipif != NULL) { - ire->ire_ipif_seqid = ipif->ipif_seqid; - ire->ire_ipif_ifindex = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - ire->ire_zoneid = ipif->ipif_zoneid; - } else { - ire->ire_zoneid = GLOBAL_ZONEID; - } + ire->ire_ill = ill; + ire->ire_zoneid = zoneid; ire->ire_ipversion = ipversion; + mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); - if (ipversion == IPV4_VERSION) { - /* - * IPv6 initializes the ire_nce in ire_add_v6, which expects - * to find the ire_nce to be null when it is called. - */ - if (ire_nce_init(ire, src_nce) != 0) { - /* some failure occurred. propagate error back */ - return (B_FALSE); - } - } ire->ire_refcnt = 1; + ire->ire_identical_ref = 1; /* Number of ire_delete's needed */ ire->ire_ipst = ipst; /* No netstack_hold */ ire->ire_trace_disable = B_FALSE; - return (B_TRUE); + return (0); } /* - * This routine is called repeatedly by ipif_up to create broadcast IREs. - * It is passed a pointer to a slot in an IRE pointer array into which to - * place the pointer to the new IRE, if indeed we create one. 
If the - * IRE corresponding to the address passed in would be a duplicate of an - * existing one, we don't create the new one. irep is incremented before - * return only if we do create a new IRE. (Always called as writer.) + * This creates an IRE_BROADCAST based on the arguments. + * A mirror is ire_lookup_bcast(). * - * Note that with the "match_flags" parameter, we can match on either - * a particular logical interface (MATCH_IRE_IPIF) or for all logical - * interfaces for a given physical interface (MATCH_IRE_ILL). Currently, - * we only create broadcast ire's on a per physical interface basis. If - * someone is going to be mucking with logical interfaces, it is important - * to call "ipif_check_bcast_ires()" to make sure that any change to a - * logical interface will not cause critical broadcast IRE's to be deleted. - */ -ire_t ** -ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, - int match_flags) -{ - ire_t *ire; - uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; - boolean_t prefer; - ill_t *ill = ipif->ipif_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * No broadcast IREs for the LOOPBACK interface - * or others such as point to point and IPIF_NOXMIT. - */ - if (!(ipif->ipif_flags & IPIF_BROADCAST) || - (ipif->ipif_flags & IPIF_NOXMIT)) - return (irep); - - /* - * If this new IRE would be a duplicate, only prefer it if one of - * the following is true: - * - * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST - * set and the new one has all of those clear. - * - * 2. The existing one corresponds to an underlying ILL in an IPMP - * group and the new one corresponds to an IPMP group interface. 
- */ - if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, - ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { - prefer = ((ire->ire_ipif->ipif_flags & check_flags) && - !(ipif->ipif_flags & check_flags)) || - (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); - if (!prefer) { - ire_refrele(ire); - return (irep); - } - - /* - * Bcast ires exist in pairs. Both have to be deleted, - * Since we are exclusive we can make the above assertion. - * The 1st has to be refrele'd since it was ctable_lookup'd. - */ - ASSERT(IAM_WRITER_IPIF(ipif)); - ASSERT(ire->ire_next->ire_addr == ire->ire_addr); - ire_delete(ire->ire_next); - ire_delete(ire); - ire_refrele(ire); - } - return (ire_create_bcast(ipif, addr, irep)); -} - -uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; - -/* - * This routine is called from ipif_check_bcast_ires and ire_check_bcast. - * It leaves all the verifying and deleting to those routines. So it always - * creates 2 bcast ires and chains them into the ire array passed in. + * Any supression of unneeded ones is done in ire_add_v4. + * We add one IRE_BROADCAST per address. ire_send_broadcast_v4() + * takes care of generating a loopback copy of the packet. */ ire_t ** -ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) +ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep) { - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); + ip_stack_t *ipst = ill->ill_ipst; - if (IS_IPMP(ill)) { - /* - * Broadcast IREs for the IPMP meta-interface use the - * nominated broadcast interface to send and receive packets. - * If there's no nominated interface, send the packets down to - * the IPMP stub driver, which will discard them. If the - * nominated broadcast interface changes, ill_refresh_bcast() - * will refresh the broadcast IREs. 
- */ - if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - ill = ipif->ipif_ill; - } + ASSERT(IAM_WRITER_ILL(ill)); *irep++ = ire_create( (uchar_t *)&addr, /* dest addr */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, /* source addr */ NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - ill->ill_rq, /* recv-from queue */ - ill->ill_wq, /* send-to queue */ IRE_BROADCAST, - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - *irep++ = ire_create( - (uchar_t *)&addr, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, /* source address */ - NULL, /* no gateway */ - &ip_loopback_mtu, /* max frag size */ - NULL, /* no src_nce */ - ill->ill_rq, /* recv-from queue */ - NULL, /* no send-to queue */ - IRE_BROADCAST, /* Needed for fanout in wput */ - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, + ill, + zoneid, + RTF_KERNEL, NULL, ipst); @@ -1789,174 +747,34 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) } /* - * ire_walk routine to delete or update any IRE_CACHE that might contain - * stale information. - * The flags state which entries to delete or update. - * Garbage collection is done separately using kmem alloc callbacks to - * ip_trash_ire_reclaim. - * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME - * since other stale information is cleaned up using NUD. 
- */ -void -ire_expire(ire_t *ire, char *arg) -{ - ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; - ill_t *stq_ill; - int flush_flags = ieap->iea_flush_flag; - ip_stack_t *ipst = ieap->iea_ipst; - - if ((flush_flags & FLUSH_REDIRECT_TIME) && - (ire->ire_flags & RTF_DYNAMIC)) { - /* Make sure we delete the corresponding IRE_CACHE */ - ip1dbg(("ire_expire: all redirects\n")); - ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); - ire_delete(ire); - atomic_dec_32(&ipst->ips_ip_redirect_cnt); - return; - } - if (ire->ire_type != IRE_CACHE) - return; - - if (flush_flags & FLUSH_ARP_TIME) { - /* - * Remove all IRE_CACHE except IPv4 multicast ires. These - * ires will be deleted by ip_trash_ire_reclaim_stack() - * when system runs low in memory. - * Verify that create time is more than ip_ire_arp_interval - * milliseconds ago. - */ - - if (!(ire->ire_ipversion == IPV4_VERSION && - CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) { - ire_delete(ire); - return; - } - } - - if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && - (ire->ire_ipif != NULL)) { - /* Increase pmtu if it is less than the interface mtu */ - mutex_enter(&ire->ire_lock); - /* - * If the ipif is a vni (whose mtu is 0, since it's virtual) - * get the mtu from the sending interfaces' ipif - */ - if (IS_VNI(ire->ire_ipif->ipif_ill)) { - stq_ill = ire->ire_stq->q_ptr; - ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, - IP_MAXPACKET); - } else { - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, - IP_MAXPACKET); - } - ire->ire_marks &= ~IRE_MARK_PMTU; - ire->ire_frag_flag |= IPH_DF; - mutex_exit(&ire->ire_lock); - } -} - -/* - * Return any local address. We use this to target ourselves - * when the src address was specified as 'default'. - * Preference for IRE_LOCAL entries. + * This looks up an IRE_BROADCAST based on the arguments. + * Mirrors ire_create_bcast(). 
*/ ire_t * -ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) +ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) { - ire_t *ire; - irb_t *irb; - ire_t *maybe = NULL; - int i; + ire_t *ire; + int match_args; - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if (irb->irb_ire == NULL) - continue; - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_marks & IRE_MARK_CONDEMNED) || - (ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES)) - continue; - switch (ire->ire_type) { - case IRE_LOOPBACK: - if (maybe == NULL) { - IRE_REFHOLD(ire); - maybe = ire; - } - break; - case IRE_LOCAL: - if (maybe != NULL) { - ire_refrele(maybe); - } - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - return (ire); - } - } - rw_exit(&irb->irb_lock); - } - return (maybe); -} + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW | + MATCH_IRE_MASK | MATCH_IRE_ZONEONLY; -/* - * If the specified IRE is associated with a particular ILL, return - * that ILL pointer (May be called as writer.). - * - * NOTE : This is not a generic function that can be used always. - * This function always returns the ill of the outgoing packets - * if this ire is used. - */ -ill_t * -ire_to_ill(const ire_t *ire) -{ - ill_t *ill = NULL; + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; - /* - * 1) For an IRE_CACHE, ire_ipif is the one where it obtained - * the source address from. ire_stq is the one where the - * packets will be sent out on. We return that here. - * - * 2) IRE_BROADCAST normally has a loopback and a non-loopback - * copy and they always exist next to each other with loopback - * copy being the first one. If we are called on the non-loopback - * copy, return the one pointed by ire_stq. If it was called on - * a loopback copy, we still return the one pointed by the next - * ire's ire_stq pointer i.e the one pointed by the non-loopback - * copy. 
We don't want use ire_ipif as it might represent the - * source address (if we borrow source addresses for - * IRE_BROADCASTS in the future). - * However if an interface is currently coming up, the above - * condition may not hold during that period since the ires - * are added one at a time. Thus one of the pair could have been - * added and the other not yet added. - * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill. - * 4) For all others return the ones pointed by ire_ipif->ipif_ill. - * That handles IRE_LOOPBACK. - */ - - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - } else if (ire->ire_type == IRE_BROADCAST) { - if (ire->ire_stq != NULL) { - ill = (ill_t *)ire->ire_stq->q_ptr; - } else { - ire_t *ire_next; - - ire_next = ire->ire_next; - if (ire_next != NULL && - ire_next->ire_type == IRE_BROADCAST && - ire_next->ire_addr == ire->ire_addr && - ire_next->ire_ipif == ire->ire_ipif) { - ill = (ill_t *)ire_next->ire_stq->q_ptr; - } - } - } else if (ire->ire_rfq != NULL) { - ill = ire->ire_rfq->q_ptr; - } else if (ire->ire_ipif != NULL) { - ill = ire->ire_ipif->ipif_ill; - } - return (ill); + ire = ire_ftable_lookup_v4( + addr, /* dest addr */ + ip_g_all_ones, /* mask */ + 0, /* no gateway */ + IRE_BROADCAST, + ill, + zoneid, + NULL, + match_args, + 0, + ill->ill_ipst, + NULL); + return (ire); } /* Arrange to call the specified function for every IRE in the world. 
*/ @@ -1992,15 +810,13 @@ ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, */ ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 0, NULL, - ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, NULL, zoneid, ipst); } if (vers != IPV4_VERSION) { ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, ipst->ips_ip_forwarding_table_v6, - ipst->ips_ip6_cache_table_size, - ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst); + NULL, zoneid, ipst); } } @@ -2016,22 +832,6 @@ ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); } -void -ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, - ill_t *ill) -{ - ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, - ill); -} - -void -ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, - ill_t *ill) -{ - ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, - ill); -} - /* * Walk a particular ill and version. */ @@ -2043,137 +843,121 @@ ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, if (vers == IPV4_VERSION) { ire_walk_ill_tables(match_flags, ire_type, func, arg, - IP_MASK_TABLE_SIZE, 0, - NULL, ipst->ips_ip_cache_table_size, - ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); - } else if (vers == IPV6_VERSION) { + IP_MASK_TABLE_SIZE, + 0, NULL, + ill, ALL_ZONES, ipst); + } + if (vers != IPV4_VERSION) { ire_walk_ill_tables(match_flags, ire_type, func, arg, IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, ipst->ips_ip_forwarding_table_v6, - ipst->ips_ip6_cache_table_size, - ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); + ill, ALL_ZONES, ipst); } } +/* + * Do the specific matching of IREs to shared-IP zones. + * + * We have the same logic as in ire_match_args but implemented slightly + * differently. 
+ */ boolean_t ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) { - ill_t *ire_stq_ill = NULL; - ill_t *ire_ipif_ill = NULL; + ill_t *dst_ill = NULL; ASSERT(match_flags != 0 || zoneid != ALL_ZONES); - /* - * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and - * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and - * ire_ipif be pointing to different ills. But we want to keep - * this function generic enough for future use. So, we always - * try to match on both. The only caller of this function - * ire_walk_ill_tables, will call "func" after we return from - * this function. We expect "func" to do the right filtering - * of ires in this case. - */ if (match_flags & MATCH_IRE_ILL) { - if (ire->ire_stq != NULL) - ire_stq_ill = ire->ire_stq->q_ptr; - if (ire->ire_ipif != NULL) - ire_ipif_ill = ire->ire_ipif->ipif_ill; + dst_ill = ire->ire_ill; } - if (zoneid != ALL_ZONES) { + if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && + ire->ire_zoneid != ALL_ZONES) { /* * We're walking the IREs for a specific zone. The only relevant * IREs are: * - all IREs with a matching ire_zoneid - * - all IRE_OFFSUBNETs as they're shared across all zones - * - IRE_INTERFACE IREs for interfaces with a usable source addr + * - IRE_IF_ALL IREs for interfaces with a usable source addr * with a matching zone - * - IRE_DEFAULTs with a gateway reachable from the zone - * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs - * using the same rule; but the above rules are consistent with - * the behavior of ire_ftable_lookup[_v6]() so that all the - * routes that can be matched during lookup are also matched - * here. + * - IRE_OFFLINK with a gateway reachable from the zone + * Note that ealier we only did the IRE_OFFLINK check for + * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs). 
*/ - if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { + dst_ill = ire->ire_ill; + + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; - if (ire->ire_ipversion == IPV4_VERSION) { - if (!ipif_usesrc_avail(ire_stq_ill, - zoneid)) - /* No usable src addr in zone */ - return (B_FALSE); - } else if (ire_stq_ill->ill_usesrc_ifindex - != 0) { - /* - * For IPv6 use ipif_select_source_v6() - * so the right scope selection is done - */ - ipif_t *src_ipif; - src_ipif = - ipif_select_source_v6(ire_stq_ill, - &ire->ire_addr_v6, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, - zoneid); - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - } else { - return (B_FALSE); - } - } else { - return (B_FALSE); - } + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); - } else if (!(ire->ire_type & IRE_OFFSUBNET)) { + /* + * If there is a usable source address in the + * zone, then it's ok to return an + * IRE_INTERFACE + */ + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + return (B_FALSE); + } + } + + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { + ipif_t *tipif; + + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; + tipif != NULL; tipif = tipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(tipif) && + (tipif->ipif_flags & IPIF_UP) && + (tipif->ipif_zoneid == zoneid || + tipif->ipif_zoneid == ALL_ZONES)) + break; + } + mutex_exit(&dst_ill->ill_lock); + if (tipif == NULL) { return (B_FALSE); } } /* - * Match all default routes from the global zone, irrespective + * Match all offlink routes from the global zone, irrespective * of reachability. 
For a non-global zone only match those - * where ire_gateway_addr has a IRE_INTERFACE for the zoneid. + * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. */ - if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) { - int ire_match_flags = 0; + if ((ire->ire_type & IRE_OFFLINK) && zoneid != GLOBAL_ZONEID && + zoneid != ALL_ZONES) { in6_addr_t gw_addr_v6; - ire_t *rire; - - ire_match_flags |= MATCH_IRE_TYPE; - if (ire->ire_ipif != NULL) - ire_match_flags |= MATCH_IRE_ILL; if (ire->ire_ipversion == IPV4_VERSION) { - rire = ire_route_lookup(ire->ire_gateway_addr, - 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, - zoneid, NULL, ire_match_flags, ipst); + if (!ire_gateway_ok_zone_v4( + ire->ire_gateway_addr, zoneid, + dst_ill, NULL, ipst, B_FALSE)) + return (B_FALSE); } else { ASSERT(ire->ire_ipversion == IPV6_VERSION); mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - rire = ire_route_lookup_v6(&gw_addr_v6, - NULL, NULL, IRE_INTERFACE, ire->ire_ipif, - NULL, zoneid, NULL, ire_match_flags, ipst); - } - if (rire == NULL) { - return (B_FALSE); + + if (!ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid, + dst_ill, NULL, ipst, B_FALSE)) + return (B_FALSE); } - ire_refrele(rire); } } if (((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & ire_type)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_stq_ill == ill || ire_ipif_ill == ill || - ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { + (dst_ill == ill || + dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) { return (B_TRUE); } return (B_FALSE); @@ -2197,8 +981,9 @@ rtfunc(struct radix_node *rn, void *arg) ret = ire_walk_ill_match(rtf->rt_match_flags, rtf->rt_ire_type, ire, rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); - } else + } else { ret = B_TRUE; + } if (ret) (*rtf->rt_func)(ire, rtf->rt_arg); } @@ -2206,12 +991,12 @@ rtfunc(struct radix_node *rn, void *arg) } /* - * Walk the ftable and the ctable entries that match the ill. 
+ * Walk the ftable entries that match the ill. */ void ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, - size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, + ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) { irb_t *irb_ptr; @@ -2223,85 +1008,50 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); - /* - * Optimize by not looking at the forwarding table if there - * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE - * specified in ire_type. - */ - if (!(match_flags & MATCH_IRE_TYPE) || - ((ire_type & IRE_FORWARDTABLE) != 0)) { - /* knobs such that routine is called only for v6 case */ - if (ipftbl == ipst->ips_ip_forwarding_table_v6) { - for (i = (ftbl_sz - 1); i >= 0; i--) { - if ((irb_ptr = ipftbl[i]) == NULL) + + /* knobs such that routine is called only for v6 case */ + if (ipftbl == ipst->ips_ip_forwarding_table_v6) { + for (i = (ftbl_sz - 1); i >= 0; i--) { + if ((irb_ptr = ipftbl[i]) == NULL) + continue; + for (j = 0; j < htbl_sz; j++) { + irb = &irb_ptr[j]; + if (irb->irb_ire == NULL) continue; - for (j = 0; j < htbl_sz; j++) { - irb = &irb_ptr[j]; - if (irb->irb_ire == NULL) - continue; - - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (match_flags == 0 && - zoneid == ALL_ZONES) { - ret = B_TRUE; - } else { - ret = - ire_walk_ill_match( - match_flags, - ire_type, ire, ill, - zoneid, ipst); - } - if (ret) - (*func)(ire, arg); + + irb_refhold(irb); + for (ire = irb->irb_ire; ire != NULL; + ire = ire->ire_next) { + if (match_flags == 0 && + zoneid == ALL_ZONES) { + ret = B_TRUE; + } else { + ret = + ire_walk_ill_match( + match_flags, + ire_type, ire, ill, + zoneid, ipst); } - IRB_REFRELE(irb); + if (ret) + (*func)(ire, arg); } + irb_refrele(irb); } - } else { - (void) memset(&rtfarg, 0, 
sizeof (rtfarg)); - rtfarg.rt_func = func; - rtfarg.rt_arg = arg; - if (match_flags != 0) { - rtfarg.rt_match_flags = match_flags; - } - rtfarg.rt_ire_type = ire_type; - rtfarg.rt_ill = ill; - rtfarg.rt_zoneid = zoneid; - rtfarg.rt_ipst = ipst; /* No netstack_hold */ - (void) ipst->ips_ip_ftable->rnh_walktree_mt( - ipst->ips_ip_ftable, - rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } - } - - /* - * Optimize by not looking at the cache table if there - * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE - * specified in ire_type. - */ - if (!(match_flags & MATCH_IRE_TYPE) || - ((ire_type & IRE_CACHETABLE) != 0)) { - for (i = 0; i < ctbl_sz; i++) { - irb = &ipctbl[i]; - if (irb->irb_ire == NULL) - continue; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (match_flags == 0 && zoneid == ALL_ZONES) { - ret = B_TRUE; - } else { - ret = ire_walk_ill_match( - match_flags, ire_type, - ire, ill, zoneid, ipst); - } - if (ret) - (*func)(ire, arg); - } - IRB_REFRELE(irb); + } else { + (void) memset(&rtfarg, 0, sizeof (rtfarg)); + rtfarg.rt_func = func; + rtfarg.rt_arg = arg; + if (match_flags != 0) { + rtfarg.rt_match_flags = match_flags; } + rtfarg.rt_ire_type = ire_type; + rtfarg.rt_ill = ill; + rtfarg.rt_zoneid = zoneid; + rtfarg.rt_ipst = ipst; /* No netstack_hold */ + (void) ipst->ips_ip_ftable->rnh_walktree_mt( + ipst->ips_ip_ftable, + rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } } @@ -2323,557 +1073,178 @@ ip_mask_to_plen(ipaddr_t mask) ipaddr_t ip_plen_to_mask(uint_t masklen) { + if (masklen == 0) + return (0); + return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); } void ire_atomic_end(irb_t *irb_ptr, ire_t *ire) { - ill_t *stq_ill, *ipif_ill; - ip_stack_t *ipst = ire->ire_ipst; + ill_t *ill; - stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; - ipif_ill = ire->ire_ipif != NULL ? 
ire->ire_ipif->ipif_ill : NULL; - RELEASE_ILL_LOCKS(ipif_ill, stq_ill); + ill = ire->ire_ill; + if (ill != NULL) + mutex_exit(&ill->ill_lock); rw_exit(&irb_ptr->irb_lock); - rw_exit(&ipst->ips_ill_g_usesrc_lock); } /* - * ire_add_v[46] atomically make sure that the ipif or ill associated - * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING - * before adding the ire to the table. This ensures that we don't create - * new IRE_CACHEs with stale values for parameters that are passed to - * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer - * to the ipif_mtu, and not the value. The actual value is derived from the - * parent ire or ipif under the bucket lock. + * ire_add_v[46] atomically make sure that the ill associated + * with the new ire is not going away i.e., we check ILL_CONDEMNED. */ int -ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, - ipsq_func_t func) +ire_atomic_start(irb_t *irb_ptr, ire_t *ire) { - ill_t *stq_ill; - ill_t *ipif_ill; - int error = 0; - ill_t *ill = NULL; - ip_stack_t *ipst = ire->ire_ipst; + ill_t *ill; - stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; - ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; + ill = ire->ire_ill; - ASSERT((q != NULL && mp != NULL && func != NULL) || - (q == NULL && mp == NULL && func == NULL)); - rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); - GRAB_CONN_LOCK(q); rw_enter(&irb_ptr->irb_lock, RW_WRITER); - GRAB_ILL_LOCKS(ipif_ill, stq_ill); + if (ill != NULL) { + mutex_enter(&ill->ill_lock); - /* - * While the IRE is in the process of being added, a user may have - * invoked the ifconfig usesrc option on the stq_ill to make it a - * usesrc client ILL. Check for this possibility here, if it is true - * then we fail adding the IRE_CACHE. Another check is to make sure - * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc - * group. 
The ill_g_usesrc_lock is released in ire_atomic_end - */ - if ((ire->ire_type & IRE_CACHE) && - (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { - if (stq_ill->ill_usesrc_ifindex != 0) { - ASSERT(stq_ill->ill_usesrc_grp_next != NULL); - if ((ipif_ill->ill_phyint->phyint_ifindex != - stq_ill->ill_usesrc_ifindex) || - (ipif_ill->ill_usesrc_grp_next == NULL) || - (ipif_ill->ill_usesrc_ifindex != 0)) { - error = EINVAL; - goto done; - } - } else if (ipif_ill->ill_usesrc_grp_next != NULL) { - error = EINVAL; - goto done; + /* + * Don't allow IRE's to be created on dying ills. + */ + if (ill->ill_state_flags & ILL_CONDEMNED) { + ire_atomic_end(irb_ptr, ire); + return (ENXIO); } - } - /* - * Don't allow IRE's to be created on changing ill's. Also, since - * IPMP flags can be set on an ill without quiescing it, if we're not - * a writer on stq_ill, check that the flags still allow IRE creation. - */ - if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { - if (stq_ill->ill_state_flags & ILL_CHANGING) { - ill = stq_ill; - error = EAGAIN; - } else if (IS_UNDER_IPMP(stq_ill)) { - mutex_enter(&stq_ill->ill_phyint->phyint_lock); - if (!ipmp_ill_is_active(stq_ill) && - !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { + if (IS_UNDER_IPMP(ill)) { + int error = 0; + mutex_enter(&ill->ill_phyint->phyint_lock); + if (!ipmp_ill_is_active(ill) && + IRE_HIDDEN_TYPE(ire->ire_type) && + !ire->ire_testhidden) { error = EINVAL; } - mutex_exit(&stq_ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_phyint->phyint_lock); + if (error != 0) { + ire_atomic_end(irb_ptr, ire); + return (error); + } } - if (error != 0) - goto done; - } - if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && - (ipif_ill->ill_state_flags & ILL_CHANGING)) { - ill = ipif_ill; - error = EAGAIN; - goto done; } - - if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && - (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { - ill = ire->ire_ipif->ipif_ill; - ASSERT(ill != NULL); - error = EAGAIN; - goto done; - } - 
-done: - if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { - ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - ire_atomic_end(irb_ptr, ire); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - error = EINPROGRESS; - } else if (error != 0) { - ire_atomic_end(irb_ptr, ire); - } - - RELEASE_CONN_LOCK(q); - return (error); + return (0); } /* - * Add a fully initialized IRE to an appropriate table based on - * ire_type. - * - * allow_unresolved == B_FALSE indicates a legacy code-path call - * that has prohibited the addition of incomplete ire's. If this - * parameter is set, and we find an nce that is in a state other - * than ND_REACHABLE, we fail the add. Note that nce_state could be - * something other than ND_REACHABLE if the nce had just expired and - * the ire_create preceding the ire_add added a new ND_INITIAL nce. + * Add a fully initialized IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. 
*/ -int -ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, - boolean_t allow_unresolved) +ire_t * +ire_add(ire_t *ire) { - ire_t *ire1; - ill_t *stq_ill = NULL; - ill_t *ill; - ipif_t *ipif = NULL; - ill_walk_context_t ctx; - ire_t *ire = *irep; - int error; - boolean_t ire_is_mblk = B_FALSE; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - ip_stack_t *ipst = ire->ire_ipst; - - /* get ready for the day when original ire is not created as mblk */ - if (ire->ire_mp != NULL) { - ire_is_mblk = B_TRUE; - /* Copy the ire to a kmem_alloc'ed area */ - ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (ire1 == NULL) { - ip1dbg(("ire_add: alloc failed\n")); - ire_delete(ire); - *irep = NULL; - return (ENOMEM); - } - ire->ire_marks &= ~IRE_MARK_UNCACHED; - *ire1 = *ire; - ire1->ire_mp = NULL; - ire1->ire_stq_ifindex = 0; - freeb(ire->ire_mp); - ire = ire1; - } - if (ire->ire_stq != NULL) - stq_ill = ire->ire_stq->q_ptr; - - if (stq_ill != NULL && ire->ire_type == IRE_CACHE && - stq_ill->ill_net_type == IRE_IF_RESOLVER) { - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - continue; - } - /* - * We need to make sure that the ipif is a valid one - * before adding the IRE_CACHE. This happens only - * with IRE_CACHE when there is an external resolver. - * - * We can unplumb a logical interface while the - * packet is waiting in ARP with the IRE. Then, - * later on when we feed the IRE back, the ipif - * has to be re-checked. This can't happen with - * NDP currently, as we never queue the IRE with - * the packet. We always try to recreate the IRE - * when the resolution is completed. But, we do - * it for IPv6 also here so that in future if - * we have external resolvers, it will work without - * any change. 
- */ - ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); - if (ipif != NULL) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - break; - } - mutex_exit(&ill->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - if (ipif == NULL || - (ipif->ipif_isv6 && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && - !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6src_addr)) || - (!ipif->ipif_isv6 && - ire->ire_src_addr != ipif->ipif_src_addr) || - ire->ire_zoneid != ipif->ipif_zoneid) { - if (ipif != NULL) - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } - - ASSERT(ill != NULL); - + if (IRE_HIDDEN_TYPE(ire->ire_type) && + ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) { /* - * Since we didn't attach label security attributes to the - * ire for the resolver case, we need to add it now. (only - * for v4 resolver and v6 xresolv case). + * IREs hosted on interfaces that are under IPMP + * should be hidden so that applications don't + * accidentally end up sending packets with test + * addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. + * Hide them here. */ - if (is_system_labeled() && ire_is_mblk) { - if (ire->ire_ipversion == IPV4_VERSION) { - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != - INADDR_ANY ? ire->ire_gateway_addr : - ire->ire_addr, &ga.ga_addr); - } else { - ga.ga_af = AF_INET6; - ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( - &ire->ire_gateway_addr_v6) ? 
- ire->ire_addr_v6 : - ire->ire_gateway_addr_v6; - } - gcgrp = gcgrp_lookup(&ga, B_FALSE); - error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, - NULL, gcgrp); - if (error != 0) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (error); - } - } + ire->ire_testhidden = B_TRUE; } - /* - * In case ire was changed - */ - *irep = ire; if (ire->ire_ipversion == IPV6_VERSION) - error = ire_add_v6(irep, q, mp, func); + return (ire_add_v6(ire)); else - error = ire_add_v4(irep, q, mp, func, allow_unresolved); - if (ipif != NULL) - ipif_refrele(ipif); - return (error); + return (ire_add_v4(ire)); } /* - * Add an initialized IRE to an appropriate table based on ire_type. - * - * The forward table contains IRE_PREFIX/IRE_HOST and - * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. - * - * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK - * and IRE_CACHE. - * - * NOTE : This function is called as writer though not required - * by this function. + * Add a fully initialized IPv4 IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. 
*/ -static int -ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, - boolean_t allow_unresolved) +static ire_t * +ire_add_v4(ire_t *ire) { ire_t *ire1; irb_t *irb_ptr; ire_t **irep; - int flags; - ire_t *pire = NULL; - ill_t *stq_ill; - ire_t *ire = *ire_p; + int match_flags; int error; - boolean_t need_refrele = B_FALSE; - nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; - uint_t marks = 0; - /* - * IREs with source addresses hosted on interfaces that are under IPMP - * should be hidden so that applications don't accidentally end up - * sending packets with test addresses as their source addresses, or - * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. - */ - if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) - marks |= IRE_MARK_TESTHIDDEN; - - if (ire->ire_ipif != NULL) - ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); - if (ire->ire_stq != NULL) - ASSERT(!MUTEX_HELD( - &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); + if (ire->ire_ill != NULL) + ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ - - /* Find the appropriate list head. 
*/ - switch (ire->ire_type) { - case IRE_HOST: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; - case IRE_CACHE: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - ire->ire_marks |= marks; - break; - case IRE_BROADCAST: - case IRE_LOCAL: - case IRE_LOOPBACK: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - break; - case IRE_PREFIX: - case IRE_DEFAULT: - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: - ire->ire_marks |= marks; - break; - default: - ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", - (void *)ire, ire->ire_type)); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } /* Make sure the address is properly masked. */ ire->ire_addr &= ire->ire_mask; - /* - * ip_newroute/ip_newroute_multi are unable to prevent the deletion - * of the interface route while adding an IRE_CACHE for an on-link - * destination in the IRE_IF_RESOLVER case, since the ire has to - * go to ARP and return. We can't do a REFHOLD on the - * associated interface ire for fear of ARP freeing the message. - * Here we look up the interface ire in the forwarding table and - * make sure that the interface route has not been deleted. - */ - if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && - ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { - - ASSERT(ire->ire_max_fragp == NULL); - if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { - /* - * The ihandle that we used in ip_newroute_multi - * comes from the interface route corresponding - * to ire_ipif. Lookup here to see if it exists - * still. 
- * If the ire has a source address assigned using - * RTF_SETSRC, ire_ipif is the logical interface holding - * this source address, so we can't use it to check for - * the existence of the interface route. Instead we rely - * on the brute force ihandle search in - * ire_ihandle_lookup_onlink() below. - */ - pire = ipif_to_ire(ire->ire_ipif); - if (pire == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } else if (pire->ire_ihandle != ire->ire_ihandle) { - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } else { - pire = ire_ihandle_lookup_onlink(ire); - if (pire == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } - /* Prevent pire from getting deleted */ - IRB_REFHOLD(pire->ire_bucket); - /* Has it been removed already ? */ - if (pire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } else { - ASSERT(ire->ire_max_fragp != NULL); - } - flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); - if (ire->ire_ipif != NULL) { - /* - * We use MATCH_IRE_IPIF while adding IRE_CACHES only - * for historic reasons and to maintain symmetry with - * IPv6 code path. Historically this was used by - * multicast code to create multiple IRE_CACHES on - * a single ill with different ipifs. This was used - * so that multicast packets leaving the node had the - * right source address. This is no longer needed as - * ip_wput initializes the address correctly. - */ - flags |= MATCH_IRE_IPIF; - /* - * If we are creating a hidden IRE, make sure we search for - * hidden IREs when searching for duplicates below. - * Otherwise, we might find an IRE on some other interface - * that's not marked hidden. 
- */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) - flags |= MATCH_IRE_MARK_TESTHIDDEN; + if (ire->ire_ill != NULL) { + match_flags |= MATCH_IRE_ILL; } - if ((ire->ire_type & IRE_CACHETABLE) == 0) { - irb_ptr = ire_get_bucket(ire); - need_refrele = B_TRUE; - if (irb_ptr == NULL) { - /* - * This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - return (EINVAL); - } - } else { - irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( - ire->ire_addr, ipst->ips_ip_cache_table_size)]); + irb_ptr = ire_get_bucket(ire); + if (irb_ptr == NULL) { + printf("no bucket for %p\n", (void *)ire); + ire_delete(ire); + return (NULL); } /* - * Start the atomic add of the ire. Grab the ill locks, - * ill_g_usesrc_lock and the bucket lock. Check for condemned - * - * If ipif or ill is changing ire_atomic_start() may queue the - * request and return EINPROGRESS. - * To avoid lock order problems, get the ndp4->ndp_g_lock. + * Start the atomic add of the ire. Grab the ill lock, + * the bucket lock. Check for condemned. */ - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - error = ire_atomic_start(irb_ptr, ire, q, mp, func); + error = ire_atomic_start(irb_ptr, ire); if (error != 0) { - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - /* - * We don't know whether it is a valid ipif or not. - * So, set it to NULL. This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; + printf("no ire_atomic_start for %p\n", (void *)ire); ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (error); + irb_refrele(irb_ptr); + return (NULL); } /* - * To avoid creating ires having stale values for the ire_max_frag - * we get the latest value atomically here. 
For more details - * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE - * in ip_rput_dlpi_writer + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. */ - if (ire->ire_max_fragp == NULL) { - if (CLASSD(ire->ire_addr)) - ire->ire_max_frag = ire->ire_ipif->ipif_mtu; - else - ire->ire_max_frag = pire->ire_max_frag; - } else { - uint_t max_frag; + if (ire->ire_testhidden) + match_flags |= MATCH_IRE_TESTHIDDEN; - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - } /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { - if (ire1->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire1)) continue; - if (ire->ire_ipif != NULL) { - /* - * We do MATCH_IRE_ILL implicitly here for IREs - * with a non-null ire_ipif, including IRE_CACHEs. - * As ire_ipif and ire_stq could point to two - * different ills, we can't pass just ire_ipif to - * ire_match_args and get a match on both ills. - * This is just needed for duplicate checks here and - * so we don't add an extra argument to - * ire_match_args for this. Do it locally. - * - * NOTE : Currently there is no part of the code - * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL - * match for IRE_CACHEs. Thus we don't want to - * extend the arguments to ire_match_args. - */ - if (ire1->ire_stq != ire->ire_stq) - continue; - /* - * Multiroute IRE_CACHEs for a given destination can - * have the same ire_ipif, typically if their source - * address is forced using RTF_SETSRC, and the same - * send-to queue. We differentiate them using the parent - * handle. 
- */ - if (ire->ire_type == IRE_CACHE && - (ire1->ire_flags & RTF_MULTIRT) && - (ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_phandle != ire->ire_phandle)) - continue; - } + /* + * Here we need an exact match on zoneid, i.e., + * ire_match_args doesn't fit. + */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; + + if (ire1->ire_type != ire->ire_type) + continue; + + /* + * Note: We do not allow multiple routes that differ only + * in the gateway security attributes; such routes are + * considered duplicates. + * To change that we explicitly have to treat them as + * different here. + */ if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, - ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif, - ire->ire_zoneid, 0, NULL, flags, NULL)) { + ire->ire_gateway_addr, ire->ire_type, ire->ire_ill, + ire->ire_zoneid, NULL, match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE @@ -2881,149 +1252,36 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. */ - ip1dbg(("found dup ire existing %p new %p\n", - (void *)ire1, (void *)ire)); - IRE_REFHOLD(ire1); + atomic_add_32(&ire1->ire_identical_ref, 1); + DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, + ire_t *, ire); + ire_refhold(ire1); ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is not removed from the - * list yet. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = ire1; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (0); + irb_refrele(irb_ptr); + return (ire1); } } - if (ire->ire_type & IRE_CACHE) { - ASSERT(ire->ire_stq != NULL); - nce = ndp_lookup_v4(ire_to_ill(ire), - ((ire->ire_gateway_addr != INADDR_ANY) ? 
- &ire->ire_gateway_addr : &ire->ire_addr), - B_TRUE); - if (nce != NULL) - mutex_enter(&nce->nce_lock); - /* - * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE - * and the caller has prohibited the addition of incomplete - * ire's, we fail the add. Note that nce_state could be - * something other than ND_REACHABLE if the nce had - * just expired and the ire_create preceding the - * ire_add added a new ND_INITIAL nce. - */ - if ((nce == NULL) || - (nce->nce_flags & NCE_F_CONDEMNED) || - (!allow_unresolved && - (nce->nce_state != ND_REACHABLE))) { - if (nce != NULL) { - DTRACE_PROBE1(ire__bad__nce, nce_t *, nce); - mutex_exit(&nce->nce_lock); - } - ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - if (nce != NULL) - NCE_REFRELE(nce); - DTRACE_PROBE1(ire__no__nce, ire_t *, ire); - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (EINVAL); - } else { - ire->ire_nce = nce; - mutex_exit(&nce->nce_lock); - /* - * We are associating this nce to the ire, so - * change the nce ref taken in ndp_lookup_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } - } /* - * Make it easy for ip_wput_ire() to hit multiple broadcast ires by - * grouping identical addresses together on the hash chain. We do - * this only for IRE_BROADCASTs as ip_wput_ire is currently interested - * in such groupings only for broadcasts. - * - * Find the first entry that matches ire_addr. *irep will be null - * if no match. - * - * Note: the loopback and non-loopback broadcast entries for an - * interface MUST be added before any MULTIRT entries. + * Normally we do head insertion since most things do not care about + * the order of the IREs in the bucket. 
Note that ip_cgtp_bcast_add + * assumes we at least do head insertion so that its IRE_BROADCAST + * arrive ahead of existing IRE_HOST for the same address. + * However, due to shared-IP zones (and restrict_interzone_loopback) + * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same + * address. For that reason we do tail insertion for IRE_IF_CLONE. + * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket, + * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT + * set. */ irep = (ire_t **)irb_ptr; - while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) - irep = &ire1->ire_next; - if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { - /* - * We found some ire (i.e *irep) with a matching addr. We - * want to group ires with same addr. - */ - for (;;) { - ire1 = *irep; - if ((ire1->ire_next == NULL) || - (ire1->ire_next->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST) || - (ire1->ire_flags & RTF_MULTIRT) || - (ire1->ire_ipif->ipif_ill->ill_grp == - ire->ire_ipif->ipif_ill->ill_grp)) - break; - irep = &ire1->ire_next; - } - ASSERT(*irep != NULL); - /* - * The ire will be added before *irep, so - * if irep is a MULTIRT ire, just break to - * ire insertion code. - */ - if (((*irep)->ire_flags & RTF_MULTIRT) != 0) - goto insert_ire; - - irep = &((*irep)->ire_next); - - /* - * Either we have hit the end of the list or the address - * did not match. - */ - while (*irep != NULL) { - ire1 = *irep; - if ((ire1->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST)) - break; - if (ire1->ire_ipif == ire->ire_ipif) { - irep = &ire1->ire_next; - break; - } - irep = &ire1->ire_next; - } - } else if (*irep != NULL) { - /* - * Find the last ire which matches ire_addr. - * Needed to do tail insertion among entries with the same - * ire_addr. 
- */ - while (ire->ire_addr == ire1->ire_addr) { + if ((ire->ire_type & IRE_IF_CLONE) || + ((ire->ire_type & IRE_BROADCAST) && + !(ire->ire_flags & RTF_MULTIRT))) { + while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; - ire1 = *irep; - if (ire1 == NULL) - break; - } } - -insert_ire: /* Insert at *irep */ ire1 = *irep; if (ire1 != NULL) @@ -3058,82 +1316,31 @@ insert_ire: * in the list for the first time and no one else can bump * up the reference count on this yet. */ - IRE_REFHOLD_LOCKED(ire); + ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); irb_ptr->irb_ire_cnt++; - if (irb_ptr->irb_marks & IRB_MARK_FTABLE) + if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC) irb_ptr->irb_nire++; - if (ire->ire_marks & IRE_MARK_TEMPORARY) - irb_ptr->irb_tmp_ire_cnt++; - - if (ire->ire_ipif != NULL) { - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, - (char *), "ire", (void *), ire); - ire->ire_ipif->ipif_ire_cnt++; - if (ire->ire_stq != NULL) { - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt++; - } - } else { - ASSERT(ire->ire_stq == NULL); + if (ire->ire_ill != NULL) { + ire->ire_ill->ill_ire_cnt++; + ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - if (pire != NULL) { - /* Assert that it is not removed from the list yet */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } + /* Make any caching of the IREs be notified or updated */ + ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - if (ire->ire_type != IRE_CACHE) { - /* - * For ire's with host mask see if there is an entry - * in the cache. If there is one flush the whole cache as - * there might be multiple entries due to RTF_MULTIRT (CGTP). - * If no entry is found than there is no need to flush the - * cache. 
- */ - if (ire->ire_mask == IP_HOST_MASK) { - ire_t *lire; - lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (lire != NULL) { - ire_refrele(lire); - ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - } - } else { - ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - } - } - /* - * We had to delay the fast path probe until the ire is inserted - * in the list. Otherwise the fast path ack won't find the ire in - * the table. - */ - if (ire->ire_type == IRE_CACHE || - (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) { - ASSERT(ire->ire_nce != NULL); - if (ire->ire_nce->nce_state == ND_REACHABLE) - nce_fastpath(ire->ire_nce); - } - if (ire->ire_ipif != NULL) - ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); - *ire_p = ire; - if (need_refrele) { - IRB_REFRELE(irb_ptr); - } - return (0); + if (ire->ire_ill != NULL) + ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); + irb_refrele(irb_ptr); + return (ire); } /* - * IRB_REFRELE is the only caller of the function. ire_unlink calls to + * irb_refrele is the only caller of the function. ire_unlink calls to * do the final cleanup for this ire. */ void @@ -3162,13 +1369,13 @@ ire_cleanup(ire_t *ire) * so. */ ire->ire_next = NULL; - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); ire = ire_next; } } /* - * IRB_REFRELE is the only caller of the function. It calls to unlink + * irb_refrele is the only caller of the function. It calls to unlink * all the CONDEMNED ires from this bucket. 
*/ ire_t * @@ -3180,16 +1387,14 @@ ire_unlink(irb_t *irb) ire_t *ire_list = NULL; ASSERT(RW_WRITE_HELD(&irb->irb_lock)); - ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) || + ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) || (irb->irb_refcnt == 0)); ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); ASSERT(irb->irb_ire != NULL); for (ire = irb->irb_ire; ire != NULL; ire = ire1) { - ip_stack_t *ipst = ire->ire_ipst; - ire1 = ire->ire_next; - if (ire->ire_marks & IRE_MARK_CONDEMNED) { + if (IRE_IS_CONDEMNED(ire)) { ptpn = ire->ire_ptpn; ire1 = ire->ire_next; if (ire1) @@ -3197,22 +1402,10 @@ ire_unlink(irb_t *irb) *ptpn = ire1; ire->ire_ptpn = NULL; ire->ire_next = NULL; - if (ire->ire_type == IRE_DEFAULT) { - /* - * IRE is out of the list. We need to adjust - * the accounting before the caller drops - * the lock. - */ - if (ire->ire_ipversion == IPV6_VERSION) { - ASSERT(ipst-> - ips_ipv6_ire_default_count != - 0); - ipst->ips_ipv6_ire_default_count--; - } - } + /* - * We need to call ire_delete_v4 or ire_delete_v6 - * to clean up the cache or the redirects pointing at + * We need to call ire_delete_v4 or ire_delete_v6 to + * clean up dependents and the redirects pointing at * the default gateway. We need to drop the lock * as ire_flush_cache/ire_delete_host_redircts require * so. But we can't drop the lock, as ire_unlink needs @@ -3230,76 +1423,7 @@ ire_unlink(irb_t *irb) } /* - * Delete all the cache entries with this 'addr'. When IP gets a gratuitous - * ARP message on any of its interface queue, it scans the nce table and - * deletes and calls ndp_delete() for the appropriate nce. This action - * also deletes all the neighbor/ire cache entries for that address. - * This function is called from ip_arp_news in ip.c and also for - * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns - * true if it finds a nce entry which is used by ip_arp_news to determine if - * it needs to do an ire_walk_v4. 
The return value is also used for the - * same purpose by ARP IOCTL processing * in ip_if.c when deleting - * ARP entries. For SIOC*IFARP ioctls in addition to the address, - * ip_if->ipif_ill also needs to be matched. - */ -boolean_t -ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst) -{ - ill_t *ill; - nce_t *nce; - - ill = (ipif ? ipif->ipif_ill : NULL); - - if (ill != NULL) { - /* - * clean up the nce (and any relevant ire's) that matches - * on addr and ill. - */ - nce = ndp_lookup_v4(ill, &addr, B_FALSE); - if (nce != NULL) { - ndp_delete(nce); - return (B_TRUE); - } - } else { - /* - * ill is wildcard. clean up all nce's and - * ire's that match on addr - */ - nce_clookup_t cl; - - cl.ncecl_addr = addr; - cl.ncecl_found = B_FALSE; - - ndp_walk_common(ipst->ips_ndp4, NULL, - (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); - - /* - * ncecl_found would be set by ip_nce_clookup_and_delete if - * we found a matching nce. - */ - return (cl.ncecl_found); - } - return (B_FALSE); - -} - -/* Delete the supplied nce if its nce_addr matches the supplied address */ -static void -ip_nce_clookup_and_delete(nce_t *nce, void *arg) -{ - nce_clookup_t *cl = (nce_clookup_t *)arg; - ipaddr_t nce_addr; - - IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); - if (nce_addr == cl->ncecl_addr) { - cl->ncecl_found = B_TRUE; - /* clean up the nce (and any relevant ire's) */ - ndp_delete(nce); - } -} - -/* - * Clean up the radix node for this ire. Must be called by IRB_REFRELE + * Clean up the radix node for this ire. Must be called by irb_refrele * when there are no ire's left in the bucket. Returns TRUE if the bucket * is deleted and freed. */ @@ -3335,40 +1459,55 @@ irb_inactive(irb_t *irb) /* * Delete the specified IRE. + * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was + * not incremented i.e., that the insertion in the bucket and the increment + * of that counter is done atomically. 
*/ void ire_delete(ire_t *ire) { ire_t *ire1; ire_t **ptpn; - irb_t *irb; + irb_t *irb; + nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; + /* We can clear ire_nce_cache under ire_lock even if the IRE is used */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + ire->ire_nce_cache = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + if ((irb = ire->ire_bucket) == NULL) { /* * It was never inserted in the list. Should call REFRELE * to free this IRE. */ - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); return; } - rw_enter(&irb->irb_lock, RW_WRITER); - - if (irb->irb_rr_origin == ire) { - irb->irb_rr_origin = NULL; - } - /* - * In case of V4 we might still be waiting for fastpath ack. + * Move the use counts from an IRE_IF_CLONE to its parent + * IRE_INTERFACE. + * We need to do this before acquiring irb_lock. */ - if (ire->ire_ipversion == IPV4_VERSION && - (ire->ire_type == IRE_CACHE || - (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { - ASSERT(ire->ire_nce != NULL); - nce_fastpath_list_delete(ire->ire_nce); + if (ire->ire_type & IRE_IF_CLONE) { + ire_t *parent; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if ((parent = ire->ire_dep_parent) != NULL) { + parent->ire_ob_pkt_count += ire->ire_ob_pkt_count; + parent->ire_ib_pkt_count += ire->ire_ib_pkt_count; + ire->ire_ob_pkt_count = 0; + ire->ire_ib_pkt_count = 0; + } + rw_exit(&ipst->ips_ire_dep_lock); } + rw_enter(&irb->irb_lock, RW_WRITER); if (ire->ire_ptpn == NULL) { /* * Some other thread has removed us from the list. @@ -3378,13 +1517,17 @@ ire_delete(ire_t *ire) return; } - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - irb->irb_ire_cnt--; - ire->ire_marks |= IRE_MARK_CONDEMNED; - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb->irb_tmp_ire_cnt--; - ire->ire_marks &= ~IRE_MARK_TEMPORARY; + if (!IRE_IS_CONDEMNED(ire)) { + /* Is this an IRE representing multiple duplicate entries? 
*/ + ASSERT(ire->ire_identical_ref >= 1); + if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) { + /* Removed one of the identical parties */ + rw_exit(&irb->irb_lock); + return; } + + irb->irb_ire_cnt--; + ire_make_condemned(ire); } if (irb->irb_refcnt != 0) { @@ -3419,22 +1562,9 @@ ire_delete(ire_t *ire) } else { BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); } - /* - * ip_wput/ip_wput_v6 checks this flag to see whether - * it should still use the cached ire or not. - */ - if (ire->ire_type == IRE_DEFAULT) { - /* - * IRE is out of the list. We need to adjust the - * accounting before we drop the lock. - */ - if (ire->ire_ipversion == IPV6_VERSION) { - ASSERT(ipst->ips_ipv6_ire_default_count != 0); - ipst->ips_ipv6_ire_default_count--; - } - } rw_exit(&irb->irb_lock); + /* Cleanup dependents and related stuff */ if (ire->ire_ipversion == IPV6_VERSION) { ire_delete_v6(ire); } else { @@ -3444,7 +1574,7 @@ ire_delete(ire_t *ire) * We removed it from the list. Decrement the * reference count. */ - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); } /* @@ -3463,8 +1593,7 @@ ire_delete_v4(ire_t *ire) ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV4_VERSION); - if (ire->ire_type != IRE_CACHE) - ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); + ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away @@ -3473,20 +1602,33 @@ ire_delete_v4(ire_t *ire) */ ire_delete_host_redirects(ire->ire_gateway_addr, ipst); } + + /* + * If we are deleting an IRE_INTERFACE then we make sure we also + * delete any IRE_IF_CLONE that has been created from it. + * Those are always in ire_dep_children. 
+ */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL) + ire_dep_delete_if_clone(ire); + + /* Remove from parent dependencies and child */ + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + if (ire->ire_dep_parent != NULL) + ire_dep_remove(ire); + + while (ire->ire_dep_children != NULL) + ire_dep_remove(ire->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * IRE_REFRELE/ire_refrele are the only caller of the function. It calls + * ire_refrele is the only caller of the function. It calls * to free the ire when the reference count goes to zero. */ void ire_inactive(ire_t *ire) { - nce_t *nce; - ill_t *ill = NULL; - ill_t *stq_ill = NULL; - ipif_t *ipif; - boolean_t need_wakeup = B_FALSE; + ill_t *ill; irb_t *irb; ip_stack_t *ipst = ire->ire_ipst; @@ -3494,128 +1636,71 @@ ire_inactive(ire_t *ire) ASSERT(ire->ire_ptpn == NULL); ASSERT(ire->ire_next == NULL); + /* Count how many condemned ires for kmem_cache callback */ + if (IRE_IS_CONDEMNED(ire)) + atomic_add_32(&ipst->ips_num_ire_condemned, -1); + if (ire->ire_gw_secattr != NULL) { ire_gw_secattr_free(ire->ire_gw_secattr); ire->ire_gw_secattr = NULL; } - if (ire->ire_mp != NULL) { - ASSERT(ire->ire_bucket == NULL); - mutex_destroy(&ire->ire_lock); - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); - if (ire->ire_nce != NULL) - NCE_REFRELE_NOTR(ire->ire_nce); - freeb(ire->ire_mp); - return; - } - - if ((nce = ire->ire_nce) != NULL) { - NCE_REFRELE_NOTR(nce); - ire->ire_nce = NULL; - } - - if (ire->ire_ipif == NULL) - goto end; - - ipif = ire->ire_ipif; - ill = ipif->ipif_ill; + /* + * ire_nce_cache is cleared in ire_delete, and we make sure we don't + * set it once the ire is marked condemned. + */ + ASSERT(ire->ire_nce_cache == NULL); - if (ire->ire_bucket == NULL) { - /* The ire was never inserted in the table. */ - goto end; - } + /* + * Since any parent would have a refhold on us they would already + * have been removed. 
+ */ + ASSERT(ire->ire_dep_parent == NULL); + ASSERT(ire->ire_dep_sib_next == NULL); + ASSERT(ire->ire_dep_sib_ptpn == NULL); /* - * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is - * non-null ill_ire_count also goes down by 1. - * - * The ipif that is associated with an ire is ire->ire_ipif and - * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call - * ipif_ill_refrele_tail. Usually stq_ill is null or the same as - * ire->ire_ipif->ipif_ill. So nothing more needs to be done. - * However, for VNI or IPMP IRE entries, stq_ill can be different. - * If this is different from ire->ire_ipif->ipif_ill and if the - * ill_ire_cnt on the stq_ill also has dropped to zero, we call - * ipif_ill_refrele_tail on the stq_ill. + * Since any children would have a refhold on us they should have + * already been removed. */ - if (ire->ire_stq != NULL) - stq_ill = ire->ire_stq->q_ptr; + ASSERT(ire->ire_dep_children == NULL); - if (stq_ill == NULL || stq_ill == ill) { - /* Optimize the most common case */ + /* + * ill_ire_ref is increased when the IRE is inserted in the + * bucket - not when the IRE is created. + */ + irb = ire->ire_bucket; + ill = ire->ire_ill; + if (irb != NULL && ill != NULL) { mutex_enter(&ill->ill_lock); - ASSERT(ipif->ipif_ire_cnt != 0); - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, + ASSERT(ill->ill_ire_cnt != 0); + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, (char *), "ire", (void *), ire); - ipif->ipif_ire_cnt--; - if (IPIF_DOWN_OK(ipif)) - need_wakeup = B_TRUE; - if (stq_ill != NULL) { - ASSERT(stq_ill->ill_ire_cnt != 0); - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt--; - if (ILL_DOWN_OK(stq_ill)) - need_wakeup = B_TRUE; - } - if (need_wakeup) { + ill->ill_ire_cnt--; + if (ILL_DOWN_OK(ill)) { /* Drops the ill lock */ ipif_ill_refrele_tail(ill); } else { mutex_exit(&ill->ill_lock); } - } else { - /* - * We can't grab all the ill locks at the same time. 
- * It can lead to recursive lock enter in the call to - * ipif_ill_refrele_tail and later. Instead do it 1 at - * a time. - */ - mutex_enter(&ill->ill_lock); - ASSERT(ipif->ipif_ire_cnt != 0); - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, - (char *), "ire", (void *), ire); - ipif->ipif_ire_cnt--; - if (IPIF_DOWN_OK(ipif)) { - /* Drops the lock */ - ipif_ill_refrele_tail(ill); - } else { - mutex_exit(&ill->ill_lock); - } - if (stq_ill != NULL) { - mutex_enter(&stq_ill->ill_lock); - ASSERT(stq_ill->ill_ire_cnt != 0); - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt--; - if (ILL_DOWN_OK(stq_ill)) { - /* Drops the ill lock */ - ipif_ill_refrele_tail(stq_ill); - } else { - mutex_exit(&stq_ill->ill_lock); - } - } } -end: - /* This should be true for both V4 and V6 */ + ire->ire_ill = NULL; - if ((ire->ire_type & IRE_FORWARDTABLE) && - (ire->ire_ipversion == IPV4_VERSION) && - ((irb = ire->ire_bucket) != NULL)) { + /* This should be true for both V4 and V6 */ + if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) { rw_enter(&irb->irb_lock, RW_WRITER); irb->irb_nire--; /* * Instead of examining the conditions for freeing * the radix node here, we do it by calling - * IRB_REFRELE which is a single point in the code + * irb_refrele which is a single point in the code * that embeds that logic. Bump up the refcnt to - * be able to call IRB_REFRELE + * be able to call irb_refrele */ - IRB_REFHOLD_LOCKED(irb); + irb_refhold_locked(irb); rw_exit(&irb->irb_lock); - IRB_REFRELE(irb); + irb_refrele(irb); } - ire->ire_ipif = NULL; #ifdef DEBUG ire_trace_cleanup(ire); @@ -3626,333 +1711,276 @@ end: } else { BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); } - ASSERT(ire->ire_mp == NULL); - /* Has been allocated out of the cache */ kmem_cache_free(ire_cache, ire); } /* - * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect - * entries that have a given gateway address. 
+ * ire_update_generation is the callback function provided by + * ire_get_bucket() to update the generation number of any + * matching shorter route when a new route is added. + * + * This fucntion always returns a failure return (B_FALSE) + * to force the caller (rn_matchaddr_args) + * to back-track up the tree looking for shorter matches. + */ +/* ARGSUSED */ +static boolean_t +ire_update_generation(struct radix_node *rn, void *arg) +{ + struct rt_entry *rt = (struct rt_entry *)rn; + + /* We need to handle all in the same bucket */ + irb_increment_generation(&rt->rt_irb); + return (B_FALSE); +} + +/* + * Take care of all the generation numbers in the bucket. */ void -ire_delete_cache_gw(ire_t *ire, char *cp) +irb_increment_generation(irb_t *irb) { - ipaddr_t gw_addr; + ire_t *ire; - if (!(ire->ire_type & IRE_CACHE) && - !(ire->ire_flags & RTF_DYNAMIC)) + if (irb == NULL || irb->irb_ire_cnt == 0) return; - bcopy(cp, &gw_addr, sizeof (gw_addr)); - if (ire->ire_gateway_addr == gw_addr) { - ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", - (int)ntohl(ire->ire_addr), ire->ire_type, - (int)ntohl(ire->ire_gateway_addr))); - ire_delete(ire); + irb_refhold(irb); + for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { + if (!IRE_IS_CONDEMNED(ire)) + ire_increment_generation(ire); /* Ourselves */ + ire_dep_incr_generation(ire); /* Dependants */ } + irb_refrele(irb); } /* - * Remove all IRE_CACHE entries that match the ire specified. + * When an IRE is added or deleted this routine is called to make sure + * any caching of IRE information is notified or updated. * * The flag argument indicates if the flush request is due to addition - * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). - * - * This routine takes only the IREs from the forwarding table and flushes - * the corresponding entries from the cache table. 
- * - * When flushing due to the deletion of an old route, it - * just checks the cache handles (ire_phandle and ire_ihandle) and - * deletes the ones that match. - * - * When flushing due to the creation of a new route, it checks - * if a cache entry's address matches the one in the IRE and - * that the cache entry's parent has a less specific mask than the - * one in IRE. The destination of such a cache entry could be the - * gateway for other cache entries, so we need to flush those as - * well by looking for gateway addresses matching the IRE's address. + * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), + * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v4(ire_t *ire, int flag) { - int i; - ire_t *cire; - irb_t *irb; - ip_stack_t *ipst = ire->ire_ipst; + irb_t *irb = ire->ire_bucket; + struct rt_entry *rt = IRB2RT(irb); + ip_stack_t *ipst = ire->ire_ipst; - if (ire->ire_type & IRE_CACHE) + /* + * IRE_IF_CLONE ire's don't provide any new information + * than the parent from which they are cloned, so don't + * perturb the generation numbers. + */ + if (ire->ire_type & IRE_IF_CLONE) return; /* - * If a default is just created, there is no point - * in going through the cache, as there will not be any - * cached ires. + * Ensure that an ire_add during a lookup serializes the updates of the + * generation numbers under the radix head lock so that the lookup gets + * either the old ire and old generation number, or a new ire and new + * generation number. + */ + RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); + + /* + * If a route was just added, we need to notify everybody that + * has cached an IRE_NOROUTE since there might now be a better + * route for them. 
*/ - if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) - return; if (flag == IRE_FLUSH_ADD) { + ire_increment_generation(ipst->ips_ire_reject_v4); + ire_increment_generation(ipst->ips_ire_blackhole_v4); + } + + /* Adding a default can't otherwise provide a better route */ + if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return; + } + + switch (flag) { + case IRE_FLUSH_DELETE: + case IRE_FLUSH_GWCHANGE: /* - * This selective flush is due to the addition of - * new IRE. + * Update ire_generation for all ire_dep_children chains + * starting with this IRE */ - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - /* - * If 'cire' belongs to the same subnet - * as the new ire being added, and 'cire' - * is derived from a prefix that is less - * specific than the new ire being added, - * we need to flush 'cire'; for instance, - * when a new interface comes up. - */ - if (((cire->ire_addr & ire->ire_mask) == - (ire->ire_addr & ire->ire_mask)) && - (ip_mask_to_plen(cire->ire_cmask) <= - ire->ire_masklen)) { - ire_delete(cire); - continue; - } - /* - * This is the case when the ire_gateway_addr - * of 'cire' belongs to the same subnet as - * the new ire being added. - * Flushing such ires is sometimes required to - * avoid misrouting: say we have a machine with - * two interfaces (I1 and I2), a default router - * R on the I1 subnet, and a host route to an - * off-link destination D with a gateway G on - * the I2 subnet. - * Under normal operation, we will have an - * on-link cache entry for G and an off-link - * cache entry for D with G as ire_gateway_addr, - * traffic to D will reach its destination - * through gateway G. 
- * If the administrator does 'ifconfig I2 down', - * the cache entries for D and G will be - * flushed. However, G will now be resolved as - * an off-link destination using R (the default - * router) as gateway. Then D will also be - * resolved as an off-link destination using G - * as gateway - this behavior is due to - * compatibility reasons, see comment in - * ire_ihandle_lookup_offlink(). Traffic to D - * will go to the router R and probably won't - * reach the destination. - * The administrator then does 'ifconfig I2 up'. - * Since G is on the I2 subnet, this routine - * will flush its cache entry. It must also - * flush the cache entry for D, otherwise - * traffic will stay misrouted until the IRE - * times out. - */ - if ((cire->ire_gateway_addr & ire->ire_mask) == - (ire->ire_addr & ire->ire_mask)) { - ire_delete(cire); - continue; - } - } - IRB_REFRELE(irb); - } - } else { + ire_dep_incr_generation(ire); + break; + case IRE_FLUSH_ADD: /* - * delete the cache entries based on - * handle in the IRE as this IRE is - * being deleted/changed. + * Update the generation numbers of all shorter matching routes. + * ire_update_generation takes care of the dependants by + * using ire_dep_incr_generation. */ - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - if ((cire->ire_phandle == 0 || - cire->ire_phandle != ire->ire_phandle) && - (cire->ire_ihandle == 0 || - cire->ire_ihandle != ire->ire_ihandle)) - continue; - ire_delete(cire); - } - IRB_REFRELE(irb); - } + (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst, + ipst->ips_ip_ftable, ire_update_generation, NULL); + break; } + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); } /* * Matches the arguments passed with the values in the ire. 
* - * Note: for match types that match using "ipif" passed in, ipif + * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. */ boolean_t ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, - const ts_label_t *tsl, int match_flags, queue_t *wq) + int type, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags) { ill_t *ire_ill = NULL, *dst_ill; - ill_t *ipif_ill = NULL; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); ASSERT((!(match_flags & MATCH_IRE_ILL)) || - (ipif != NULL && !ipif->ipif_isv6)); - ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); + (ill != NULL && !ill->ill_isv6)); /* - * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it - * is in fact hidden, to ensure the caller gets the right one. One - * exception: if the caller passed MATCH_IRE_IHANDLE, then they - * already know the identity of the given IRE_INTERFACE entry and - * there's no point trying to hide it from them. + * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is + * in fact hidden, to ensure the caller gets the right one. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { - if (match_flags & MATCH_IRE_IHANDLE) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + if (ire->ire_testhidden) { + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } - /* - * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option - * is used. In that case the routing table is bypassed and the - * packets are sent directly to the specified nexthop. The - * IRE_CACHE entry representing this route should be marked - * with IRE_MARK_PRIVATE_ADDR. 
- */ - - if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && - (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) - return (B_FALSE); - if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* - * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is - * valid and does not match that of ire_zoneid, a failure to + * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid + * does not match that of ire_zoneid, a failure to * match is reported at this point. Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * - * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK + * IRE_LOOPBACK * entries should never be matched in this situation. + * Each zone has its own IRE_LOOPBACK. + * + * IRE_LOCAL + * We allow them for any zoneid. ire_route_recursive + * does additional checks when + * ip_restrict_interzone_loopback is set. * - * IRE entries that have an interface associated with them - * should in general not match unless they are an IRE_LOCAL - * or in the case when MATCH_IRE_DEFAULT has been set in - * the caller. In the case of the former, checking of the - * other fields supplied should take place. + * If ill_usesrc_ifindex is set + * Then we check if the zone has a valid source address + * on the usesrc ill. * - * In the case where MATCH_IRE_DEFAULT has been set, - * all of the ipif's associated with the IRE's ill are - * checked to see if there is a matching zoneid. If any - * one ipif has a matching zoneid, this IRE is a - * potential candidate so checking of the other fields - * takes place. + * If ire_ill is set, then check that the zone has an ipif + * on that ill. 
* - * In the case where the IRE_INTERFACE has a usable source - * address (indicated by ill_usesrc_ifindex) in the - * correct zone then it's permitted to return this IRE + * Outside of this function (in ire_round_robin) we check + * that any IRE_OFFLINK has a gateway that reachable from the + * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); - if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) + if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); + + if (ire->ire_type & IRE_LOCAL) + goto matchit; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. + * The normal case of IRE_ONLINK has a matching zoneid. + * Here we handle the case when shared-IP zones have been + * configured with IP addresses on vniN. In that case it + * is ok for traffic from a zone to use IRE_ONLINK routes + * if the ill has a usesrc pointing at vniN */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - dst_ill = (ill_t *)ire->ire_stq->q_ptr; + dst_ill = ire->ire_ill; + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + + /* + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. 
+ */ + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); + /* * If there is a usable source address in the - * zone, then it's ok to return an - * IRE_INTERFACE + * zone, then it's ok to return this IRE_INTERFACE */ - if (ipif_usesrc_avail(dst_ill, zoneid)) { - ip3dbg(("ire_match_args: dst_ill %p match %d\n", - (void *)dst_ill, - (ire->ire_addr == (addr & mask)))); - } else { - ip3dbg(("ire_match_args: src_ipif NULL" + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } - if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && - !(ire->ire_type & IRE_INTERFACE)) { + /* + * For exampe, with + * route add 11.0.0.0 gw1 -ifp bge0 + * route add 11.0.0.0 gw2 -ifp bge1 + * this code would differentiate based on + * where the sending zone has addresses. + * Only if the zone has an address on bge0 can it use the first + * route. It isn't clear if this behavior is documented + * anywhere. + */ + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; - if ((match_flags & MATCH_IRE_DEFAULT) == 0) { - return (B_FALSE); - } - mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); - for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { - if (IPIF_CAN_LOOKUP(tipif) && + if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } - mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); + mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) { return (B_FALSE); } } } - /* - * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to - * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means - * of getting a source address -- i.e., ire_src_addr == - * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. 
- * - * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. - * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for - * IPMP test traffic), then the ill must match exactly. - */ +matchit: if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire_to_ill(ire); - ipif_ill = ipif->ipif_ill; + ire_ill = ire->ire_ill; + + /* + * If asked to match an ill, we *must* match + * on the ire_ill for ipmp test addresses, or + * any of the ill in the group for data addresses. + * If we don't, we may as well fail. + * However, we need an exception for IRE_LOCALs to ensure + * we loopback packets even sent to test addresses on different + * interfaces in the group. + */ + if ((match_flags & MATCH_IRE_TESTHIDDEN) && + !(ire->ire_type & IRE_LOCAL)) { + if (ire->ire_ill != ill) + return (B_FALSE); + } else { + match_flags &= ~MATCH_IRE_TESTHIDDEN; + /* + * We know that ill is not NULL, but ire_ill could be + * NULL + */ + if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) + return (B_FALSE); + } } if ((ire->ire_addr == (addr & mask)) && ((!(match_flags & MATCH_IRE_GW)) || (ire->ire_gateway_addr == gateway)) && - ((!(match_flags & MATCH_IRE_TYPE)) || - (ire->ire_type & type)) && - ((!(match_flags & MATCH_IRE_SRC)) || - (ire->ire_src_addr == ipif->ipif_src_addr)) && - ((!(match_flags & MATCH_IRE_IPIF)) || - (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || - (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && - ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && - ((!(match_flags & MATCH_IRE_WQ)) || - (ire->ire_stq == wq)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill || - (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && - ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && - ((!(match_flags & MATCH_IRE_IHANDLE)) || - (ire->ire_ihandle == ihandle)) && - ((!(match_flags & MATCH_IRE_MASK)) || - (ire->ire_mask == mask)) && + ((!(match_flags 
& MATCH_IRE_TYPE)) || (ire->ire_type & type)) && + ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && + ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -3963,494 +1991,207 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, } /* - * Lookup for a route in all the tables + * Check if the IRE_LOCAL uses the same ill as another route would use. + * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, + * then we don't allow this IRE_LOCAL to be used. + * We always return an IRE; will be RTF_REJECT if no route available. */ ire_t * -ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, - const ts_label_t *tsl, int flags, ip_stack_t *ipst) +ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl, + const ill_t *ill, uint_t *generationp) { - ire_t *ire = NULL; + ip_stack_t *ipst = ire->ire_ipst; + ire_t *alt_ire; + uint_t ire_type; + uint_t generation; + uint_t match_flags; - /* - * ire_match_args() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. - */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + ASSERT(ire->ire_type & IRE_LOCAL); + ASSERT(ire->ire_ill != NULL); /* - * might be asking for a cache lookup, - * This is not best way to lookup cache, - * user should call ire_cache_lookup directly. - * - * If MATCH_IRE_TYPE was set, first lookup in the cache table and then - * in the forwarding table, if the applicable type flags were set. + * Need to match on everything but local. + * This might result in the creation of a IRE_IF_CLONE for the + * same address as the IRE_LOCAL when restrict_interzone_loopback is + * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted + * to make sure the IRE_LOCAL is always found first. 
*/ - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { - ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, - tsl, flags, ipst); - if (ire != NULL) - return (ire); + ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK); + match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; + if (ill != NULL) + match_flags |= MATCH_IRE_ILL; + + if (ire->ire_ipversion == IPV4_VERSION) { + alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type, + ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, + &generation); + } else { + alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type, + ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, + &generation); } - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { - ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, - zoneid, 0, tsl, flags, ipst); + ASSERT(alt_ire != NULL); + + if (alt_ire->ire_ill == ire->ire_ill) { + /* Going out the same ILL - ok to send to IRE_LOCAL */ + ire_refrele(alt_ire); + } else { + /* Different ill - ignore IRE_LOCAL */ + ire_refrele(ire); + ire = alt_ire; + if (generationp != NULL) + *generationp = generation; } return (ire); } -/* - * Delete the IRE cache for the gateway and all IRE caches whose - * ire_gateway_addr points to this gateway, and allow them to - * be created on demand by ip_newroute. 
- */ -void -ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) +boolean_t +ire_find_zoneid(struct radix_node *rn, void *arg) { + struct rt_entry *rt = (struct rt_entry *)rn; irb_t *irb; ire_t *ire; + ire_ftable_args_t *margs = arg; - irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, - ipst->ips_ip_cache_table_size)]; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - ASSERT(ire->ire_mask == IP_HOST_MASK); - if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, - NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) { - ire_delete(ire); - } - } - IRB_REFRELE(irb); + ASSERT(rt != NULL); - ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst); -} + irb = &rt->rt_irb; -/* - * Looks up cache table for a route. - * specific lookup can be indicated by - * passing the MATCH_* flags and the - * necessary parameters. - */ -ire_t * -ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, - zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) -{ - ire_ctable_args_t margs; - - margs.ict_addr = &addr; - margs.ict_gateway = &gateway; - margs.ict_type = type; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = tsl; - margs.ict_flags = flags; - margs.ict_ipst = ipst; - margs.ict_wq = NULL; - - return (ip4_ctable_lookup_impl(&margs)); -} + if (irb->irb_ire_cnt == 0) + return (B_FALSE); -/* - * Check whether the IRE_LOCAL and the IRE potentially used to transmit - * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical - * or part of the same illgrp. (In the IPMP case, usually the two IREs - * will both belong to the IPMP ill, but exceptions are possible -- e.g. - * if IPMP test addresses are on their own subnet.) 
- */ -boolean_t -ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) -{ - ill_t *recv_ill, *xmit_ill; + rw_enter(&irb->irb_lock, RW_READER); + for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { + if (IRE_IS_CONDEMNED(ire)) + continue; - ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); - ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); + if (ire->ire_zoneid != ALL_ZONES && + ire->ire_zoneid != margs->ift_zoneid) + continue; - recv_ill = ire_to_ill(ire_local); - xmit_ill = ire_to_ill(xmit_ire); + if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill) + continue; - ASSERT(recv_ill != NULL); - ASSERT(xmit_ill != NULL); + if (is_system_labeled() && + tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0) + continue; - return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); + rw_exit(&irb->irb_lock); + return (B_TRUE); + } + rw_exit(&irb->irb_lock); + return (B_FALSE); } /* - * Check if the IRE_LOCAL uses the same ill as another route would use. - * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, - * then we don't allow this IRE_LOCAL to be used. + * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified + * gateway address. If ill is non-NULL we also match on it. + * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
*/ boolean_t -ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, - const ts_label_t *tsl, ip_stack_t *ipst) +ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill, + const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { - ire_t *alt_ire; - boolean_t rval; - int flags; + struct rt_sockaddr rdst; + struct rt_entry *rt; + ire_ftable_args_t margs; - flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; + ASSERT(ill == NULL || !ill->ill_isv6); + if (lock_held) + ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock)); + else + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); - if (ire_local->ire_ipversion == IPV4_VERSION) { - alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, - NULL, zoneid, 0, tsl, flags, ipst); - } else { - alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, - NULL, zoneid, 0, tsl, flags, ipst); - } + rdst.rt_sin_len = sizeof (rdst); + rdst.rt_sin_family = AF_INET; + rdst.rt_sin_addr.s_addr = gateway; - if (alt_ire == NULL) - return (B_FALSE); + /* + * We only use margs for ill, zoneid, and tsl matching in + * ire_find_zoneid + */ + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_ill = ill; + margs.ift_zoneid = zoneid; + margs.ift_tsl = tsl; + rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, + ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs); - if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - ire_refrele(alt_ire); - return (B_FALSE); - } - rval = ire_local_same_lan(ire_local, alt_ire); + if (!lock_held) + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - ire_refrele(alt_ire); - return (rval); + return (rt != NULL); } /* - * Lookup cache - * - * In general the zoneid has to match (where ALL_ZONES match all of them). - * But for IRE_LOCAL we also need to handle the case where L2 should - * conceptually loop back the packet. 
This is necessary since neither - * Ethernet drivers nor Ethernet hardware loops back packets sent to their - * own MAC address. This loopback is needed when the normal - * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill as the ill with which this IRE_LOCAL is associated. - * - * Earlier versions of this code always matched an IRE_LOCAL independently of - * the zoneid. We preserve that earlier behavior when - * ip_restrict_interzone_loopback is turned off. + * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs. + * The fraction argument tells us what fraction of the IREs to delete. + * Common for IPv4 and IPv6. + * Used when memory backpressure. */ -ire_t * -ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, - ip_stack_t *ipst) +static void +ire_delete_reclaim(ire_t *ire, char *arg) { - irb_t *irb_ptr; - ire_t *ire; - - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, - ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { - continue; - } - if (ire->ire_addr == addr) { - /* - * Finally, check if the security policy has any - * restriction on using this route for the specified - * message. 
- */ - if (tsl != NULL && - ire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(ire, tsl) != 0) { - continue; - } + ip_stack_t *ipst = ire->ire_ipst; + uint_t fraction = *(uint_t *)arg; + uint_t rand; - if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || - ire->ire_zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + if ((ire->ire_flags & RTF_DYNAMIC) || + (ire->ire_type & IRE_IF_CLONE)) { - if (ire->ire_type == IRE_LOCAL) { - if (ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, zoneid, - &addr, tsl, ipst)) - continue; + /* Pick a random number */ + rand = (uint_t)lbolt + + IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256); - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + /* Use truncation */ + if ((rand/fraction)*fraction == rand) { + IP_STAT(ipst, ip_ire_reclaim_deleted); + ire_delete(ire); } } - rw_exit(&irb_ptr->irb_lock); - return (NULL); -} -ire_t * -ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) -{ - irb_t *irb_ptr; - ire_t *ire; - - /* - * Look for an ire in the cachetable whose - * ire_addr matches the destination. - * Since we are being called by forwarding fastpath - * no need to check for Trusted Solaris label. - */ - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( - dst, ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | - IRE_MARK_PRIVATE_ADDR)) { - continue; - } - if (ire->ire_addr == dst) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - rw_exit(&irb_ptr->irb_lock); - return (NULL); } /* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. + * kmem_cache callback to free up memory. * - * We are trying to create the cache ire for an offlink destn based - * on the cache ire of the gateway in 'cire'. 
'pire' is the prefix ire - * as found by ip_newroute(). We are called from ip_newroute() in - * the IRE_CACHE case. + * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically + * (RTF_DYNAMIC and IRE_IF_CLONE). */ -ire_t * -ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) +static void +ip_ire_reclaim_stack(ip_stack_t *ipst) { - ire_t *ire; - int match_flags; - ipaddr_t gw_addr; - ipif_t *gw_ipif; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL && pire != NULL); - - /* - * We don't need to specify the zoneid to ire_ftable_lookup() below - * because the ihandle refers to an ipif which can be in only one zone. - */ - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute() created 'cire' for the gateway it set its - * cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, - IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * Assume we are trying to ping some offlink destn, and we have the - * routing table below. - * - * Eg. default - gw1 <--- pire (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * If we already have a cache ire for gw1 in 'cire', the - * ire_ftable_lookup above would have failed, since there is no - * interface ire to reach gw1. We will fallthru below. - * - * Here we duplicate the steps that ire_ftable_lookup() did in - * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. - * The differences are the following - * i. 
We want the interface ire only, so we call ire_ftable_lookup() - * instead of ire_route_lookup() - * ii. We look for only prefix routes in the 1st call below. - * ii. We want to match on the ihandle in the 2nd call below. - */ - match_flags = MATCH_IRE_TYPE; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, - pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (ire == NULL) - return (NULL); - /* - * At this point 'ire' corresponds to the entry shown in line 2. - * gw_addr is 'gw2' in the example above. - */ - gw_addr = ire->ire_gateway_addr; - gw_ipif = ire->ire_ipif; - ire_refrele(ire); + uint_t fraction = ipst->ips_ip_ire_reclaim_fraction; - match_flags |= MATCH_IRE_IHANDLE; - ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, - gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags, - ipst); - return (ire); -} + IP_STAT(ipst, ip_ire_reclaim_calls); -/* - * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER - * ire associated with the specified ipif. - * - * This might occasionally be called when IPIF_UP is not set since - * the IP_MULTICAST_IF as well as creating interface routes - * allows specifying a down ipif (ipif_lookup* match ipifs that are down). - * - * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on - * the ipif, this routine might return NULL. - */ -ire_t * -ipif_to_ire(const ipif_t *ipif) -{ - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; + ire_walk(ire_delete_reclaim, &fraction, ipst); /* - * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN - * so that they aren't accidentally returned. However, if the - * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + * Walk all CONNs that can have a reference on an ire, nce or dce. 
+ * Get them to update any stale references to drop any refholds they + * have. */ - if (IS_UNDER_IPMP(ipif->ipif_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - ASSERT(!ipif->ipif_isv6); - if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), - ipst); - } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { - /* In this case we need to lookup destination address. */ - ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, - ipst); - } else { - ire = ire_ftable_lookup(ipif->ipif_subnet, - ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, match_flags, ipst); - } - return (ire); + ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); } /* - * ire_walk function. - * Count the number of IRE_CACHE entries in different categories. - */ -void -ire_cache_count(ire_t *ire, char *arg) -{ - ire_cache_count_t *icc = (ire_cache_count_t *)arg; - - if (ire->ire_type != IRE_CACHE) - return; - - icc->icc_total++; - - if (ire->ire_ipversion == IPV6_VERSION) { - mutex_enter(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - mutex_exit(&ire->ire_lock); - icc->icc_onlink++; - return; - } - mutex_exit(&ire->ire_lock); - } else { - if (ire->ire_gateway_addr == 0) { - icc->icc_onlink++; - return; - } - } - - ASSERT(ire->ire_ipif != NULL); - if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) - icc->icc_pmtu++; - else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + - ire->ire_ib_pkt_count) - icc->icc_offlink++; - else - icc->icc_unused++; -} - -/* - * ire_walk function called by ip_trash_ire_reclaim(). - * Free a fraction of the IRE_CACHE cache entries. The fractions are - * different for different categories of IRE_CACHE entries. - * A fraction of zero means to not free any in that category. - * Use the hash bucket id plus lbolt as a random number. 
Thus if the fraction - * is N then every Nth hash bucket chain will be freed. + * Called by the memory allocator subsystem directly, when the system + * is running low on memory. */ +/* ARGSUSED */ void -ire_cache_reclaim(ire_t *ire, char *arg) +ip_ire_reclaim(void *args) { - ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; - uint_t rand; - ip_stack_t *ipst = icr->icr_ipst; - - if (ire->ire_type != IRE_CACHE) - return; + netstack_handle_t nh; + netstack_t *ns; - if (ire->ire_ipversion == IPV6_VERSION) { - rand = (uint_t)lbolt + - IRE_ADDR_HASH_V6(ire->ire_addr_v6, - ipst->ips_ip6_cache_table_size); - mutex_enter(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - mutex_exit(&ire->ire_lock); - if (icr->icr_onlink != 0 && - (rand/icr->icr_onlink)*icr->icr_onlink == rand) { - ire_delete(ire); - return; - } - goto done; - } - mutex_exit(&ire->ire_lock); - } else { - rand = (uint_t)lbolt + - IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size); - if (ire->ire_gateway_addr == 0) { - if (icr->icr_onlink != 0 && - (rand/icr->icr_onlink)*icr->icr_onlink == rand) { - ire_delete(ire); - return; - } - goto done; - } - } - /* Not onlink IRE */ - ASSERT(ire->ire_ipif != NULL); - if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { - /* Use ptmu fraction */ - if (icr->icr_pmtu != 0 && - (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { - ire_delete(ire); - return; - } - } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + - ire->ire_ib_pkt_count) { - /* Use offlink fraction */ - if (icr->icr_offlink != 0 && - (rand/icr->icr_offlink)*icr->icr_offlink == rand) { - ire_delete(ire); - return; - } - } else { - /* Use unused fraction */ - if (icr->icr_unused != 0 && - (rand/icr->icr_unused)*icr->icr_unused == rand) { - ire_delete(ire); - return; - } + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ip_ire_reclaim_stack(ns->netstack_ip); + netstack_rele(ns); } -done: - /* - * Update tire_mark so that those that haven't been 
used since this - * reclaim will be considered unused next time we reclaim. - */ - ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; + netstack_next_fini(&nh); } static void @@ -4470,14 +2211,21 @@ void ip_ire_g_init() { /* - * Create ire caches, ire_reclaim() - * will give IRE_CACHE back to system when needed. + * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim() + * will give disposable IREs back to system when needed. * This needs to be done here before anything else, since * ire_add() expects the cache to be created. */ ire_cache = kmem_cache_create("ire_cache", - sizeof (ire_t), 0, ip_ire_constructor, - ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); + sizeof (ire_t), 0, NULL, NULL, + ip_ire_reclaim, NULL, NULL, 0); + + ncec_cache = kmem_cache_create("ncec_cache", + sizeof (ncec_t), 0, NULL, NULL, + ip_nce_reclaim, NULL, NULL, 0); + nce_cache = kmem_cache_create("nce_cache", + sizeof (nce_t), 0, NULL, NULL, + NULL, NULL, NULL, 0); rt_entry_cache = kmem_cache_create("rt_entry", sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -4491,104 +2239,65 @@ ip_ire_g_init() void ip_ire_init(ip_stack_t *ipst) { - int i; - uint32_t mem_cnt; - uint32_t cpu_cnt; - uint32_t min_cnt; - pgcnt_t mem_avail; - - /* - * ip_ire_max_bucket_cnt is sized below based on the memory - * size and the cpu speed of the machine. This is upper - * bounded by the compile time value of ip_ire_max_bucket_cnt - * and is lower bounded by the compile time value of - * ip_ire_min_bucket_cnt. Similar logic applies to - * ip6_ire_max_bucket_cnt. - * - * We calculate this for each IP Instances in order to use - * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are - * in effect when the zone is booted. 
- */ - mem_avail = kmem_avail(); - mem_cnt = (mem_avail >> ip_ire_mem_ratio) / - ip_cache_table_size / sizeof (ire_t); - cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; - - min_cnt = MIN(cpu_cnt, mem_cnt); - if (min_cnt < ip_ire_min_bucket_cnt) - min_cnt = ip_ire_min_bucket_cnt; - if (ip_ire_max_bucket_cnt > min_cnt) { - ip_ire_max_bucket_cnt = min_cnt; - } - - mem_cnt = (mem_avail >> ip_ire_mem_ratio) / - ip6_cache_table_size / sizeof (ire_t); - min_cnt = MIN(cpu_cnt, mem_cnt); - if (min_cnt < ip6_ire_min_bucket_cnt) - min_cnt = ip6_ire_min_bucket_cnt; - if (ip6_ire_max_bucket_cnt > min_cnt) { - ip6_ire_max_bucket_cnt = min_cnt; - } + ire_t *ire; + int error; mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); - mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); - /* Calculate the IPv4 cache table size. */ - ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size, - ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / - ip_ire_max_bucket_cnt)); - if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size) - ipst->ips_ip_cache_table_size = ip_max_cache_table_size; /* - * Make sure that the table size is always a power of 2. The - * hash macro IRE_ADDR_HASH() depends on that. + * Make sure that the forwarding table size is a power of 2. + * The IRE*_ADDR_HASH() macroes depend on that. */ - power2_roundup(&ipst->ips_ip_cache_table_size); - - ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size * - sizeof (irb_t), KM_SLEEP); - - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } + ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; + power2_roundup(&ipst->ips_ip6_ftable_hash_size); - /* Calculate the IPv6 cache table size. 
*/ - ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size, - ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / - ip6_ire_max_bucket_cnt)); - if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size) - ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size; /* - * Make sure that the table size is always a power of 2. The - * hash macro IRE_ADDR_HASH_V6() depends on that. + * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6. + * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has + * RTF_BLACKHOLE set. We use the latter for transient errors such + * as memory allocation failures and tripping on IRE_IS_CONDEMNED + * entries. */ - power2_roundup(&ipst->ips_ip6_cache_table_size); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_REJECT|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_reject_v4 = ire; - ipst->ips_ip_cache_table_v6 = kmem_zalloc( - ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_REJECT|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_reject_v6 = ire; - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_BLACKHOLE|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_blackhole_v4 = ire; - /* - * Make sure that the forwarding table size is a power of 2. - * The IRE*_ADDR_HASH() macroes depend on that. 
- */ - ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; - power2_roundup(&ipst->ips_ip6_ftable_hash_size); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_BLACKHOLE|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_blackhole_v6 = ire; - ipst->ips_ire_handle = 1; + rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL); + rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL); } void ip_ire_g_fini(void) { kmem_cache_destroy(ire_cache); + kmem_cache_destroy(ncec_cache); + kmem_cache_destroy(nce_cache); kmem_cache_destroy(rt_entry_cache); rn_fini(); @@ -4599,9 +2308,21 @@ ip_ire_fini(ip_stack_t *ipst) { int i; + rw_destroy(&ipst->ips_ire_dep_lock); + rw_destroy(&ipst->ips_ip6_ire_head_lock); + + ire_refrele_notr(ipst->ips_ire_reject_v6); + ipst->ips_ire_reject_v6 = NULL; + ire_refrele_notr(ipst->ips_ire_reject_v4); + ipst->ips_ire_reject_v4 = NULL; + ire_refrele_notr(ipst->ips_ire_blackhole_v6); + ipst->ips_ire_blackhole_v6 = NULL; + ire_refrele_notr(ipst->ips_ire_blackhole_v4); + ipst->ips_ire_blackhole_v4 = NULL; + /* * Delete all IREs - assumes that the ill/ipifs have - * been removed so what remains are just the ftable and IRE_CACHE. + * been removed so what remains are just the ftable to handle. 
*/ ire_walk(ire_delete, NULL, ipst); @@ -4609,23 +2330,6 @@ ip_ire_fini(ip_stack_t *ipst) ipst->ips_ip_ftable = NULL; mutex_destroy(&ipst->ips_ire_ft_init_lock); - mutex_destroy(&ipst->ips_ire_handle_lock); - - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); - rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); - } - kmem_free(ipst->ips_ip_cache_table, - ipst->ips_ip_cache_table_size * sizeof (irb_t)); - ipst->ips_ip_cache_table = NULL; - - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); - rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); - } - kmem_free(ipst->ips_ip_cache_table_v6, - ipst->ips_ip6_cache_table_size * sizeof (irb_t)); - ipst->ips_ip_cache_table_v6 = NULL; for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { irb_t *ptr; @@ -4643,1116 +2347,1177 @@ ip_ire_fini(ip_stack_t *ipst) } } +#ifdef DEBUG +void +ire_trace_ref(ire_t *ire) +{ + mutex_enter(&ire->ire_lock); + if (ire->ire_trace_disable) { + mutex_exit(&ire->ire_lock); + return; + } + + if (th_trace_ref(ire, ire->ire_ipst)) { + mutex_exit(&ire->ire_lock); + } else { + ire->ire_trace_disable = B_TRUE; + mutex_exit(&ire->ire_lock); + ire_trace_cleanup(ire); + } +} + +void +ire_untrace_ref(ire_t *ire) +{ + mutex_enter(&ire->ire_lock); + if (!ire->ire_trace_disable) + th_trace_unref(ire); + mutex_exit(&ire->ire_lock); +} + +static void +ire_trace_cleanup(const ire_t *ire) +{ + th_trace_cleanup(ire, ire->ire_trace_disable); +} +#endif /* DEBUG */ + /* - * Check if another multirt route resolution is needed. - * B_TRUE is returned is there remain a resolvable route, - * or if no route for that dst is resolved yet. - * B_FALSE is returned if all routes for that dst are resolved - * or if the remaining unresolved routes are actually not - * resolvable. - * This only works in the global zone. 
+ * Find, or create if needed, the nce_t pointer to the neighbor cache + * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t + * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or + * on the next available under-ill (selected by the IPMP rotor) in the + * unicast IPMP case. + * + * If a neighbor-cache entry has to be created (i.e., one does not already + * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache + * entry are initialized in nce_add_v4(). The broadcast, multicast, and + * link-layer type determine the contents of {ncec_state, ncec_lladdr} of + * the ncec_t created. The ncec_lladdr is non-null for all link types with + * non-zero ill_phys_addr_length, though the contents may be zero in cases + * where the link-layer type is not known at the time of creation + * (e.g., IRE_IFRESOLVER links) + * + * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the nce_lladr + * has the physical broadcast address of the outgoing interface. + * For unicast ire entries, + * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created + * ncec_t with 0 nce_lladr contents, and will be in the ND_INITIAL state. + * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link + * layer resolution is necessary, so that the ncec_t will be in the + * ND_REACHABLE state + * + * The link layer information needed for broadcast addresses, and for + * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that + * never needs re-verification for the lifetime of the ncec_t. These are + * therefore marked NCE_F_NONUD. + * + * The nce returned will be created such that the nce_ill == ill that + * is passed in. Note that the nce itself may not have ncec_ill == ill + * where IPMP links are involved. 
*/ -boolean_t -ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) +static nce_t * +ire_nce_init(ill_t *ill, const void *addr, int ire_type) { - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb; - irb_t *cirb; - int unres_cnt = 0; - boolean_t resolvable = B_FALSE; - - /* Retrieve the first IRE_HOST that matches the destination */ - first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, - NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); - - /* No route at all */ - if (first_fire == NULL) { - return (B_TRUE); + int err; + nce_t *nce = NULL; + uint16_t ncec_flags; + uchar_t *hwaddr; + boolean_t need_refrele = B_FALSE; + ill_t *in_ill = ill; + boolean_t is_unicast; + uint_t hwaddr_len; + + is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0); + if (IS_IPMP(ill) || + ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) { + if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL) + return (NULL); + need_refrele = B_TRUE; } + ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; - firb = first_fire->ire_bucket; - ASSERT(firb != NULL); + switch (ire_type) { + case IRE_BROADCAST: + ASSERT(!ill->ill_isv6); + ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD); + break; + case IRE_MULTICAST: + ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD); + break; + } - /* Retrieve the first IRE_CACHE ire for that destination. */ - first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); + if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) { + hwaddr = ill->ill_dest_addr; + } else { + hwaddr = NULL; + } + hwaddr_len = ill->ill_phys_addr_length; - /* No resolved route. 
*/ - if (first_cire == NULL) { - ire_refrele(first_fire); - return (B_TRUE); +retry: + /* nce_state will be computed by nce_add_common() */ + if (!ill->ill_isv6) { + err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr, + ncec_flags, ND_UNCHANGED, &nce); + } else { + err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr, + ncec_flags, ND_UNCHANGED, &nce); } + switch (err) { + case 0: + break; + case EEXIST: + /* + * When subnets change or partially overlap what was once + * a broadcast address could now be a unicast, or vice versa. + */ + if (((ncec_flags ^ nce->nce_common->ncec_flags) & + NCE_F_BCAST) != 0) { + ASSERT(!ill->ill_isv6); + ncec_delete(nce->nce_common); + nce_refrele(nce); + goto retry; + } + break; + default: + DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err); + if (need_refrele) + ill_refrele(ill); + return (NULL); + } /* - * At least one route is resolved. Here we look through the forward - * and cache tables, to compare the number of declared routes - * with the number of resolved routes. The search for a resolvable - * route is performed only if at least one route remains - * unresolved. + * If the ill was an under-ill of an IPMP group, we need to verify + * that it is still active so that we select an active interface in + * the group. However, since ipmp_ill_is_active ASSERTs for + * IS_UNDER_IPMP(), we first need to verify that the ill is an + * under-ill, and since this is being done in the data path, the + * only way to ascertain this is by holding the ill_g_lock. */ - cirb = first_cire->ire_bucket; - ASSERT(cirb != NULL); - - /* Count the number of routes to that dest that are declared. 
*/ - IRB_REFHOLD(firb); - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; - unres_cnt++; + rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_phyint->phyint_lock); + if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { + /* + * need_refrele implies that the under ill was selected by + * ipmp_ill_get_xmit_ill() because either the in_ill was an + * ipmp_ill, or we are sending a non-unicast packet on + * an under_ill. However, when we get here, the ill selected by + * ipmp_ill_get_xmit_ill was pulled out of the active set + * (for unicast) or cast_ill nomination (for + * !unicast) after it was picked as the outgoing ill. + * We have to pick an active interface and/or cast_ill in the + * group. + */ + mutex_exit(&ill->ill_phyint->phyint_lock); + nce_delete(nce); + mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_ipst->ips_ill_g_lock); + nce_refrele(nce); + ill_refrele(ill); + if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL) + return (NULL); + goto retry; + } else { + mutex_exit(&ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_ipst->ips_ill_g_lock); } - IRB_REFRELE(firb); +done: + ASSERT(nce->nce_ill == ill); + if (need_refrele) + ill_refrele(ill); + return (nce); +} - /* Then subtract the number of routes to that dst that are resolved */ - IRB_REFHOLD(cirb); - for (cire = first_cire; cire != NULL; cire = cire->ire_next) { - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - unres_cnt--; - } - IRB_REFRELE(cirb); +nce_t * +arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type) +{ + return (ire_nce_init(ill, &addr4, ire_type)); +} - /* At least one route is unresolved; search for a resolvable route. 
*/ - if (unres_cnt > 0) - resolvable = ire_multirt_lookup(&first_cire, &first_fire, - MULTIRT_USESTAMP | MULTIRT_CACHEGW, NULL, tsl, ipst); +nce_t * +ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type) +{ + ASSERT((ire_type & IRE_BROADCAST) == 0); + return (ire_nce_init(ill, addr6, ire_type)); +} - if (first_fire != NULL) - ire_refrele(first_fire); +/* + * The caller should hold irb_lock as a writer if the ire is in a bucket. + */ +void +ire_make_condemned(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + + mutex_enter(&ire->ire_lock); + ASSERT(ire->ire_bucket == NULL || + RW_WRITE_HELD(&ire->ire_bucket->irb_lock)); + ASSERT(!IRE_IS_CONDEMNED(ire)); + ire->ire_generation = IRE_GENERATION_CONDEMNED; + /* Count how many condemned ires for kmem_cache callback */ + atomic_add_32(&ipst->ips_num_ire_condemned, 1); + mutex_exit(&ire->ire_lock); +} - if (first_cire != NULL) - ire_refrele(first_cire); +/* + * Increment the generation avoiding the special condemned value + */ +void +ire_increment_generation(ire_t *ire) +{ + uint_t generation; - return (resolvable); + mutex_enter(&ire->ire_lock); + /* + * Even though the caller has a hold it can't prevent a concurrent + * ire_delete marking the IRE condemned + */ + if (!IRE_IS_CONDEMNED(ire)) { + generation = ire->ire_generation + 1; + if (generation == IRE_GENERATION_CONDEMNED) + generation = IRE_GENERATION_INITIAL; + ASSERT(generation != IRE_GENERATION_VERIFY); + ire->ire_generation = generation; + } + mutex_exit(&ire->ire_lock); } /* - * Explore a forward_table bucket, starting from fire_arg. - * fire_arg MUST be an IRE_HOST entry. - * - * Return B_TRUE and update *ire_arg and *fire_arg - * if at least one resolvable route is found. *ire_arg - * is the IRE entry for *fire_arg's gateway. - * - * Return B_FALSE otherwise (all routes are resolved or - * the remaining unresolved routes are all unresolvable). - * - * The IRE selection relies on a priority mechanism - * driven by the flags passed in by the caller. 
- * The caller, such as ip_newroute_ipif(), can get the most - * relevant ire at each stage of a multiple route resolution. - * - * The rules are: - * - * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE - * ires are preferred for the gateway. This gives the highest - * priority to routes that can be resolved without using - * a resolver. + * Increment ire_generation on all the IRE_MULTICASTs + * Used when the default multicast interface (as determined by + * ill_lookup_multicast) might have changed. * - * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW - * is specified but no IRE_CACHETABLE ire entry for the gateway - * is found, the following rules apply. - * - * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE - * ires for the gateway, that have not been tried since - * a configurable amount of time, are preferred. - * This applies when a resolver must be invoked for - * a missing route, but we don't want to use the resolver - * upon each packet emission. If no such resolver is found, - * B_FALSE is returned. - * The MULTIRT_USESTAMP flag can be combined with - * MULTIRT_CACHEGW. - * - * - if MULTIRT_USESTAMP is not specified in flags, the first - * unresolved but resolvable route is selected. - * - * - Otherwise, there is no resolvable route, and - * B_FALSE is returned. - * - * At last, MULTIRT_SETSTAMP can be specified in flags to - * request the timestamp of unresolvable routes to - * be refreshed. This prevents the useless exploration - * of those routes for a while, when MULTIRT_USESTAMP is used. - * - * The argument already_resolved_count is an output variable to track number - * of already resolved multirt routes. - * - * This only works in the global zone. + * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and + * ill unplumb. 
*/ -boolean_t -ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, - int *already_resolved_count, const ts_label_t *tsl, ip_stack_t *ipst) +void +ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6) { - clock_t delta; - ire_t *best_fire = NULL; - ire_t *best_cire = NULL; - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb = NULL; - irb_t *cirb = NULL; - ire_t *gw_ire; - boolean_t already_resolved; - boolean_t res; - ipaddr_t dst; - ipaddr_t gw; - - ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", - (void *)*ire_arg, (void *)*fire_arg, flags)); - - ASSERT(ire_arg != NULL); - ASSERT(fire_arg != NULL); - - /* Not an IRE_HOST ire; give up. */ - if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { - return (B_FALSE); + ill_t *ill; + ill_walk_context_t ctx; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (isv6) + ill = ILL_START_WALK_V6(&ctx, ipst); + else + ill = ILL_START_WALK_V4(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (ILL_IS_CONDEMNED(ill)) + continue; + if (ill->ill_ire_multicast != NULL) + ire_increment_generation(ill->ill_ire_multicast); } + rw_exit(&ipst->ips_ill_g_lock); +} - /* This is the first IRE_HOST ire for that destination. */ - first_fire = *fire_arg; - firb = first_fire->ire_bucket; - ASSERT(firb != NULL); +/* + * Return a held IRE_NOROUTE with RTF_REJECT set + */ +ire_t * +ire_reject(ip_stack_t *ipst, boolean_t isv6) +{ + ire_t *ire; - dst = first_fire->ire_addr; + if (isv6) + ire = ipst->ips_ire_reject_v6; + else + ire = ipst->ips_ire_reject_v4; - ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); + ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); + ire_refhold(ire); + return (ire); +} - /* - * Retrieve the first IRE_CACHE ire for that destination; - * if we don't find one, no route for that dest is - * resolved yet. 
- */ - first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); - if (first_cire != NULL) { - cirb = first_cire->ire_bucket; - } +/* + * Return a held IRE_NOROUTE with RTF_BLACKHOLE set + */ +ire_t * +ire_blackhole(ip_stack_t *ipst, boolean_t isv6) +{ + ire_t *ire; - ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); + if (isv6) + ire = ipst->ips_ire_blackhole_v6; + else + ire = ipst->ips_ire_blackhole_v4; - /* - * Search for a resolvable route, giving the top priority - * to routes that can be resolved without any call to the resolver. - */ - IRB_REFHOLD(firb); + ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); + ire_refhold(ire); + return (ire); +} + +/* + * Return a held IRE_MULTICAST. + */ +ire_t * +ire_multicast(ill_t *ill) +{ + ire_t *ire = ill->ill_ire_multicast; + + ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED); + if (ire == NULL) + ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6); + else + ire_refhold(ire); + return (ire); +} + +/* + * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK + * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6). + * This can return an RTF_REJECT|RTF_BLACKHOLE. + * The returned IRE is held. + * The assumption is that ip_select_route() has been called and returned the + * IRE (thus ip_select_route would have set up the ire_dep* information.) + * If some IRE is deleteted then ire_dep_remove() will have been called and + * we might not find a nexthop IRE, in which case we return NULL. + */ +ire_t * +ire_nexthop(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; - if (!CLASSD(dst)) { + /* Acquire lock to walk ire_dep_parent */ + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + while (ire != NULL) { + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + goto done; + } /* - * For all multiroute IRE_HOST ires for that destination, - * check if the route via the IRE_HOST's gateway is - * resolved yet. + * If we find an IRE_ONLINK we are done. 
This includes + * the case of IRE_MULTICAST. + * Note that in order to send packets we need a host-specific + * IRE_IF_ALL first in the ire_dep_parent chain. Normally this + * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE + * was not host specific. + * However, ip_rts_request doesn't want to send packets + * hence doesn't want to allocate an IRE_IF_CLONE. Yet + * it needs an IRE_IF_ALL to get to the ill. Thus + * we return IRE_IF_ALL that are not host specific here. */ - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; + if (ire->ire_type & IRE_ONLINK) + goto done; + ire = ire->ire_dep_parent; + } + rw_exit(&ipst->ips_ire_dep_lock); + return (NULL); - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } +done: + ire_refhold(ire); + rw_exit(&ipst->ips_ire_dep_lock); + return (ire); +} - gw = fire->ire_gateway_addr; - - ip2dbg(("ire_multirt_lookup: fire %p, " - "ire_addr %08x, ire_gateway_addr %08x\n", - (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); - - already_resolved = B_FALSE; - - if (first_cire != NULL) { - ASSERT(cirb != NULL); - - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } +/* + * Find the ill used to send packets. This will be NULL in case + * of a reject or blackhole. + * The returned ill is held; caller needs to do ill_refrele when done. + */ +ill_t * +ire_nexthop_ill(ire_t *ire) +{ + ill_t *ill; - /* - * Check if the IRE_CACHE's gateway - * matches the IRE_HOST's gateway. 
- */ - if (cire->ire_gateway_addr == gw) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } + ire = ire_nexthop(ire); + if (ire == NULL) + return (NULL); - /* - * This route is already resolved; - * proceed with next one. - */ - if (already_resolved) { - ip2dbg(("ire_multirt_lookup: found cire %p, " - "already resolved\n", (void *)cire)); + /* ire_ill can not change for an existing ire */ + ill = ire->ire_ill; + if (ill != NULL) + ill_refhold(ill); + ire_refrele(ire); + return (ill); +} - if (already_resolved_count != NULL) - (*already_resolved_count)++; - continue; - } +#ifdef DEBUG +static boolean_t +parent_has_child(ire_t *parent, ire_t *child) +{ + ire_t *ire; + ire_t *prev; - /* - * The route is unresolved; is it actually - * resolvable, i.e. is there a cache or a resolver - * for the gateway? - */ - gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, - ALL_ZONES, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst); + ire = parent->ire_dep_children; + prev = NULL; + while (ire != NULL) { + if (prev == NULL) { + ASSERT(ire->ire_dep_sib_ptpn == + &(parent->ire_dep_children)); + } else { + ASSERT(ire->ire_dep_sib_ptpn == + &(prev->ire_dep_sib_next)); + } + if (ire == child) + return (B_TRUE); + prev = ire; + ire = ire->ire_dep_sib_next; + } + return (B_FALSE); +} - ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", - (void *)gw_ire)); +static void +ire_dep_verify(ire_t *ire) +{ + ire_t *parent = ire->ire_dep_parent; + ire_t *child = ire->ire_dep_children; - /* - * If gw_ire is typed IRE_CACHETABLE, - * this route can be resolved without any call to the - * resolver. If the MULTIRT_CACHEGW flag is set, - * give the top priority to this ire and exit the - * loop. - * This is typically the case when an ARP reply - * is processed through ip_wput_nondata(). 
- */ - if ((flags & MULTIRT_CACHEGW) && - (gw_ire != NULL) && - (gw_ire->ire_type & IRE_CACHETABLE)) { - ASSERT(gw_ire->ire_nce == NULL || - gw_ire->ire_nce->nce_state == ND_REACHABLE); - /* - * Release the resolver associated to the - * previous candidate best ire, if any. - */ - if (best_cire != NULL) { - ire_refrele(best_cire); - ASSERT(best_fire != NULL); - } + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + if (parent != NULL) { + ASSERT(parent->ire_ipversion == IPV4_VERSION || + parent->ire_ipversion == IPV6_VERSION); + ASSERT(parent->ire_refcnt >= 1); + ASSERT(parent_has_child(parent, ire)); + } + if (child != NULL) { + ASSERT(child->ire_ipversion == IPV4_VERSION || + child->ire_ipversion == IPV6_VERSION); + ASSERT(child->ire_dep_parent == ire); + ASSERT(child->ire_dep_sib_ptpn != NULL); + ASSERT(parent_has_child(ire, child)); + } +} +#endif /* DEBUG */ - best_fire = fire; - best_cire = gw_ire; +/* + * Assumes ire_dep_parent is set. Remove this child from its parent's linkage. + */ +void +ire_dep_remove(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + ire_t *parent = ire->ire_dep_parent; + ire_t *next; + nce_t *nce; - ip2dbg(("ire_multirt_lookup: found top prio " - "best_fire %p, best_cire %p\n", - (void *)best_fire, (void *)best_cire)); - break; - } + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + ASSERT(ire->ire_dep_parent != NULL); + ASSERT(ire->ire_dep_sib_ptpn != NULL); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take that - * route into account only if this time interval - * exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. 
- */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t)((delta > - ipst->ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " - "res %d\n", - (void *)fire, delta, res)); - - if (res) { - /* - * We are here if MULTIRT_USESTAMP flag is set - * and the resolver for fire's gateway - * has not been tried since - * ip_multirt_resolution_interval, or if - * MULTIRT_USESTAMP is not set but gw_ire did - * not fill the conditions for MULTIRT_CACHEGW, - * or if neither MULTIRT_USESTAMP nor - * MULTIRT_CACHEGW are set. - */ - if (gw_ire != NULL) { - if (best_fire == NULL) { - ASSERT(best_cire == NULL); - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup:" - "found candidate " - "best_fire %p, " - "best_cire %p\n", - (void *)best_fire, - (void *)best_cire)); - - /* - * If MULTIRT_CACHEGW is not - * set, we ignore the top - * priority ires that can - * be resolved without any - * call to the resolver; - * In that case, there is - * actually no need - * to continue the loop. - */ - if (!(flags & - MULTIRT_CACHEGW)) { - break; - } - continue; - } - } else { - /* - * No resolver for the gateway: the - * route is not resolvable. - * If the MULTIRT_SETSTAMP flag is - * set, we stamp the IRE_HOST ire, - * so we will not select it again - * during this resolution interval. 
- */ - if (flags & MULTIRT_SETSTAMP) - fire->ire_last_used_time = - lbolt; - } - } +#ifdef DEBUG + ire_dep_verify(ire); + ire_dep_verify(parent); +#endif - if (gw_ire != NULL) - ire_refrele(gw_ire); - } - } else { /* CLASSD(dst) */ + next = ire->ire_dep_sib_next; + if (next != NULL) + next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn; - for (fire = first_fire; - fire != NULL; - fire = fire->ire_next) { + ASSERT(*(ire->ire_dep_sib_ptpn) == ire); + *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next; - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; + ire->ire_dep_sib_ptpn = NULL; + ire->ire_dep_sib_next = NULL; - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } + mutex_enter(&ire->ire_lock); + parent = ire->ire_dep_parent; + ire->ire_dep_parent = NULL; + mutex_exit(&ire->ire_lock); - already_resolved = B_FALSE; + /* + * Make sure all our children, grandchildren, etc set + * ire_dep_parent_generation to IRE_GENERATION_VERIFY since + * we can no longer guarantee than the children have a current + * ire_nce_cache and ire_nexthop_ill(). + */ + if (ire->ire_dep_children != NULL) + ire_dep_invalidate_children(ire->ire_dep_children); - gw = fire->ire_gateway_addr; + /* + * Since the parent is gone we make sure we clear ire_nce_cache. + * We can clear it under ire_lock even if the IRE is used + */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + ire->ire_nce_cache = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); - gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, - NULL, NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); +#ifdef DEBUG + ire_dep_verify(ire); + ire_dep_verify(parent); +#endif - /* No resolver for the gateway; we skip this ire. 
*/ - if (gw_ire == NULL) { - continue; - } - ASSERT(gw_ire->ire_nce == NULL || - gw_ire->ire_nce->nce_state == ND_REACHABLE); - - if (first_cire != NULL) { - - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } + ire_refrele_notr(parent); + ire_refrele_notr(ire); +} - /* - * Cache entries are linked to the - * parent routes using the parent handle - * (ire_phandle). If no cache entry has - * the same handle as fire, fire is - * still unresolved. - */ - ASSERT(cire->ire_phandle != 0); - if (cire->ire_phandle == - fire->ire_phandle) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } +/* + * Insert the child in the linkage of the parent + */ +static void +ire_dep_parent_insert(ire_t *child, ire_t *parent) +{ + ip_stack_t *ipst = child->ire_ipst; + ire_t *next; - /* - * This route is already resolved; proceed with - * next one. - */ - if (already_resolved) { - ire_refrele(gw_ire); - if (already_resolved_count != NULL) - (*already_resolved_count)++; - continue; - } + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + ASSERT(child->ire_dep_parent == NULL); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take - * that route into account only if this time - * interval exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. 
- */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t)((delta > - ipst->ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " - "flags %04x, res %d\n", - (void *)fire, delta, flags, res)); - - if (res) { - if (best_cire != NULL) { - /* - * Release the resolver associated - * to the preceding candidate best - * ire, if any. - */ - ire_refrele(best_cire); - ASSERT(best_fire != NULL); - } - best_fire = fire; - best_cire = gw_ire; - continue; - } +#ifdef DEBUG + ire_dep_verify(child); + ire_dep_verify(parent); +#endif + /* No parents => no siblings */ + ASSERT(child->ire_dep_sib_ptpn == NULL); + ASSERT(child->ire_dep_sib_next == NULL); - ire_refrele(gw_ire); - } - } + ire_refhold_notr(parent); + ire_refhold_notr(child); - if (best_fire != NULL) { - IRE_REFHOLD(best_fire); + /* Head insertion */ + next = parent->ire_dep_children; + if (next != NULL) { + ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children)); + child->ire_dep_sib_next = next; + next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next); } - IRB_REFRELE(firb); + parent->ire_dep_children = child; + child->ire_dep_sib_ptpn = &(parent->ire_dep_children); - /* Release the first IRE_CACHE we initially looked up, if any. */ - if (first_cire != NULL) - ire_refrele(first_cire); + mutex_enter(&child->ire_lock); + child->ire_dep_parent = parent; + mutex_exit(&child->ire_lock); - /* Found a resolvable route. */ - if (best_fire != NULL) { - ASSERT(best_cire != NULL); - - if (*fire_arg != NULL) - ire_refrele(*fire_arg); - if (*ire_arg != NULL) - ire_refrele(*ire_arg); +#ifdef DEBUG + ire_dep_verify(child); + ire_dep_verify(parent); +#endif +} - /* - * Update the passed-in arguments with the - * resolvable multirt route we found. 
- */ - *fire_arg = best_fire; - *ire_arg = best_cire; - ip2dbg(("ire_multirt_lookup: returning B_TRUE, " - "*fire_arg %p, *ire_arg %p\n", - (void *)best_fire, (void *)best_cire)); +/* + * Given count worth of ires and generations, build ire_dep_* relationships + * from ires[0] to ires[count-1]. Record generations[i+1] in + * ire_dep_parent_generation for ires[i]. + * We graft onto an existing parent chain by making sure that we don't + * touch ire_dep_parent for ires[count-1]. + * + * We check for any condemned ire_generation count and return B_FALSE in + * that case so that the caller can tear it apart. + * + * Note that generations[0] is not used. Caller handles that. + */ +boolean_t +ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count) +{ + ire_t *ire = ires[0]; + ip_stack_t *ipst; + uint_t i; + ASSERT(count > 0); + if (count == 1) { + /* No work to do */ return (B_TRUE); } + ipst = ire->ire_ipst; + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + /* + * Do not remove the linkage for any existing parent chain i.e., + * ires[count-1] is left alone. + */ + for (i = 0; i < count-1; i++) { + /* Remove existing parent if we need to change it */ + if (ires[i]->ire_dep_parent != NULL && + ires[i]->ire_dep_parent != ires[i+1]) + ire_dep_remove(ires[i]); + } - ASSERT(best_cire == NULL); + for (i = 0; i < count - 1; i++) { + ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || + ires[i]->ire_ipversion == IPV6_VERSION); + /* Does it need to change? */ + if (ires[i]->ire_dep_parent != ires[i+1]) + ire_dep_parent_insert(ires[i], ires[i+1]); - ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " - "*ire_arg %p\n", - (void *)*fire_arg, (void *)*ire_arg)); + mutex_enter(&ires[i+1]->ire_lock); + if (IRE_IS_CONDEMNED(ires[i+1])) { + mutex_exit(&ires[i+1]->ire_lock); + rw_exit(&ipst->ips_ire_dep_lock); + return (B_FALSE); + } + mutex_exit(&ires[i+1]->ire_lock); - /* No resolvable route. 
*/ - return (B_FALSE); + mutex_enter(&ires[i]->ire_lock); + ires[i]->ire_dep_parent_generation = generations[i+1]; + mutex_exit(&ires[i]->ire_lock); + } + rw_exit(&ipst->ips_ire_dep_lock); + return (B_TRUE); } /* - * IRE iterator for inbound and loopback broadcast processing. - * Given an IRE_BROADCAST ire, walk the ires with the same destination - * address, but skip over the passed-in ire. Returns the next ire without - * a hold - assumes that the caller holds a reference on the IRE bucket. + * Given count worth of ires, unbuild ire_dep_* relationships + * from ires[0] to ires[count-1]. */ -ire_t * -ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) +void +ire_dep_unbuild(ire_t *ires[], uint_t count) { - ill_t *ill; + ip_stack_t *ipst; + uint_t i; - if (curr == NULL) { - for (curr = ire->ire_bucket->irb_ire; curr != NULL; - curr = curr->ire_next) { - if (curr->ire_addr == ire->ire_addr) - break; - } - } else { - curr = curr->ire_next; + if (count == 0) { + /* No work to do */ + return; } - ill = ire_to_ill(ire); - for (; curr != NULL; curr = curr->ire_next) { - if (curr->ire_addr != ire->ire_addr) { - /* - * All the IREs to a given destination are contiguous; - * break out once the address doesn't match. - */ - break; - } - if (curr == ire) { - /* skip over the passed-in ire */ - continue; - } - if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || - (curr->ire_stq == NULL && ire->ire_stq != NULL)) { + ipst = ires[0]->ire_ipst; + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + for (i = 0; i < count; i++) { + ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || + ires[i]->ire_ipversion == IPV6_VERSION); + if (ires[i]->ire_dep_parent != NULL) + ire_dep_remove(ires[i]); + mutex_enter(&ires[i]->ire_lock); + ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY; + mutex_exit(&ires[i]->ire_lock); + } + rw_exit(&ipst->ips_ire_dep_lock); +} + +/* + * Both the forwarding and the outbound code paths can trip on + * a condemned NCE, in which case we call this function. 
+ * We have two different behaviors: if the NCE was UNREACHABLE + * it is an indication that something failed. In that case + * we see if we should look for a different IRE (for example, + * delete any matching redirect IRE, or try a different + * IRE_DEFAULT (ECMP)). We mark the ire as bad so a hopefully + * different IRE will be picked next time we send/forward. + * + * If we are called by the output path then fail_if_better is set + * and we return NULL if there could be a better IRE. This is because the + * output path retries the IRE lookup. (The input/forward path can not retry.) + * + * If the NCE was not unreachable then we pick/allocate a + * new (most likely ND_INITIAL) NCE and proceed with it. + * + * ipha/ip6h are needed for multicast packets; ipha needs to be + * set for IPv4 and ip6h needs to be set for IPv6 packets. + */ +nce_t * +ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h, + boolean_t fail_if_better) +{ + if (nce->nce_common->ncec_state == ND_UNREACHABLE) { + if (ire_no_good(ire) && fail_if_better) { /* - * If the passed-in ire is loopback, skip over - * non-loopback ires and vice versa. + * Did some changes, or ECMP likely to exist. + * Make ip_output look for a different IRE */ - continue; + return (NULL); } - if (ire_to_ill(curr) != ill) { - /* skip over IREs going through a different interface */ - continue; + } + if (ire_revalidate_nce(ire) == ENETUNREACH) { + /* The ire_dep_parent chain went bad, or no memory? 
*/ + (void) ire_no_good(ire); + return (NULL); + } + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(ipha != NULL); + nce = ire_to_nce(ire, ipha->ipha_dst, NULL); + } else { + ASSERT(ip6h != NULL); + nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); + } + + if (nce == NULL) + return (NULL); + if (nce->nce_is_condemned) { + nce_refrele(nce); + return (NULL); + } + return (nce); +} + +/* + * The caller has found that the ire is bad, either due to a reference to an NCE + * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved. + * We update things so a subsequent attempt to send to the destination + * is likely to find different IRE, or that a new NCE would be created. + * + * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would + * find a different route (either due to having deleted a redirect, or there + * being ECMP routes.) + * + * If we have a redirect (RTF_DYNAMIC) we delete it. + * Otherwise we increment ire_badcnt and increment the generation number so + * that a cached ixa_ire will redo the route selection. ire_badcnt is taken + * into account in the route selection when we have multiple choices (multiple + * default routes or ECMP in general). + * Any time ip_select_route find an ire with a condemned ire_nce_cache + * (e.g., if no equal cost route to the bad one) ip_select_route will make + * sure the NCE is revalidated to avoid getting stuck on a + * NCE_F_CONDMNED ncec that caused ire_no_good to be called. 
+ */ +boolean_t +ire_no_good(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + ire_t *ire2; + nce_t *nce; + + if (ire->ire_flags & RTF_DYNAMIC) { + ire_delete(ire); + return (B_TRUE); + } + if (ire->ire_flags & RTF_INDIRECT) { + /* Check if next IRE is a redirect */ + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (ire->ire_dep_parent != NULL && + (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) { + ire2 = ire->ire_dep_parent; + ire_refhold(ire2); + } else { + ire2 = NULL; } - if (curr->ire_marks & IRE_MARK_CONDEMNED) { - /* skip over deleted IREs */ - continue; + rw_exit(&ipst->ips_ire_dep_lock); + if (ire2 != NULL) { + ire_delete(ire2); + ire_refrele(ire2); + return (B_TRUE); } - return (curr); } - return (NULL); + /* + * No redirect involved. Increment badcnt so that if we have ECMP + * routes we are likely to pick a different one for the next packet. + * + * If the NCE is unreachable and condemned we should drop the reference + * to it so that a new NCE can be created. + * + * Finally we increment the generation number so that any ixa_ire + * cache will be revalidated. + */ + mutex_enter(&ire->ire_lock); + ire->ire_badcnt++; + ire->ire_last_badcnt = TICK_TO_SEC(lbolt64); + nce = ire->ire_nce_cache; + if (nce != NULL && nce->nce_is_condemned && + nce->nce_common->ncec_state == ND_UNREACHABLE) + ire->ire_nce_cache = NULL; + else + nce = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + + ire_increment_generation(ire); + ire_dep_incr_generation(ire); + + return (ire->ire_bucket->irb_ire_cnt > 1); } -#ifdef DEBUG -void -ire_trace_ref(ire_t *ire) +/* + * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation == + * ire_dep_parent_generation. + * If they all match we just return ire_generation from the topmost IRE. + * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation + * above the mismatch to IRE_GENERATION_VERIFY and also returning + * IRE_GENERATION_VERIFY. 
+ */ +uint_t +ire_dep_validate_generations(ire_t *ire) { - mutex_enter(&ire->ire_lock); - if (ire->ire_trace_disable) { + ip_stack_t *ipst = ire->ire_ipst; + uint_t generation; + ire_t *ire1; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + generation = ire->ire_generation; /* Assuming things match */ + for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) { + ASSERT(ire1->ire_ipversion == IPV4_VERSION || + ire1->ire_ipversion == IPV6_VERSION); + if (ire1->ire_dep_parent == NULL) + break; + if (ire1->ire_dep_parent_generation != + ire1->ire_dep_parent->ire_generation) + goto mismatch; + } + rw_exit(&ipst->ips_ire_dep_lock); + return (generation); + +mismatch: + generation = IRE_GENERATION_VERIFY; + /* Fill from top down to the mismatch with _VERIFY */ + while (ire != ire1) { + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + mutex_enter(&ire->ire_lock); + ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; mutex_exit(&ire->ire_lock); - return; + ire = ire->ire_dep_parent; } + rw_exit(&ipst->ips_ire_dep_lock); + return (generation); +} - if (th_trace_ref(ire, ire->ire_ipst)) { - mutex_exit(&ire->ire_lock); - } else { - ire->ire_trace_disable = B_TRUE; +/* + * Used when we need to return an ire with ire_dep_parent, but we + * know the chain is invalid for instance we didn't create an IRE_IF_CLONE + * Using IRE_GENERATION_VERIFY means that next time we'll redo the + * recursive lookup. 
+ */ +void +ire_dep_invalidate_generations(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + while (ire != NULL) { + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + mutex_enter(&ire->ire_lock); + ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; mutex_exit(&ire->ire_lock); - ire_trace_cleanup(ire); + ire = ire->ire_dep_parent; } + rw_exit(&ipst->ips_ire_dep_lock); } -void -ire_untrace_ref(ire_t *ire) +/* Set _VERIFY ire_dep_parent_generation for all children recursively */ +static void +ire_dep_invalidate_children(ire_t *child) { - mutex_enter(&ire->ire_lock); - if (!ire->ire_trace_disable) - th_trace_unref(ire); - mutex_exit(&ire->ire_lock); + ip_stack_t *ipst = child->ire_ipst; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + /* Depth first */ + if (child->ire_dep_children != NULL) + ire_dep_invalidate_children(child->ire_dep_children); + + while (child != NULL) { + mutex_enter(&child->ire_lock); + child->ire_dep_parent_generation = IRE_GENERATION_VERIFY; + mutex_exit(&child->ire_lock); + child = child->ire_dep_sib_next; + } } static void -ire_trace_cleanup(const ire_t *ire) +ire_dep_increment_children(ire_t *child) { - th_trace_cleanup(ire, ire->ire_trace_disable); + ip_stack_t *ipst = child->ire_ipst; + + ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock)); + /* Depth first */ + if (child->ire_dep_children != NULL) + ire_dep_increment_children(child->ire_dep_children); + + while (child != NULL) { + if (!IRE_IS_CONDEMNED(child)) + ire_increment_generation(child); + child = child->ire_dep_sib_next; + } } -#endif /* DEBUG */ /* - * Generate a message chain with an arp request to resolve the in_ire. - * It is assumed that in_ire itself is currently in the ire cache table, - * so we create a fake_ire filled with enough information about ire_addr etc. - * to retrieve in_ire when the DL_UNITDATA response from the resolver - * comes back. 
The fake_ire itself is created by calling esballoc with - * the fr_rtnp (free routine) set to ire_freemblk. This routine will be - * invoked when the mblk containing fake_ire is freed. + * Walk all the children of this ire recursively and increment their + * generation number. */ void -ire_arpresolve(ire_t *in_ire) +ire_dep_incr_generation(ire_t *parent) { - areq_t *areq; - ipaddr_t *addrp; - mblk_t *ire_mp, *areq_mp; - ire_t *ire, *buf; - size_t bufsize; - frtn_t *frtnp; - ill_t *dst_ill; - ip_stack_t *ipst; + ip_stack_t *ipst = parent->ire_ipst; - ASSERT(in_ire->ire_nce != NULL); + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (parent->ire_dep_children != NULL) + ire_dep_increment_children(parent->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); +} - dst_ill = ire_to_ill(in_ire); - ipst = dst_ill->ill_ipst; +/* + * Get a new ire_nce_cache for this IRE as well as its nexthop. + * Returns zero if it succeeds. Can fail due to lack of memory or when + * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those + * cases. + * + * In the in.mpathd case, the ire will have ire_testhidden + * set; so we should create the ncec for the underlying ill. + * + * Note that the error returned by ire_revalidate_nce() is ignored by most + * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH + * error to mark potentially bad ire's. For all the other callers, an + * error return could indicate a transient condition like ENOMEM, or could + * be the result of an interface that is going down/unplumbing. In the former + * case (transient error), we would leave the old stale ire/ire_nce_cache + * in place, and possibly use incorrect link-layer information to send packets + * but would eventually recover. In the latter case (ill down/replumb), + * ire_revalidate_nce() might return a condemned nce back, but we would then + * recover in the packet output path. 
+ */ +int +ire_revalidate_nce(ire_t *ire) +{ + nce_t *nce, *old_nce; + ire_t *nexthop; /* - * Construct message chain for the resolver - * of the form: - * ARP_REQ_MBLK-->IRE_MBLK - * - * NOTE : If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. We may not be able to REFRELE the bucket - * if the response never comes back. Thus, before - * adding the ire, ire_add_v4 will make sure that the - * interface route does not get deleted. This is the - * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 - * where we can always prevent deletions because of - * the synchronous nature of adding IRES i.e - * ire_add_then_send is called after creating the IRE. + * For multicast we conceptually have an NCE but we don't store it + * in ire_nce_cache; when ire_to_nce is called we allocate the nce. */ + if (ire->ire_type & IRE_MULTICAST) + return (0); - /* - * We use esballoc to allocate the second part (IRE_MBLK) - * of the message chain depicted above. This mblk will be freed - * by arp when there is a timeout, and otherwise passed to IP - * and IP will free it after processing the ARP response. 
- */ + /* ire_testhidden should only be set on under-interfaces */ + ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); - bufsize = sizeof (ire_t) + sizeof (frtn_t); - buf = kmem_alloc(bufsize, KM_NOSLEEP); - if (buf == NULL) { - ip1dbg(("ire_arpresolve: alloc buffer failed\n")); - return; - } - frtnp = (frtn_t *)(buf + 1); - frtnp->free_arg = (caddr_t)buf; - frtnp->free_func = ire_freemblk; - - ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (ire_mp == NULL) { - ip1dbg(("ire_arpresolve: esballoc failed\n")); - kmem_free(buf, bufsize); - return; + nexthop = ire_nexthop(ire); + if (nexthop == NULL) { + /* The route is potentially bad */ + (void) ire_no_good(ire); + return (ENETUNREACH); } + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { + ASSERT(ire->ire_ill != NULL); - areq_mp = copyb(dst_ill->ill_resolver_mp); - if (areq_mp == NULL) { - freemsg(ire_mp); - return; + if (ire->ire_ipversion == IPV4_VERSION) + nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr); + else + nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6); + } else { + ASSERT(nexthop->ire_type & IRE_ONLINK); + if (ire->ire_ipversion == IPV4_VERSION) { + nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr, + nexthop->ire_type); + } else { + nce = ndp_nce_init(nexthop->ire_ill, + &nexthop->ire_addr_v6, nexthop->ire_type); + } + } + if (nce == NULL) { + /* + * Leave the old stale one in place to avoid a NULL + * ire_nce_cache. + */ + ire_refrele(nexthop); + return (ENOMEM); } - ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; - ire = (ire_t *)buf; - /* - * keep enough info in the fake ire so that we can pull up - * the incomplete ire (in_ire) after result comes back from - * arp and make it complete. 
- */ - *ire = ire_null; - ire->ire_u = in_ire->ire_u; - ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; - ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; - ire->ire_ipif = in_ire->ire_ipif; - ire->ire_stq = dst_ill->ill_wq; - ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; - ire->ire_zoneid = in_ire->ire_zoneid; - ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ire->ire_ipst = ipst; - - /* - * ire_freemblk will be called when ire_mp is freed, both for - * successful and failed arp resolution. IRE_MARK_UNCACHED will be set - * when the arp resolution failed. - */ - ire->ire_marks |= IRE_MARK_UNCACHED; - ire->ire_mp = ire_mp; - ire_mp->b_wptr = (uchar_t *)&ire[1]; - ire_mp->b_cont = NULL; - linkb(areq_mp, ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. - * NOTE: this depends on memory layouts imposed by - * ill_init(). - */ - areq = (areq_t *)areq_mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - *addrp = ire->ire_src_addr; - - addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); - if (ire->ire_gateway_addr != INADDR_ANY) { - *addrp = ire->ire_gateway_addr; - } else { - *addrp = ire->ire_addr; + if (nexthop != ire) { + /* Update the nexthop ire */ + mutex_enter(&nexthop->ire_lock); + old_nce = nexthop->ire_nce_cache; + if (!IRE_IS_CONDEMNED(nexthop)) { + nce_refhold(nce); + nexthop->ire_nce_cache = nce; + } else { + nexthop->ire_nce_cache = NULL; + } + mutex_exit(&nexthop->ire_lock); + if (old_nce != NULL) + nce_refrele(old_nce); } + ire_refrele(nexthop); - /* Up to the resolver. 
*/ - if (canputnext(dst_ill->ill_rq)) { - putnext(dst_ill->ill_rq, areq_mp); + mutex_enter(&ire->ire_lock); + old_nce = ire->ire_nce_cache; + if (!IRE_IS_CONDEMNED(ire)) { + nce_refhold(nce); + ire->ire_nce_cache = nce; } else { - freemsg(areq_mp); + ire->ire_nce_cache = NULL; } + mutex_exit(&ire->ire_lock); + if (old_nce != NULL) + nce_refrele(old_nce); + + nce_refrele(nce); + return (0); } /* - * Esballoc free function for AR_ENTRY_QUERY request to clean up any - * unresolved ire_t and/or nce_t structures when ARP resolution fails. - * - * This function can be called by ARP via free routine for ire_mp or - * by IPv4(both host and forwarding path) via ire_delete - * in case ARP resolution fails. - * NOTE: Since IP is MT, ARP can call into IP but not vice versa - * (for IP to talk to ARP, it still has to send AR* messages). - * - * Note that the ARP/IP merge should replace the functioanlity by providing - * direct function calls to clean up unresolved entries in ire/nce lists. + * Get a held nce for a given ire. + * In the common case this is just from ire_nce_cache. + * For IRE_MULTICAST this needs to do an explicit lookup since we do not + * have an IRE_MULTICAST per address. + * Note that this explicitly returns CONDEMNED NCEs. The caller needs those + * so they can check whether the NCE went unreachable (as opposed to was + * condemned for some other reason). */ -void -ire_freemblk(ire_t *ire_mp) +nce_t * +ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop) { - nce_t *nce = NULL; - ill_t *ill; - ip_stack_t *ipst; - netstack_t *ns = NULL; + nce_t *nce; - ASSERT(ire_mp != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + return (NULL); - if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) { - ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n", - (void *)ire_mp)); - goto cleanup; - } - if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) { - goto cleanup; /* everything succeeded. 
just free and return */ + /* ire_testhidden should only be set on under-interfaces */ + ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); + + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce != NULL) { + nce_refhold(nce); + mutex_exit(&ire->ire_lock); + return (nce); } + mutex_exit(&ire->ire_lock); - /* - * the arp information corresponding to this ire_mp was not - * transferred to an ire_cache entry. Need - * to clean up incomplete ire's and nce, if necessary. - */ - ASSERT(ire_mp->ire_stq != NULL); - ASSERT(ire_mp->ire_stq_ifindex != 0); - ASSERT(ire_mp->ire_ipst != NULL); + if (ire->ire_type & IRE_MULTICAST) { + ASSERT(ire->ire_ill != NULL); - ns = netstack_find_by_stackid(ire_mp->ire_stackid); - ipst = (ns ? ns->netstack_ip : NULL); - if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */ - goto cleanup; + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(v6nexthop == NULL); - /* - * Get any nce's corresponding to this ire_mp. We first have to - * make sure that the ill is still around. - */ - ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) || - (ill->ill_state_flags & ILL_CONDEMNED)) { - /* - * ill went away. no nce to clean up. - * Note that the ill_state_flags could be set to - * ILL_CONDEMNED after this point, but if we know - * that it is CONDEMNED now, we just bail out quickly. - */ - if (ill != NULL) - ill_refrele(ill); - goto cleanup; + nce = arp_nce_init(ire->ire_ill, v4nexthop, + ire->ire_type); + } else { + ASSERT(v6nexthop != NULL); + ASSERT(v4nexthop == 0); + nce = ndp_nce_init(ire->ire_ill, v6nexthop, + ire->ire_type); + } + return (nce); } - nce = ndp_lookup_v4(ill, - ((ire_mp->ire_gateway_addr != INADDR_ANY) ? - &ire_mp->ire_gateway_addr : &ire_mp->ire_addr), - B_FALSE); - ill_refrele(ill); + return (NULL); +} - if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) { - /* - * some incomplete nce was found. 
- */ - DTRACE_PROBE2(ire__freemblk__arp__resolv__fail, - nce_t *, nce, ire_t *, ire_mp); - /* - * Send the icmp_unreachable messages for the queued mblks in - * ire->ire_nce->nce_qd_mp, since ARP resolution failed - * for this ire - */ - arp_resolv_failed(nce); - /* - * Delete the nce and clean up all ire's pointing at this nce - * in the cachetable - */ - ndp_delete(nce); - } - if (nce != NULL) - NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */ +nce_t * +ire_to_nce_pkt(ire_t *ire, mblk_t *mp) +{ + ipha_t *ipha; + ip6_t *ip6h; -cleanup: - if (ns != NULL) - netstack_rele(ns); - /* - * Get rid of the ire buffer - * We call kmem_free here(instead of ire_delete()), since - * this is the freeb's callback. - */ - kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t)); + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha = (ipha_t *)mp->b_rptr; + return (ire_to_nce(ire, ipha->ipha_dst, NULL)); + } else { + ip6h = (ip6_t *)mp->b_rptr; + return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst)); + } } /* - * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and - * non-loopback IRE_BROADCAST ire's. - * - * If a neighbor-cache entry has to be created (i.e., one does not already - * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache - * entry are initialized in ndp_add_v4(). These values are picked from - * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the - * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values - * determine the {nce_state, nce_res_mp} of the nce_t created. All - * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp - * is set to the ill_bcast_mp of the outgoing inerface. For unicast ire - * entries, - * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created - * nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state. 
- * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link - * layer resolution is necessary, so that the nce_t will be in the - * ND_REACHABLE state and the nce_res_mp will have a copy of the - * ill_resolver_mp of the outgoing interface. - * - * The link layer information needed for broadcast addresses, and for - * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that - * never needs re-verification for the lifetime of the nce_t. These are - * therefore marked NCE_F_PERMANENT, and never allowed to expire via - * NCE_EXPIRED. - * - * IRE_CACHE ire's contain the information for the nexthop (ire_gateway_addr) - * in the case of indirect routes, and for the dst itself (ire_addr) in the - * case of direct routes, with the nce_res_mp containing a template - * DL_UNITDATA request. - * - * The actual association of the ire_nce to the nce created here is - * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions - * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which - * the ire_nce assignment is done in ire_add_then_send. + * Given an IRE_INTERFACE (that matches more than one address) create + * and return an IRE_IF_CLONE for the specific address. + * Return the generation number. + * Returns NULL is no memory for the IRE. + * Handles both IPv4 and IPv6. 
*/ -int -ire_nce_init(ire_t *ire, nce_t *src_nce) +ire_t * +ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp) { - in_addr_t addr4; - int err; - nce_t *nce = NULL; - ill_t *ire_ill; - uint16_t nce_flags = 0; - ip_stack_t *ipst; - - if (ire->ire_stq == NULL) - return (0); /* no need to create nce for local/loopback */ - - switch (ire->ire_type) { - case IRE_CACHE: - if (ire->ire_gateway_addr != INADDR_ANY) - addr4 = ire->ire_gateway_addr; /* 'G' route */ - else - addr4 = ire->ire_addr; /* direct route */ - break; - case IRE_BROADCAST: - addr4 = ire->ire_addr; - nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST); - break; - default: - return (0); + ire_t *ire; + ire_t *nire; + + if (ire_if->ire_ipversion == IPV4_VERSION) { + ipaddr_t v4addr; + ipaddr_t mask = IP_HOST_MASK; + + ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); + IN6_V4MAPPED_TO_IPADDR(addr, v4addr); + + ire = ire_create( + (uchar_t *)&v4addr, /* dest address */ + (uchar_t *)&mask, /* mask */ + (uchar_t *)&ire_if->ire_gateway_addr, + IRE_IF_CLONE, /* IRE type */ + ire_if->ire_ill, + ire_if->ire_zoneid, + ire_if->ire_flags | RTF_HOST, + NULL, /* No security attr for IRE_IF_ALL */ + ire_if->ire_ipst); + } else { + ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); + ire = ire_create_v6( + addr, /* dest address */ + &ipv6_all_ones, /* mask */ + &ire_if->ire_gateway_addr_v6, /* gateway addr */ + IRE_IF_CLONE, /* IRE type */ + ire_if->ire_ill, + ire_if->ire_zoneid, + ire_if->ire_flags | RTF_HOST, + NULL, /* No security attr for IRE_IF_ALL */ + ire_if->ire_ipst); } + if (ire == NULL) + return (NULL); - /* - * ire_ipif is picked based on RTF_SETSRC, usesrc etc. - * rules in ire_forward_src_ipif. We want the dlureq_mp - * for the outgoing interface, which we get from the ire_stq. - */ - ire_ill = ire_to_ill(ire); - ipst = ire_ill->ill_ipst; - - /* - * IRE_IF_NORESOLVER entries never need re-verification and - * do not expire, so we mark them as NCE_F_PERMANENT. 
- */ - if (ire_ill->ill_net_type == IRE_IF_NORESOLVER) - nce_flags |= NCE_F_PERMANENT; - -retry_nce: - err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags, - &nce, src_nce); + /* Take the metrics, in particular the mtu, from the IRE_IF */ + ire->ire_metrics = ire_if->ire_metrics; - if (err == EEXIST && NCE_EXPIRED(nce, ipst)) { - /* - * We looked up an expired nce. - * Go back and try to create one again. - */ - ndp_delete(nce); - NCE_REFRELE(nce); - nce = NULL; - goto retry_nce; - } + nire = ire_add(ire); + if (nire == NULL) /* Some failure */ + return (NULL); - ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n", - (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err)); + if (generationp != NULL) + *generationp = nire->ire_generation; - switch (err) { - case 0: - case EEXIST: - /* - * return a pointer to a newly created or existing nce_t; - * note that the ire-nce mapping is many-one, i.e., - * multiple ire's could point to the same nce_t. - */ - break; - default: - DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err); - return (EINVAL); - } /* - * IRE_BROADCAST ire's must be linked to NCE_F_BCAST nce's and - * vice-versa (IRE_CACHE <-> unicast nce entries). We may have found an - * existing unicast (or bcast) nce when trying to add a BROADCAST (or - * unicast) ire, e.g., when address/netmask modifications were in - * progress, and the ipif_ndp_down() call to quiesce existing state - * during the addr/mask modification may have skipped the ndp_delete() - * because the ipif being affected was not the last one on the ill. We - * recover from the missed ndp_delete() now, by deleting the old nce and - * adding a new one with the correct NCE_F_BCAST state. + * Make sure races don't add a duplicate by + * catching the case when an identical was returned. 
*/ - if (ire->ire_type == IRE_BROADCAST) { - if ((nce->nce_flags & NCE_F_BCAST) == 0) { - /* IRE_BROADCAST needs NCE_F_BCAST */ - ndp_delete(nce); - NCE_REFRELE(nce); - goto retry_nce; - } - /* - * Two bcast ires are created for each interface; - * 1. loopback copy (which does not have an - * ire_stq, and therefore has no ire_nce), and, - * 2. the non-loopback copy, which has the nce_res_mp - * initialized to a copy of the ill_bcast_mp, and - * is marked as ND_REACHABLE at this point. - * This nce does not undergo any further state changes, - * and exists as long as the interface is plumbed. - * Note: the assignment of ire_nce here is a historical - * artifact of old code that used to inline ire_add(). - */ - ire->ire_nce = nce; - /* - * We are associating this nce to the ire, - * so change the nce ref taken in - * ndp_lookup_then_add_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } else { - if ((nce->nce_flags & NCE_F_BCAST) != 0) { - /* IRE_CACHE needs unicast nce */ - ndp_delete(nce); - NCE_REFRELE(nce); - goto retry_nce; - } - /* - * We are not using this nce_t just yet so release - * the ref taken in ndp_lookup_then_add_v4() - */ - NCE_REFRELE(nce); + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); } - return (0); + return (nire); } /* - * This is the implementation of the IPv4 IRE cache lookup procedure. - * Separating the interface from the implementation allows additional - * flexibility when specifying search criteria. + * The argument is an IRE_INTERFACE. Delete all of IRE_IF_CLONE in the + * ire_dep_children (just walk the ire_dep_sib_next since they are all + * immediate children.) + * Since we hold a lock while we remove them we need to defer the actual + * calls to ire_delete() until we have dropped the lock. This makes things + * less efficient since we restart at the top after dropping the lock. But + * we only run when an IRE_INTERFACE is deleted which is infrquent. 
+ * + * Note that ire_dep_children can be any mixture of offlink routes and + * IRE_IF_CLONE entries. */ -static ire_t * -ip4_ctable_lookup_impl(ire_ctable_args_t *margs) +void +ire_dep_delete_if_clone(ire_t *parent) { - irb_t *irb_ptr; - ire_t *ire; - ip_stack_t *ipst = margs->ict_ipst; + ip_stack_t *ipst = parent->ire_ipst; + ire_t *child, *next; - if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && - (margs->ict_ipif == NULL)) { - return (NULL); +restart: + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (parent->ire_dep_children == NULL) { + rw_exit(&ipst->ips_ire_dep_lock); + return; } - - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( - *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - ASSERT(ire->ire_mask == IP_HOST_MASK); - if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr), - ire->ire_mask, *((ipaddr_t *)margs->ict_gateway), - margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, - margs->ict_tsl, margs->ict_flags, margs->ict_wq)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); + child = parent->ire_dep_children; + while (child != NULL) { + next = child->ire_dep_sib_next; + if ((child->ire_type & IRE_IF_CLONE) && + !IRE_IS_CONDEMNED(child)) { + ire_refhold(child); + rw_exit(&ipst->ips_ire_dep_lock); + ire_delete(child); + ASSERT(IRE_IS_CONDEMNED(child)); + ire_refrele(child); + goto restart; } + child = next; } - - rw_exit(&irb_ptr->irb_lock); - return (NULL); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * This function locates IRE_CACHE entries which were added by the - * ire_forward() path. We can fully specify the IRE we are looking for by - * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). 
+ * ire_pref() is used in recursive route-resolution for a destination to + * determine the preference of an ire, where "preference" is determined + * based on the level of indirection to the destination of the ire. + * A higher preference indicates that fewer lookups are needed to complete + * recursive route lookup. Thus + * ire_pref(RTF_INDIRECT) < ire_pref(IRE_IF_RESOLVER) < ire_pref(IRE_PREF_CLONE) */ -ire_t * -ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, - zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq) -{ - ire_ctable_args_t margs; - - margs.ict_addr = &addr; - margs.ict_gateway = &gw; - margs.ict_type = IRE_CACHE; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = NULL; - margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY | - MATCH_IRE_TYPE | MATCH_IRE_WQ; - margs.ict_ipst = ipst; - margs.ict_wq = wq; - - return (ip4_ctable_lookup_impl(&margs)); +int +ire_pref(ire_t *ire) +{ + if (ire->ire_flags & RTF_INDIRECT) + return (1); + if (ire->ire_type & IRE_OFFLINK) + return (2); + if (ire->ire_type & (IRE_IF_RESOLVER|IRE_IF_NORESOLVER)) + return (3); + if (ire->ire_type & IRE_IF_CLONE) + return (4); + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) + return (5); + return (-1); /* unknown ire_type */ } diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index 5418c2d8d4..41f4f3f221 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -1,8 +1,4 @@ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * CDDL HEADER START * * The contents of this file are subject to the terms of the @@ -23,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
*/ /* Copyright (c) 1990 Mentat Inc. */ @@ -65,6 +61,7 @@ #include <netinet/in.h> #include <net/if_dl.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/nd.h> @@ -79,6 +76,7 @@ #include <netinet/ip_mroute.h> #include <inet/ip_multi.h> #include <inet/ip_ire.h> +#include <inet/ip_ndp.h> #include <inet/ip_if.h> #include <inet/ipclassifier.h> @@ -98,7 +96,7 @@ * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates * that vif is being initalized. * Each structure is freed when the refcnt goes down to zero. If a delete comes - * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED + * in when the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED * which prevents the struct from further use. When the refcnt goes to zero * the struct is freed and is marked VIF_MARK_NOTINUSE. * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill @@ -171,9 +169,9 @@ /* Function declarations */ static int add_mfc(struct mfcctl *, ip_stack_t *); -static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); +static int add_vif(struct vifctl *, conn_t *, ip_stack_t *); static int del_mfc(struct mfcctl *, ip_stack_t *); -static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); +static int del_vif(vifi_t *, ip_stack_t *); static void del_vifp(struct vif *); static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); static void expire_upcalls(void *); @@ -188,7 +186,7 @@ static int ip_mdq(mblk_t *, ipha_t *, ill_t *, ipaddr_t, struct mfc *); static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); -static int register_mforward(queue_t *, mblk_t *, ill_t *); +static int register_mforward(mblk_t *, ip_recv_attr_t *); static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); static int set_assert(int *, ip_stack_t *); @@ -331,10 +329,9 @@ static ipha_t 
multicast_encap_iphdr = { * Handle MRT setsockopt commands to modify the multicast routing tables. */ int -ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, - int datalen, mblk_t *first_mp) +ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, + int datalen) { - conn_t *connp = Q_TO_CONN(q); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; mutex_enter(&ipst->ips_ip_g_mrouter_mutex); @@ -376,11 +373,9 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, switch (cmd) { case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); - case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); - case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, - first_mp, ipst)); - case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, - ipst)); + case MRT_DONE: return (ip_mrouter_done(ipst)); + case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); + case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); case MRT_ASSERT: return (set_assert((int *)data, ipst)); @@ -392,9 +387,8 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, * Handle MRT getsockopt commands */ int -ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) +ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) { - conn_t *connp = Q_TO_CONN(q); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; if (connp != ipst->ips_ip_g_mrouter) @@ -611,7 +605,7 @@ ip_mrouter_stack_init(ip_stack_t *ipst) * Didn't use global timeout_val (BSD version), instead check the mfctable. 
*/ int -ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) +ip_mrouter_done(ip_stack_t *ipst) { conn_t *mrouter; vifi_t vifi; @@ -665,47 +659,19 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) /* Phyint only */ if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ipif_t *ipif = vifp->v_ipif; - ipsq_t *ipsq; - boolean_t suc; - ill_t *ill; + ilm_t *ilm = vifp->v_ilm; - ill = ipif->ipif_ill; - suc = B_FALSE; - if (mp == NULL) { - /* - * being called from ip_close, - * lets do it synchronously. - * Clear VIF_MARK_GOOD and - * set VIF_MARK_CONDEMNED. - */ - vifp->v_marks &= ~VIF_MARK_GOOD; - vifp->v_marks |= VIF_MARK_CONDEMNED; - mutex_exit(&(vifp)->v_lock); - suc = ipsq_enter(ill, B_FALSE, NEW_OP); - ipsq = ill->ill_phyint->phyint_ipsq; - } else { - ipsq = ipsq_try_enter(ipif, NULL, - mrouter->conn_wq, mp, - ip_restart_optmgmt, NEW_OP, B_TRUE); - if (ipsq == NULL) { - mutex_exit(&(vifp)->v_lock); - ipif_refrele(ipif); - return (EINPROGRESS); - } - /* - * Clear VIF_MARK_GOOD and - * set VIF_MARK_CONDEMNED. - */ - vifp->v_marks &= ~VIF_MARK_GOOD; - vifp->v_marks |= VIF_MARK_CONDEMNED; - mutex_exit(&(vifp)->v_lock); - suc = B_TRUE; - } + vifp->v_ilm = NULL; + vifp->v_marks &= ~VIF_MARK_GOOD; + vifp->v_marks |= VIF_MARK_CONDEMNED; - if (suc) { - (void) ip_delmulti(INADDR_ANY, ipif, - B_TRUE, B_TRUE); - ipsq_exit(ipsq); + mutex_exit(&(vifp)->v_lock); + if (ilm != NULL) { + ill_t *ill = ipif->ipif_ill; + + (void) ip_delmulti(ilm); + ASSERT(ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ill->ill_mrouter_cnt); } mutex_enter(&vifp->v_lock); } @@ -866,14 +832,15 @@ lock_good_vif(struct vif *vifp) * Add a vif to the vif table. 
*/ static int -add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) +add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) { struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; ipif_t *ipif; - int error; + int error = 0; struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; - ipsq_t *ipsq; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ilm_t *ilm; + ill_t *ill; ASSERT(connp != NULL); @@ -913,28 +880,12 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) mutex_exit(&vifp->v_lock); /* Find the interface with the local address */ ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, - connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, - ip_restart_optmgmt, &error, ipst); + IPCL_ZONEID(connp), ipst); if (ipif == NULL) { VIF_REFRELE(vifp); - if (error == EINPROGRESS) - return (error); return (EADDRNOTAVAIL); } - /* - * We have to be exclusive as we have to call ip_addmulti() - * This is the best position to try to be exclusive in case - * we have to wait. 
- */ - ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, - ip_restart_optmgmt, NEW_OP, B_TRUE); - if ((ipsq) == NULL) { - VIF_REFRELE(vifp); - ipif_refrele(ipif); - return (EINPROGRESS); - } - if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "add_vif: src 0x%x enter", @@ -959,7 +910,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) "add_vif: source route tunnels not supported\n"); VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (EOPNOTSUPP); } vifp->v_rmt_addr = vifcp->vifc_rmt_addr; @@ -981,7 +931,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) mutex_exit(&ipst->ips_numvifs_mutex); VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (EADDRINUSE); } } @@ -995,22 +944,39 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) ipst->ips_reg_vif_num = ALL_VIFS; mutex_exit(&ipst->ips_numvifs_mutex); } - ipsq_exit(ipsq); return (EOPNOTSUPP); } /* Enable promiscuous reception of all IP mcasts from the if */ mutex_exit(&vifp->v_lock); - error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, - MODE_IS_EXCLUDE, NULL); + + ill = ipif->ipif_ill; + if (IS_UNDER_IPMP(ill)) + ill = ipmp_ill_hold_ipmp_ill(ill); + + if (ill == NULL) { + ilm = NULL; + } else { + ilm = ip_addmulti(&ipv6_all_zeros, ill, + ipif->ipif_zoneid, &error); + if (ilm != NULL) + atomic_inc_32(&ill->ill_mrouter_cnt); + if (IS_UNDER_IPMP(ipif->ipif_ill)) { + ill_refrele(ill); + ill = ipif->ipif_ill; + } + } + mutex_enter(&vifp->v_lock); /* * since we released the lock lets make sure that * ip_mrouter_done() has not been called. 
*/ - if (error != 0 || is_mrouter_off(ipst)) { - if (error == 0) - (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, - B_TRUE); + if (ilm == NULL || is_mrouter_off(ipst)) { + if (ilm != NULL) { + (void) ip_delmulti(ilm); + ASSERT(ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ill->ill_mrouter_cnt); + } if (vifcp->vifc_flags & VIFF_REGISTER) { mutex_enter(&ipst->ips_numvifs_mutex); ipst->ips_reg_vif_num = ALL_VIFS; @@ -1018,9 +984,9 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) } VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (error?error:EINVAL); } + vifp->v_ilm = ilm; } /* Define parameters for the tbf structure */ vifp->v_tbf = v_tbf; @@ -1063,7 +1029,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) vifp->v_marks = VIF_MARK_GOOD; mutex_exit(&vifp->v_lock); - ipsq_exit(ipsq); return (0); } @@ -1131,10 +1096,9 @@ del_vifp(struct vif *vifp) } static int -del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) +del_vif(vifi_t *vifip, ip_stack_t *ipst) { struct vif *vifp = ipst->ips_vifs + *vifip; - ipsq_t *ipsq; if (*vifip >= ipst->ips_numvifs) return (EINVAL); @@ -1151,41 +1115,6 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) return (EADDRNOTAVAIL); } - /* - * This is an optimization, if first_mp == NULL - * than we are being called from reset_mrt_vif_ipif() - * so we already have exclusive access to the ipsq. - * the ASSERT below is a check for this condition. - */ - if (first_mp != NULL && - !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { - ASSERT(connp != NULL); - /* - * We have to be exclusive as we have to call ip_delmulti() - * This is the best position to try to be exclusive in case - * we have to wait. 
- */ - ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), - first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); - if ((ipsq) == NULL) { - mutex_exit(&vifp->v_lock); - return (EINPROGRESS); - } - /* recheck after being exclusive */ - if (vifp->v_lcl_addr.s_addr == 0 || - !vifp->v_marks & VIF_MARK_GOOD) { - /* - * someone beat us. - */ - mutex_exit(&vifp->v_lock); - ipsq_exit(ipsq); - return (EADDRNOTAVAIL); - } - } - - - ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); - /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ vifp->v_marks &= ~VIF_MARK_GOOD; vifp->v_marks |= VIF_MARK_CONDEMNED; @@ -1193,18 +1122,30 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) /* Phyint only */ if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ipif_t *ipif = vifp->v_ipif; + ilm_t *ilm = vifp->v_ilm; + + vifp->v_ilm = NULL; + ASSERT(ipif != NULL); /* * should be OK to drop the lock as we * have marked this as CONDEMNED. */ mutex_exit(&(vifp)->v_lock); - (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); - if (first_mp != NULL) - ipsq_exit(ipsq); + if (ilm != NULL) { + (void) ip_delmulti(ilm); + ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt); + } mutex_enter(&(vifp)->v_lock); } + if (vifp->v_flags & VIFF_REGISTER) { + mutex_enter(&ipst->ips_numvifs_mutex); + ipst->ips_reg_vif_num = ALL_VIFS; + mutex_exit(&ipst->ips_numvifs_mutex); + } + /* * decreases the refcnt added in add_vif. 
*/ @@ -1584,16 +1525,21 @@ del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) * 1 - pkt came in on tunnel */ int -ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) +ip_mforward(mblk_t *mp, ip_recv_attr_t *ira) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ill_t *ill = ira->ira_ill; struct mfc *rt; ipaddr_t src, dst, tunnel_src = 0; static int srctun = 0; vifi_t vifi; boolean_t pim_reg_packet = B_FALSE; - struct mfcb *mfcbp; + struct mfcb *mfcbp; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == msgdsize(mp)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -1603,10 +1549,10 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) } dst = ipha->ipha_dst; - if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER) + if (ira->ira_flags & IRAF_PIM_REGISTER) pim_reg_packet = B_TRUE; - else - tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev; + else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET) + tunnel_src = ira->ira_mroute_tunnel; /* * Don't forward a packet with time-to-live of zero or one, @@ -1620,7 +1566,6 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) " dst 0x%x ill %s", ipha->ipha_ttl, ntohl(dst), ill->ill_name); } - mp->b_prev = NULL; if (tunnel_src != 0) return (1); else @@ -1630,10 +1575,8 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) if ((tunnel_src != 0) || pim_reg_packet) { /* * Packet arrived over an encapsulated tunnel or via a PIM - * register message. Both ip_mroute_decap() and pim_input() - * encode information in mp->b_prev. + * register message. 
*/ - mp->b_prev = NULL; if (ipst->ips_ip_mrtdebug > 1) { if (tunnel_src != 0) { (void) mi_strlog(mrouter->conn_rq, 1, @@ -1926,10 +1869,16 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) mutex_exit(&mfc_rt->mfc_mutex); mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + ira->ira_ill = ira->ira_rill = NULL; + (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } else { mutex_exit(&mfc_rt->mfc_mutex); mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mforward - upcall already waiting", + mp_copy, ill); freemsg(mp_copy); } @@ -1945,8 +1894,11 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) mi_free((char *)mfc_rt); if (rte != NULL) mi_free((char *)rte); - if (mp_copy != NULL) + if (mp_copy != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mforward error", mp_copy, ill); freemsg(mp_copy); + } if (mp0 != NULL) freemsg(mp0); return (-1); @@ -2023,7 +1975,6 @@ static int ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, struct mfc *rt) { - ill_t *vill; vifi_t vifi; struct vif *vifp; ipaddr_t dst = ipha->ipha_dst; @@ -2031,6 +1982,7 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, vifi_t num_of_vifs; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ip_recv_attr_t iras; if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2091,19 +2043,19 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, * Don't forward if it didn't arrive from the parent vif for its * origin. 
*/ - vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill; - if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) || + if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) || (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { /* Came in the wrong interface */ ip1dbg(("ip_mdq: arrived wrong if, vifi %d " "numvifs %d ill %s viftable ill %s\n", (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, - vill->ill_name)); + ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "ip_mdq: arrived wrong if, vifi %d ill " "%s viftable ill %s\n", - (int)vifi, ill->ill_name, vill->ill_name); + (int)vifi, ill->ill_name, + ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); } ipst->ips_mrtstat->mrts_wrong_if++; rt->mfc_wrong_if++; @@ -2137,7 +2089,14 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, im->im_mbz = 0; im->im_vif = (ushort_t)vifi; /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ip_hdr_length = + IPH_HDR_LENGTH(mp_copy->b_rptr); + iras.ira_pktlen = msgdsize(mp_copy); + (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } unlock_good_vif(&ipst->ips_vifs[vifi]); if (tunnel_src != 0) @@ -2239,8 +2198,10 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) struct igmpmsg *im; mblk_t *mp_copy; ipha_t *ipha_copy; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ip_recv_attr_t iras; if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2307,16 +2268,24 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) im->im_mbz = 0; ++ipst->ips_mrtstat->mrts_upcalls; - if (!canputnext(mrouter->conn_rq)) { + if (IPCL_IS_NONSTR(mrouter) ? 
mrouter->conn_flow_cntrld : + !canputnext(mrouter->conn_rq)) { ++ipst->ips_mrtstat->mrts_pim_regsend_drops; if (ipst->ips_ip_mrtdebug > 3) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "register_send: register upcall failure."); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); freemsg(mp_copy); } else { /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ip_hdr_length = sizeof (ipha_t); + iras.ira_pktlen = msgdsize(mp_copy); + (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } } @@ -2349,18 +2318,22 @@ pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) } /* - * int - * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. - * IP Protocol 103. Register messages are decapsulated and sent - * onto multicast forwarding. + * Process PIM protocol packets i.e. IP Protocol 103. + * Register messages are decapsulated and sent onto multicast forwarding. + * + * Return NULL for a bad packet that is discarded here. + * Return mp if the message is OK and should be handed to "raw" receivers. + * Callers of pim_input() may need to reinitialize variables that were copied + * from the mblk as this calls pullupmsg(). 
*/ -int -pim_input(queue_t *q, mblk_t *mp, ill_t *ill) +mblk_t * +pim_input(mblk_t *mp, ip_recv_attr_t *ira) { ipha_t *eip, *ip; int iplen, pimlen, iphlen; struct pim *pimp; /* pointer to a pim struct */ uint32_t *reghdr; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; @@ -2369,8 +2342,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) */ if (pullupmsg(mp, -1) == 0) { ++ipst->ips_mrtstat->mrts_pim_nomemory; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_nomemory", mp, ill); freemsg(mp); - return (-1); + return (NULL); } ip = (ipha_t *)mp->b_rptr; @@ -2387,8 +2362,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: length not at least minlen"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_malformed", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* @@ -2405,8 +2382,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: unknown version of PIM"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_badversion", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* @@ -2418,12 +2397,14 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: invalid checksum"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("pim_rcv_badcsum", mp, ill); freemsg(mp); - return (-1); + return (NULL); } if (pimp->pim_type != PIM_REGISTER) - return (0); + return (mp); reghdr = (uint32_t *)(pimp + 1); eip = (ipha_t *)(reghdr + 1); @@ -2437,8 +2418,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: Inner pkt not mcast .. 
!"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_badregisters", mp, ill); freemsg(mp); - return (-1); + return (NULL); } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2450,27 +2433,36 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) /* * If the null register bit is not set, decapsulate * the packet before forwarding it. + * Avoid this in no register vif */ - if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { + if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && + ipst->ips_reg_vif_num != ALL_VIFS) { mblk_t *mp_copy; + uint_t saved_pktlen; /* Copy the message */ if ((mp_copy = copymsg(mp)) == NULL) { ++ipst->ips_mrtstat->mrts_pim_nomemory; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_nomemory", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* * Decapsulate the packet and give it to * register_mforward. */ - mp_copy->b_rptr += iphlen + sizeof (pim_t) + - sizeof (*reghdr); - if (register_mforward(q, mp_copy, ill) != 0) { + mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); + saved_pktlen = ira->ira_pktlen; + ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); + if (register_mforward(mp_copy, ira) != 0) { + /* register_mforward already called ip_drop_input */ freemsg(mp); - return (-1); + ira->ira_pktlen = saved_pktlen; + return (NULL); } + ira->ira_pktlen = saved_pktlen; } /* @@ -2478,7 +2470,7 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) * PIM socket. For Solaris it is done right after pim_input() is * called. */ - return (0); + return (mp); } /* @@ -2486,38 +2478,52 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) * the packet. Loop back the packet, as if we have received it. * In pim_input() we have to check if the destination is a multicast address. 
*/ -/* ARGSUSED */ static int -register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) +register_mforward(mblk_t *mp, ip_recv_attr_t *ira) { + ire_t *ire; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); if (ipst->ips_ip_mrtdebug > 3) { - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "register_mforward: src %x, dst %x\n", ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); } /* * Need to pass in to ip_mforward() the information that the - * packet has arrived on the register_vif. We use the solution that - * ip_mroute_decap() employs: use mp->b_prev to pass some information - * to ip_mforward(). Nonzero value means the packet has arrived on a - * tunnel (ip_mroute_decap() puts the address of the other side of the - * tunnel there.) This is safe since ip_rput() either frees the packet - * or passes it to ip_mforward(). We use - * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the - * register vif. If in the future we have more than one register vifs, - * then this will need re-examination. + * packet has arrived on the register_vif. We mark it with + * the IRAF_PIM_REGISTER attribute. + * pim_input verified that the (inner) destination is multicast, + * hence we skip the generic code in ip_input. 
*/ - mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; + ira->ira_flags |= IRAF_PIM_REGISTER; ++ipst->ips_mrtstat->mrts_pim_regforwards; - ip_rput(q, mp); + + if (!CLASSD(ipha->ipha_dst)) { + ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, + ira->ira_tsl, MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, + NULL, NULL); + } else { + ire = ire_multicast(ill); + } + ASSERT(ire != NULL); + /* Normally this will return the IRE_MULTICAST */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim RTF_REJECT", mp, ill); + freemsg(mp); + ire_refrele(ire); + return (-1); + } + ASSERT(ire->ire_type & IRE_MULTICAST); + (*ire->ire_recvfn)(ire, mp, ipha, ira); + ire_refrele(ire); + return (0); } @@ -2575,6 +2581,8 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + ipha_copy->ipha_ttl = ipha->ipha_ttl; + if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); @@ -2587,21 +2595,53 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) } /* - * De-encapsulate a packet and feed it back through IP input. + * De-encapsulate a packet and feed it back through IP input if it + * matches one of our multicast tunnels. + * * This routine is called whenever IP gets a packet with prototype - * IPPROTO_ENCAP and a local destination address. + * IPPROTO_ENCAP and a local destination address and the packet didn't + * match one of our configured IP-in-IP tunnels. 
*/ void -ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) +ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) { ipha_t *ipha = (ipha_t *)mp->b_rptr; ipha_t *ipha_encap; int hlen = IPH_HDR_LENGTH(ipha); + int hlen_encap; ipaddr_t src; struct vif *vifp; + ire_t *ire; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + /* Make sure we have all of the inner header */ + ipha_encap = (ipha_t *)((char *)ipha + hlen); + if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { + ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); + if (ipha == NULL) { + ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: too short", mp, ill); + freemsg(mp); + return; + } + ipha_encap = (ipha_t *)((char *)ipha + hlen); + } + hlen_encap = IPH_HDR_LENGTH(ipha_encap); + if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { + ipha = ip_pullup(mp, hlen + hlen_encap, ira); + if (ipha == NULL) { + ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: too short", mp, ill); + freemsg(mp); + return; + } + ipha_encap = (ipha_t *)((char *)ipha + hlen); + } + /* * Dump the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. @@ -2609,10 +2649,11 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). 
*/ - ipha_encap = (ipha_t *)((char *)ipha + hlen); if (!CLASSD(ipha_encap->ipha_dst)) { ipst->ips_mrtstat->mrts_bad_tunnel++; ip1dbg(("ip_mroute_decap: bad tunnel\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_bad_tunnel", mp, ill); freemsg(mp); return; } @@ -2648,6 +2689,8 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) if ((vifp = ipst->ips_last_encap_vif) == 0) { mutex_exit(&ipst->ips_last_encap_lock); ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_bad_tunnel", mp, ill); freemsg(mp); ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); @@ -2657,14 +2700,43 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) /* * Need to pass in the tunnel source to ip_mforward (so that it can - * verify that the packet arrived over the correct vif.) We use b_prev - * to pass this information. This is safe since the ip_rput either - * frees the packet or passes it to ip_mforward. + * verify that the packet arrived over the correct vif.) */ - mp->b_prev = (mblk_t *)(uintptr_t)src; + ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; + ira->ira_mroute_tunnel = src; mp->b_rptr += hlen; - /* Feed back into ip_rput as an M_DATA. */ - ip_rput(q, mp); + ira->ira_pktlen -= hlen; + ira->ira_ip_hdr_length = hlen_encap; + + /* + * We don't redo any of the filtering in ill_input_full_v4 and we + * have checked that all of ipha_encap and any IP options are + * pulled up. Hence we call ire_recv_multicast_v4 directly. + * However, we have to check for RSVP as in ip_input_full_v4 + * and if so we pass it to ire_recv_broadcast_v4 for local delivery + * to the rsvpd. 
+ */ + if (ipha_encap->ipha_protocol == IPPROTO_RSVP && + ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { + ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, + B_TRUE, 0, ipst, NULL, NULL, NULL); + } else { + ire = ire_multicast(ill); + } + ASSERT(ire != NULL); + /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); + freemsg(mp); + ire_refrele(ire); + return; + } + ire->ire_ib_pkt_count++; + ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); + (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); + ire_refrele(ire); } /* @@ -2687,7 +2759,7 @@ reset_mrt_vif_ipif(ipif_t *ipif) for (vifi = num_of_vifs; vifi != 0; vifi--) { tmp_vifi = vifi - 1; if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { - (void) del_vif(&tmp_vifi, NULL, NULL, ipst); + (void) del_vif(&tmp_vifi, ipst); } } } @@ -2696,11 +2768,12 @@ reset_mrt_vif_ipif(ipif_t *ipif) void reset_mrt_ill(ill_t *ill) { - struct mfc *rt; + struct mfc *rt; struct rtdetq *rte; - int i; + int i; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + timeout_id_t id; for (i = 0; i < MFCTBLSIZ; i++) { MFCB_REFHOLD(&ipst->ips_mfcs[i]); @@ -2713,6 +2786,18 @@ reset_mrt_ill(ill_t *ill) while (rt != NULL) { mutex_enter(&rt->mfc_mutex); while ((rte = rt->mfc_rte) != NULL) { + if (rte->ill == ill && + (id = rt->mfc_timeout_id) != 0) { + /* + * Its ok to drop the lock, the + * struct cannot be freed since + * we have a ref on the hash + * bucket. 
+ */ + mutex_exit(&rt->mfc_mutex); + (void) untimeout(id); + mutex_enter(&rt->mfc_mutex); + } if (rte->ill == ill) { if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog( @@ -2744,12 +2829,15 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) size_t p_len = msgdsize(mp); struct tbf *t = vifp->v_tbf; timeout_id_t id = 0; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; /* Drop if packet is too large */ if (p_len > MAX_BKT_SIZE) { ipst->ips_mrtstat->mrts_pkt2large++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_control - too large", mp, ill); freemsg(mp); return; } @@ -2800,6 +2888,9 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) if ((mp->b_wptr - mp->b_rptr) < hdr_length) { if (!pullupmsg(mp, hdr_length)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("tbf_control - pullup", mp, ill); freemsg(mp); ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " "vif %ld src 0x%x dst 0x%x\n", @@ -2818,6 +2909,8 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) */ if (!tbf_dq_sel(vifp, ipha)) { ipst->ips_mrtstat->mrts_q_overflow++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("mrts_q_overflow", mp, ill); freemsg(mp); } else { tbf_queue(vifp, mp); @@ -2958,7 +3051,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) struct tbf *t = vifp->v_tbf; mblk_t **np; mblk_t *last, *mp; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; if (ipst->ips_ip_mrtdebug > 1) { @@ -2979,6 +3073,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) if (mp == t->tbf_t) t->tbf_t = last; mp->b_prev = mp->b_next = NULL; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_dq_send", mp, ill); freemsg(mp); /* * It's impossible for the queue to be empty, but @@ 
-3000,76 +3096,97 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) static void tbf_send_packet(struct vif *vifp, mblk_t *mp) { - ipif_t *ipif; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ipif_t *ipif = vifp->v_ipif; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ipha_t *ipha; + ipha = (ipha_t *)mp->b_rptr; /* If encap tunnel options */ if (vifp->v_flags & VIFF_TUNNEL) { + ip_xmit_attr_t ixas; + if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, - "tbf_send_pkt: ENCAP tunnel vif %ld", + "tbf_send_packet: ENCAP tunnel vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); } + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ + ixas.ixa_pktlen = ntohs(ipha->ipha_length); + ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); /* - * Feed into ip_wput which will set the ident field and - * checksum the encapsulating header. + * Feed into ip_output_simple which will set the ident field + * and checksum the encapsulating header. * BSD gets the cached route vifp->v_route from ip_output() * to speed up route table lookups. Not necessary in SunOS 5.x. + * One could make multicast forwarding faster by putting an + * ip_xmit_attr_t in each vif thereby caching the ire/nce. */ - put(vifp->v_ipif->ipif_wq, mp); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); return; /* phyint */ } else { /* Need to loop back to members on the outgoing interface. */ - ipha_t *ipha; - ipaddr_t dst; - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - ipif = vifp->v_ipif; - - if (ilm_lookup_ipif(ipif, dst) != NULL) { - /* - * The packet is not yet reassembled, thus we need to - * pass it to ip_rput_local for checksum verification - * and reassembly (and fanout the user stream). 
- */ - mblk_t *mp_loop; - ire_t *ire; - - if (ipst->ips_ip_mrtdebug > 1) { - (void) mi_strlog(mrouter->conn_rq, 1, - SL_TRACE, - "tbf_send_pkt: loopback vif %ld", - (ptrdiff_t)(vifp - ipst->ips_vifs)); - } - mp_loop = copymsg(mp); - ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (mp_loop != NULL && ire != NULL) { - IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, - ((ipha_t *)mp_loop->b_rptr), - ire, (ill_t *)ipif->ipif_rq->q_ptr); - } else { - /* Either copymsg failed or no ire */ - (void) mi_strlog(mrouter->conn_rq, 1, - SL_TRACE, - "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " - "vif %ld\n", (void *)mp_loop, (void *)ire, - (ptrdiff_t)(vifp - ipst->ips_vifs)); - } - if (ire != NULL) - ire_refrele(ire); + ipaddr_t dst; + ip_recv_attr_t iras; + nce_t *nce; + + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ + iras.ira_pktlen = ntohs(ipha->ipha_length); + iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + + dst = ipha->ipha_dst; + if (ill_hasmembers_v4(ill, dst)) { + iras.ira_flags |= IRAF_LOOPBACK_COPY; } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); } - ip_rput_forward_multicast(dst, mp, ipif); + /* + * Find an NCE which matches the nexthop. + * For a pt-pt interface we use the other end of the pt-pt + * link. + */ + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + dst = ipif->ipif_pp_dst_addr; + nce = arp_nce_init(ill, dst, ill->ill_net_type); + } else { + nce = arp_nce_init(ill, dst, IRE_MULTICAST); + } + if (nce == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_send_packet - no nce", mp, ill); + freemsg(mp); + return; + } + + /* + * We don't remeber the incoming ill. 
Thus we + * pretend the packet arrived on the outbound ill. This means + * statistics for input errors will be increased on the wrong + * ill but that isn't a big deal. + */ + ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); + + nce_refrele(nce); } } diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index d7be67cd26..0912d87227 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -66,29 +66,41 @@ static void ilm_bld_flists(conn_t *conn, void *arg); static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist); -static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group, +static ilm_t *ilm_add(ill_t *ill, const in6_addr_t *group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid); static void ilm_delete(ilm_t *ilm); -static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group, - ipif_t *ipif); -static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, - mcast_record_t fmode, ipaddr_t src); -static int ilg_add_v6(conn_t *connp, const in6_addr_t *group, ill_t *ill, - mcast_record_t fmode, const in6_addr_t *v6src); +static int ilm_numentries(ill_t *, const in6_addr_t *); + +static ilm_t *ip_addmulti_serial(const in6_addr_t *, ill_t *, zoneid_t, + ilg_stat_t, mcast_record_t, slist_t *, int *); +static ilm_t *ip_addmulti_impl(const in6_addr_t *, ill_t *, + zoneid_t, ilg_stat_t, mcast_record_t, slist_t *, int *); +static int ip_delmulti_serial(ilm_t *, boolean_t, boolean_t); +static int ip_delmulti_impl(ilm_t *, boolean_t, boolean_t); + +static int ip_ll_multireq(ill_t *ill, const in6_addr_t *group, + t_uscalar_t); +static ilg_t *ilg_lookup(conn_t *, const in6_addr_t *, ipaddr_t ifaddr, + uint_t ifindex); + +static int 
ilg_add(conn_t *connp, const in6_addr_t *group, + ipaddr_t ifaddr, uint_t ifindex, ill_t *ill, mcast_record_t fmode, + const in6_addr_t *v6src); static void ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src); static mblk_t *ill_create_dl(ill_t *ill, uint32_t dl_primitive, - uint32_t length, uint32_t *addr_lenp, uint32_t *addr_offp); -static void conn_ilg_reap(conn_t *connp); -static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, - ipif_t *ipif, mcast_record_t fmode, ipaddr_t src); -static int ip_opt_delete_group_excl_v6(conn_t *connp, - const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, - const in6_addr_t *v6src); -static void ill_ilm_walker_hold(ill_t *ill); -static void ill_ilm_walker_rele(ill_t *ill); + uint32_t *addr_lenp, uint32_t *addr_offp); +static int ip_opt_delete_group_excl(conn_t *connp, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src); + +static ilm_t *ilm_lookup(ill_t *, const in6_addr_t *, zoneid_t); + +static int ip_msfilter_ill(conn_t *, mblk_t *, const ip_ioctl_cmd_t *, + ill_t **); + +static void ilg_check_detach(conn_t *, ill_t *); +static void ilg_check_reattach(conn_t *); /* * MT notes: @@ -98,124 +110,122 @@ static void ill_ilm_walker_rele(ill_t *ill); * need to synchronize when operating on the ilg. Multiple threads * potentially operating on different conn (socket endpoints) trying to * do multicast joins could eventually end up trying to manipulate the - * ilm simultaneously and need to synchronize access to the ilm. Currently, - * this is done by synchronizing join/leave via per-phyint ipsq_t - * serialization. + * ilm simulatenously and need to synchronize on the access to the ilm. + * The access and lookup of the ilm, as well as other ill multicast state, + * is under ill_mcast_lock. + * The modifications and lookup of ilg entries is serialized using conn_ilg_lock + * rwlock. An ilg will not be freed until ilg_refcnt drops to zero. 
+ * + * In some cases we hold ill_mcast_lock and then acquire conn_ilg_lock, but + * never the other way around. * * An ilm is an IP data structure used to track multicast join/leave. * An ilm is associated with a <multicast group, ipif> tuple in IPv4 and * with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's - * referencing the ilm. ilms are created / destroyed only as writer. ilms - * are not passed around, instead they are looked up and used under the - * ill_lock or as writer. So we don't need a dynamic refcount of the number + * referencing the ilm. + * The modifications and lookup of ilm entries is serialized using the + * ill_mcast_lock rwlock; that lock handles all the igmp/mld modifications + * of the ilm state. + * ilms are created / destroyed only as writer. ilms + * are not passed around. The datapath (anything outside of this file + * and igmp.c) use functions that do not return ilms - just the number + * of members. So we don't need a dynamic refcount of the number * of threads holding reference to an ilm. * - * Multicast Join operation: - * - * The first step is to determine the ipif (v4) or ill (v6) on which - * the join operation is to be done. The join is done after becoming - * exclusive on the ipsq associated with the ipif or ill. The conn->conn_ilg - * and ill->ill_ilm are thus accessed and modified exclusively per ill. - * Multiple threads can attempt to join simultaneously on different ipif/ill - * on the same conn. In this case the ipsq serialization does not help in - * protecting the ilg. It is the conn_lock that is used to protect the ilg. - * The conn_lock also protects all the ilg_t members. 
+ * In the cases where we serially access the ilg and ilm, which happens when + * we handle the applications requests to join or leave groups and sources, + * we use the ill_mcast_serializer mutex to ensure that a multithreaded + * application which does concurrent joins and/or leaves on the same group on + * the same socket always results in a consistent order for the ilg and ilm + * modifications. * - * Leave operation. - * - * Similar to the join operation, the first step is to determine the ipif - * or ill (v6) on which the leave operation is to be done. The leave operation - * is done after becoming exclusive on the ipsq associated with the ipif or ill. - * As with join ilg modification is done under the protection of the conn lock. + * When a multicast operation results in needing to send a message to + * the driver (to join/leave a L2 multicast address), we use ill_dlpi_queue() + * which serialized the DLPI requests. The IGMP/MLD code uses ill_mcast_queue() + * to send IGMP/MLD IP packet to avoid dropping the lock just to send a packet. */ -#define IPSQ_ENTER_IPIF(ipif, connp, first_mp, func, ipsq, type) \ - ASSERT(connp != NULL); \ - (ipsq) = ipsq_try_enter((ipif), NULL, CONNP_TO_WQ(connp), \ - (first_mp), (func), (type), B_TRUE); \ - if ((ipsq) == NULL) { \ - ipif_refrele(ipif); \ - return (EINPROGRESS); \ - } - -#define IPSQ_ENTER_ILL(ill, connp, first_mp, func, ipsq, type) \ - ASSERT(connp != NULL); \ - (ipsq) = ipsq_try_enter(NULL, ill, CONNP_TO_WQ(connp), \ - (first_mp), (func), (type), B_TRUE); \ - if ((ipsq) == NULL) { \ - ill_refrele(ill); \ - return (EINPROGRESS); \ - } - -#define IPSQ_EXIT(ipsq) \ - if (ipsq != NULL) \ - ipsq_exit(ipsq); +#define GETSTRUCT(structure, number) \ + ((structure *)mi_zalloc(sizeof (structure) * (number))) -#define ILG_WALKER_HOLD(connp) (connp)->conn_ilg_walker_cnt++ +/* + * Caller must ensure that the ilg has not been condemned + * The condemned flag is only set in ilg_delete under conn_ilg_lock. 
+ * + * The caller must hold conn_ilg_lock as writer. + */ +static void +ilg_refhold(ilg_t *ilg) +{ + ASSERT(ilg->ilg_refcnt != 0); + ASSERT(!ilg->ilg_condemned); + ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock)); -#define ILG_WALKER_RELE(connp) \ - { \ - (connp)->conn_ilg_walker_cnt--; \ - if ((connp)->conn_ilg_walker_cnt == 0) \ - conn_ilg_reap(connp); \ - } + ilg->ilg_refcnt++; +} static void -conn_ilg_reap(conn_t *connp) +ilg_inactive(ilg_t *ilg) { - int to; - int from; - ilg_t *ilg; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ilg->ilg_ill == NULL); + ASSERT(ilg->ilg_ilm == NULL); + ASSERT(ilg->ilg_filter == NULL); + ASSERT(ilg->ilg_condemned); - to = 0; - from = 0; - while (from < connp->conn_ilg_inuse) { - if (connp->conn_ilg[from].ilg_flags & ILG_DELETED) { - ilg = &connp->conn_ilg[from]; - FREE_SLIST(ilg->ilg_filter); - ilg->ilg_flags &= ~ILG_DELETED; - from++; - continue; - } - if (to != from) - connp->conn_ilg[to] = connp->conn_ilg[from]; - to++; - from++; - } + /* Unlink from list */ + *ilg->ilg_ptpn = ilg->ilg_next; + if (ilg->ilg_next != NULL) + ilg->ilg_next->ilg_ptpn = ilg->ilg_ptpn; + ilg->ilg_next = NULL; + ilg->ilg_ptpn = NULL; - connp->conn_ilg_inuse = to; + ilg->ilg_connp = NULL; + kmem_free(ilg, sizeof (*ilg)); +} - if (connp->conn_ilg_inuse == 0) { - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = NULL; - cv_broadcast(&connp->conn_refcv); - } +/* + * The caller must hold conn_ilg_lock as writer. + */ +static void +ilg_refrele(ilg_t *ilg) +{ + ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock)); + ASSERT(ilg->ilg_refcnt != 0); + if (--ilg->ilg_refcnt == 0) + ilg_inactive(ilg); } -#define GETSTRUCT(structure, number) \ - ((structure *)mi_zalloc(sizeof (structure) * (number))) +/* + * Acquire reference on ilg and drop reference on held_ilg. + * In the case when held_ilg is the same as ilg we already have + * a reference, but the held_ilg might be condemned. 
In that case + * we avoid the ilg_refhold/rele so that we can assert in ire_refhold + * that the ilg isn't condemned. + */ +static void +ilg_transfer_hold(ilg_t *held_ilg, ilg_t *ilg) +{ + if (held_ilg == ilg) + return; -#define ILG_ALLOC_CHUNK 16 + ilg_refhold(ilg); + if (held_ilg != NULL) + ilg_refrele(held_ilg); +} /* - * Returns a pointer to the next available ilg in conn_ilg. Allocs more - * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's - * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the - * returned ilg). Returns NULL on failure, in which case `*errp' will be + * Allocate a new ilg_t and links it into conn_ilg. + * Returns NULL on failure, in which case `*errp' will be * filled in with the reason. * - * Assumes connp->conn_lock is held. + * Assumes connp->conn_ilg_lock is held. */ static ilg_t * conn_ilg_alloc(conn_t *connp, int *errp) { - ilg_t *new, *ret; - int curcnt; + ilg_t *ilg; - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); /* * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not @@ -226,44 +236,23 @@ conn_ilg_alloc(conn_t *connp, int *errp) return (NULL); } - if (connp->conn_ilg == NULL) { - connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK); - if (connp->conn_ilg == NULL) { - *errp = ENOMEM; - return (NULL); - } - connp->conn_ilg_allocated = ILG_ALLOC_CHUNK; - connp->conn_ilg_inuse = 0; - } - if (connp->conn_ilg_inuse == connp->conn_ilg_allocated) { - if (connp->conn_ilg_walker_cnt != 0) { - /* - * XXX We cannot grow the array at this point - * because a list walker could be in progress, and - * we cannot wipe out the existing array until the - * walker is done. Just return NULL for now. - * ilg_delete_all() will have to be changed when - * this logic is changed. 
- */ - *errp = EBUSY; - return (NULL); - } - curcnt = connp->conn_ilg_allocated; - new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK); - if (new == NULL) { - *errp = ENOMEM; - return (NULL); - } - bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt); - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = new; - connp->conn_ilg_allocated += ILG_ALLOC_CHUNK; + ilg = kmem_zalloc(sizeof (ilg_t), KM_NOSLEEP); + if (ilg == NULL) { + *errp = ENOMEM; + return (NULL); } - ret = &connp->conn_ilg[connp->conn_ilg_inuse++]; - ASSERT((ret->ilg_flags & ILG_DELETED) == 0); - bzero(ret, sizeof (*ret)); - return (ret); + ilg->ilg_refcnt = 1; + + /* Insert at head */ + if (connp->conn_ilg != NULL) + connp->conn_ilg->ilg_ptpn = &ilg->ilg_next; + ilg->ilg_next = connp->conn_ilg; + ilg->ilg_ptpn = &connp->conn_ilg; + connp->conn_ilg = ilg; + + ilg->ilg_connp = connp; + return (ilg); } typedef struct ilm_fbld_s { @@ -275,15 +264,18 @@ typedef struct ilm_fbld_s { boolean_t fbld_in_overflow; } ilm_fbld_t; +/* + * Caller must hold ill_mcast_lock + */ static void -ilm_bld_flists(conn_t *conn, void *arg) +ilm_bld_flists(conn_t *connp, void *arg) { - int i; + ilg_t *ilg; ilm_fbld_t *fbld = (ilm_fbld_t *)(arg); ilm_t *ilm = fbld->fbld_ilm; in6_addr_t *v6group = &ilm->ilm_v6addr; - if (conn->conn_ilg_inuse == 0) + if (connp->conn_ilg == NULL) return; /* @@ -300,12 +292,26 @@ ilm_bld_flists(conn_t *conn, void *arg) * ilm (group, interface match). If so, update the master * include and exclude lists we're building in the fbld struct * with this ilg's filter info. + * + * Note that the caller has already serialized on the ill we care + * about. 
*/ - mutex_enter(&conn->conn_lock); - for (i = 0; i < conn->conn_ilg_inuse; i++) { - ilg_t *ilg = &conn->conn_ilg[i]; + ASSERT(MUTEX_HELD(&ilm->ilm_ill->ill_mcast_serializer)); + + rw_enter(&connp->conn_ilg_lock, RW_READER); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; + + /* + * Since we are under the ill_mcast_serializer we know + * that any ilg+ilm operations on this ilm have either + * not started or completed, except for the last ilg + * (the one that caused us to be called) which doesn't + * have ilg_ilm set yet. Hence we compare using ilg_ill + * and the address. + */ if ((ilg->ilg_ill == ilm->ilm_ill) && - (ilg->ilg_ipif == ilm->ilm_ipif) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (ilg->ilg_fmode == MODE_IS_INCLUDE) { fbld->fbld_in_cnt++; @@ -337,9 +343,12 @@ ilm_bld_flists(conn_t *conn, void *arg) break; } } - mutex_exit(&conn->conn_lock); + rw_exit(&connp->conn_ilg_lock); } +/* + * Caller must hold ill_mcast_lock + */ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) { @@ -385,15 +394,17 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) } } +/* + * Caller must hold ill_mcast_lock + */ static int -ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, - boolean_t isv6) +ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist) { mcast_record_t fmode; slist_t *flist; boolean_t fdefault; char buf[INET6_ADDRSTRLEN]; - ill_t *ill = isv6 ? 
ilm->ilm_ill : ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; /* * There are several cases where the ilm's filter state @@ -444,7 +455,7 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, /* send the state change report */ if (!IS_LOOPBACK(ill)) { - if (isv6) + if (ill->ill_isv6) mld_statechange(ilm, fmode, flist); else igmp_statechange(ilm, fmode, flist); @@ -464,12 +475,15 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, return (0); } +/* + * Caller must hold ill_mcast_lock + */ static int -ilm_update_del(ilm_t *ilm, boolean_t isv6) +ilm_update_del(ilm_t *ilm) { mcast_record_t fmode; slist_t *flist; - ill_t *ill = isv6 ? ilm->ilm_ill : ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; ip1dbg(("ilm_update_del: still %d left; updating state\n", ilm->ilm_refcnt)); @@ -500,7 +514,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } if (!IS_LOOPBACK(ill)) { - if (isv6) + if (ill->ill_isv6) mld_statechange(ilm, fmode, flist); else igmp_statechange(ilm, fmode, flist); @@ -531,240 +545,245 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } /* - * INADDR_ANY means all multicast addresses. - * INADDR_ANY is stored as IPv6 unspecified addr. + * Create/update the ilm for the group/ill. Used by other parts of IP to + * do the ILGSTAT_NONE (no ilg), MODE_IS_EXCLUDE, with no slist join. + * Returns with a refhold on the ilm. + * + * The unspecified address means all multicast addresses for in both the + * case of IPv4 and IPv6. + * + * The caller should have already mapped an IPMP under ill to the upper. 
*/ -int -ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, - mcast_record_t ilg_fmode, slist_t *ilg_flist) +ilm_t * +ip_addmulti(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + int *errorp) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - int ret; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (!CLASSD(group) && group != INADDR_ANY) - return (EINVAL); - - if (IS_UNDER_IPMP(ill)) - return (EINVAL); - - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); - - ilm = ilm_lookup_ipif(ipif, group); - /* - * Since we are writer, we know the ilm_flags itself cannot - * change at this point, and ilm_lookup_ipif would not have - * returned a DELETED ilm. However, the data path can free - * ilm->ilm_next via ilm_walker_cleanup() so we can safely - * access anything in ilm except ilm_next (for safe access to - * ilm_next we'd have to take the ill_lock). 
- */ - if (ilm != NULL) - return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE)); - - ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist, - ipif->ipif_zoneid); - if (ilm == NULL) - return (ENOMEM); - - if (group == INADDR_ANY) { - /* - * Check how many ipif's have members in this group - - * if more then one we should not tell the driver to join - * this time - */ - if (ilm_numentries_v6(ill, &v6group) > 1) - return (0); - ret = ill_join_allmulti(ill); - if (ret != 0) - ilm_delete(ilm); - return (ret); - } - - if (!IS_LOOPBACK(ill)) - igmp_joingroup(ilm); - - if (ilm_numentries_v6(ill, &v6group) > 1) - return (0); + ilm_t *ilm; - ret = ip_ll_addmulti_v6(ipif, &v6group); - if (ret != 0) - ilm_delete(ilm); - return (ret); + /* Acquire serializer to keep assert in ilm_bld_flists happy */ + mutex_enter(&ill->ill_mcast_serializer); + ilm = ip_addmulti_serial(v6group, ill, zoneid, ILGSTAT_NONE, + MODE_IS_EXCLUDE, NULL, errorp); + mutex_exit(&ill->ill_mcast_serializer); + return (ilm); } /* - * The unspecified address means all multicast addresses. + * Create/update the ilm for the group/ill. If ILGSTAT_CHANGE is not set + * then this returns with a refhold on the ilm. + * + * Internal routine which assumes the caller has already acquired + * ill_multi_serializer. * - * ill identifies the interface to join on. + * The unspecified address means all multicast addresses for in both the + * case of IPv4 and IPv6. * * ilgstat tells us if there's an ilg associated with this join, * and if so, if it's a new ilg or a change to an existing one. * ilg_fmode and ilg_flist give us the current filter state of * the ilg (and will be EXCLUDE {NULL} in the case of no ilg). + * + * The caller should have already mapped an IPMP under ill to the upper. 
*/ -int -ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, - ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist) +static ilm_t * +ip_addmulti_serial(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, + int *errorp) { - ilm_t *ilm; - int ret; + ilm_t *ilm; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); - if (!IN6_IS_ADDR_MULTICAST(v6group) && - !IN6_IS_ADDR_UNSPECIFIED(v6group)) { - return (EINVAL); + if (ill->ill_isv6) { + if (!IN6_IS_ADDR_MULTICAST(v6group) && + !IN6_IS_ADDR_UNSPECIFIED(v6group)) { + *errorp = EINVAL; + return (NULL); + } + } else { + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + ipaddr_t v4group; + + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); + if (!CLASSD(v4group)) { + *errorp = EINVAL; + return (NULL); + } + } else if (!IN6_IS_ADDR_UNSPECIFIED(v6group)) { + *errorp = EINVAL; + return (NULL); + } } - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group)) - return (EINVAL); + if (IS_UNDER_IPMP(ill)) { + *errorp = EINVAL; + return (NULL); + } + + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + /* + * We do the equivalent of a lookup by checking after we get the lock + * This is needed since the ill could have been condemned after + * we looked it up, and we need to check condemned after we hold + * ill_mcast_lock to synchronize with the unplumb code. 
+ */ + if (ill->ill_state_flags & ILL_CONDEMNED) { + rw_exit(&ill->ill_mcast_lock); + *errorp = ENXIO; + return (NULL); + } + ilm = ip_addmulti_impl(v6group, ill, zoneid, ilgstat, ilg_fmode, + ilg_flist, errorp); + rw_exit(&ill->ill_mcast_lock); + + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); + return (ilm); +} + +static ilm_t * +ip_addmulti_impl(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, + int *errorp) +{ + ilm_t *ilm; + int ret = 0; + + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); + *errorp = 0; /* * An ilm is uniquely identified by the tuple of (group, ill) where * `group' is the multicast group address, and `ill' is the interface * on which it is currently joined. */ - ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); - if (ilm != NULL) - return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE)); - ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode, - ilg_flist, zoneid); - if (ilm == NULL) - return (ENOMEM); + ilm = ilm_lookup(ill, v6group, zoneid); + if (ilm != NULL) { + /* ilm_update_add bumps ilm_refcnt unless ILGSTAT_CHANGE */ + ret = ilm_update_add(ilm, ilgstat, ilg_flist); + if (ret == 0) + return (ilm); - if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { - /* - * Check how many ipif's that have members in this group - - * if more then one we should not tell the driver to join - * this time - */ - if (ilm_numentries_v6(ill, v6group) > 1) - return (0); - ret = ill_join_allmulti(ill); - if (ret != 0) - ilm_delete(ilm); - return (ret); + *errorp = ret; + return (NULL); } - if (!IS_LOOPBACK(ill)) - mld_joingroup(ilm); - /* - * If we have more then one we should not tell the driver - * to join this time. + * The callers checks on the ilg and the ilg+ilm consistency under + * ill_mcast_serializer ensures that we can not have ILGSTAT_CHANGE + * and no ilm. 
*/ - if (ilm_numentries_v6(ill, v6group) > 1) - return (0); - - ret = ip_ll_addmulti_v6(ill->ill_ipif, v6group); - if (ret != 0) - ilm_delete(ilm); - return (ret); -} + ASSERT(ilgstat != ILGSTAT_CHANGE); + ilm = ilm_add(ill, v6group, ilgstat, ilg_fmode, ilg_flist, zoneid); + if (ilm == NULL) { + *errorp = ENOMEM; + return (NULL); + } -/* - * Mapping the given IP multicast address to the L2 multicast mac address. - */ -static void -ill_multicast_mapping(ill_t *ill, ipaddr_t ip_addr, uint8_t *hw_addr, - uint32_t hw_addrlen) -{ - dl_unitdata_req_t *dlur; - ipaddr_t proto_extract_mask; - uint8_t *from, *bcast_addr; - uint32_t hw_extract_start; - int len; + if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { + /* + * If we have more then one we should not tell the driver + * to join this time. + */ + if (ilm_numentries(ill, v6group) == 1) { + ret = ill_join_allmulti(ill); + } + } else { + if (!IS_LOOPBACK(ill)) { + if (ill->ill_isv6) + mld_joingroup(ilm); + else + igmp_joingroup(ilm); + } - ASSERT(IN_CLASSD(ntohl(ip_addr))); - ASSERT(hw_addrlen == ill->ill_phys_addr_length); - ASSERT((ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) == 0); - ASSERT((ill->ill_flags & ILLF_MULTICAST) != 0); + /* + * If we have more then one we should not tell the driver + * to join this time. + */ + if (ilm_numentries(ill, v6group) == 1) { + ret = ip_ll_multireq(ill, v6group, DL_ENABMULTI_REQ); + } + } + if (ret != 0) { + if (ret == ENETDOWN) { + char buf[INET6_ADDRSTRLEN]; - /* - * Find the physical broadcast address. 
- */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - bcast_addr = (uint8_t *)dlur + dlur->dl_dest_addr_offset; - if (ill->ill_sap_length > 0) - bcast_addr += ill->ill_sap_length; - - VERIFY(MEDIA_V4MINFO(ill->ill_media, hw_addrlen, bcast_addr, - hw_addr, &hw_extract_start, &proto_extract_mask)); - - len = MIN((int)hw_addrlen - hw_extract_start, IP_ADDR_LEN); - ip_addr &= proto_extract_mask; - from = (uint8_t *)&ip_addr; - while (len-- > 0) - hw_addr[hw_extract_start + len] |= from[len]; + ip0dbg(("ip_addmulti: ENETDOWN for %s on %s", + inet_ntop(AF_INET6, &ilm->ilm_v6addr, + buf, sizeof (buf)), ill->ill_name)); + } + ilm_delete(ilm); + *errorp = ret; + return (NULL); + } else { + return (ilm); + } } /* - * Send a multicast request to the driver for enabling multicast reception - * for v6groupp address. The caller has already checked whether it is - * appropriate to send one or not. + * Send a multicast request to the driver for enabling or disabling + * multicast reception for v6groupp address. The caller has already + * checked whether it is appropriate to send one or not. + * + * For IPMP we switch to the cast_ill since it has the right hardware + * information. */ -int -ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) +static int +ip_ll_send_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim) { mblk_t *mp; uint32_t addrlen, addroff; - char group_buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * If we're on the IPMP ill, use the nominated multicast interface to - * send and receive DLPI messages, if one exists. (If none exists, - * there are no usable interfaces and thus nothing to do.) - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - /* - * Create a DL_ENABMULTI_REQ. 
- */ - mp = ill_create_dl(ill, DL_ENABMULTI_REQ, sizeof (dl_enabmulti_req_t), - &addrlen, &addroff); - if (!mp) - return (ENOMEM); - - if (IN6_IS_ADDR_V4MAPPED(v6groupp)) { - ipaddr_t v4group; + ill_t *release_ill = NULL; + int err = 0; - IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); - ill_multicast_mapping(ill, v4group, - mp->b_rptr + addroff, addrlen); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ip_ll_send_multireq: no cast_ill for %s %d\n", + ill->ill_name, ill->ill_isv6)); + return (0); + } + ill = release_ill; + } + /* Create a DL_ENABMULTI_REQ or DL_DISABMULTI_REQ message. */ + mp = ill_create_dl(ill, prim, &addrlen, &addroff); + if (mp == NULL) { + err = ENOMEM; + goto done; + } - ip1dbg(("ip_ll_send_enabmulti_req: IPv4 %s on %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); + mp = ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp); + if (mp == NULL) { + ip0dbg(("null from ndp_mcastreq(ill %s)\n", ill->ill_name)); + err = ENOMEM; + goto done; + } + switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { + case DL_ENABMULTI_REQ: + mutex_enter(&ill->ill_lock); /* Track the state if this is the first enabmulti */ if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) ill->ill_dlpi_multicast_state = IDS_INPROGRESS; - ill_dlpi_send(ill, mp); - } else { - ip1dbg(("ip_ll_send_enabmulti_req: IPv6 ndp_mcastreq %s on" - " %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); - return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp)); + mutex_exit(&ill->ill_lock); + break; } - return (0); + ill_dlpi_queue(ill, mp); +done: + if (release_ill != NULL) + ill_refrele(release_ill); + return (err); } /* @@ -772,132 +791,71 
@@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) * membership for v6group if appropriate. */ static int -ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp) +ip_ll_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim) { - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ip_ll_addmulti_v6: not resolver\n")); + ill->ill_ipif->ipif_flags & IPIF_POINTOPOINT) { + ip1dbg(("ip_ll_multireq: not resolver\n")); return (0); /* Must be IRE_IF_NORESOLVER */ } if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ip_ll_addmulti_v6: MULTI_BCAST\n")); - return (0); - } - if (!ill->ill_dl_up) { - /* - * Nobody there. All multicast addresses will be re-joined - * when we get the DL_BIND_ACK bringing the interface up. - */ - ip1dbg(("ip_ll_addmulti_v6: nobody up\n")); + ip1dbg(("ip_ll_multireq: MULTI_BCAST\n")); return (0); } - return (ip_ll_send_enabmulti_req(ill, v6groupp)); + return (ip_ll_send_multireq(ill, v6groupp, prim)); } /* - * INADDR_ANY means all multicast addresses. - * INADDR_ANY is stored as the IPv6 unspecified addr. + * Delete the ilm. Used by other parts of IP for the case of no_ilg/leaving + * being true. */ int -ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) +ip_delmulti(ilm_t *ilm) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (!CLASSD(group) && group != INADDR_ANY) - return (EINVAL); - - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + ill_t *ill = ilm->ilm_ill; + int error; - /* - * Look for a match on the ipif. - * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address). 
- */ - ilm = ilm_lookup_ipif(ipif, group); - if (ilm == NULL) - return (ENOENT); - - /* Update counters */ - if (no_ilg) - ilm->ilm_no_ilg_cnt--; - - if (leaving) - ilm->ilm_refcnt--; - - if (ilm->ilm_refcnt > 0) - return (ilm_update_del(ilm, B_FALSE)); - - if (group == INADDR_ANY) { - ilm_delete(ilm); - /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. - */ - if (ilm_numentries_v6(ill, &v6group) != 0) - return (0); - - /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - - return (0); - } - - if (!IS_LOOPBACK(ill)) - igmp_leavegroup(ilm); - - ilm_delete(ilm); - /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. - */ - if (ilm_numentries_v6(ill, &v6group) != 0) - return (0); - return (ip_ll_delmulti_v6(ipif, &v6group)); + /* Acquire serializer to keep assert in ilm_bld_flists happy */ + mutex_enter(&ill->ill_mcast_serializer); + error = ip_delmulti_serial(ilm, B_TRUE, B_TRUE); + mutex_exit(&ill->ill_mcast_serializer); + return (error); } + /* - * The unspecified address means all multicast addresses. + * Delete the ilm. + * Assumes ill_multi_serializer is held by the caller. 
*/ -int -ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, - boolean_t no_ilg, boolean_t leaving) +static int +ip_delmulti_serial(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving) { - ipif_t *ipif; - ilm_t *ilm; + ill_t *ill = ilm->ilm_ill; + int ret; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); + ASSERT(!(IS_UNDER_IPMP(ill))); - if (!IN6_IS_ADDR_MULTICAST(v6group) && - !IN6_IS_ADDR_UNSPECIFIED(v6group)) - return (EINVAL); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + ret = ip_delmulti_impl(ilm, no_ilg, leaving); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); - /* - * Look for a match on the ill. - */ - ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); - if (ilm == NULL) - return (ENOENT); + return (ret); +} - ASSERT(ilm->ilm_ill == ill); +static int +ip_delmulti_impl(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving) +{ + ill_t *ill = ilm->ilm_ill; + int error; + in6_addr_t v6group; - ipif = ill->ill_ipif; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* Update counters */ if (no_ilg) @@ -907,150 +865,90 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, ilm->ilm_refcnt--; if (ilm->ilm_refcnt > 0) - return (ilm_update_del(ilm, B_TRUE)); + return (ilm_update_del(ilm)); - if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { + v6group = ilm->ilm_v6addr; + + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { ilm_delete(ilm); /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. + * If we have some left then one we should not tell the driver + * to leave. */ - if (ilm_numentries_v6(ill, v6group) != 0) + if (ilm_numentries(ill, &v6group) != 0) return (0); - /* If we never joined, then don't leave. 
*/ - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); + ill_leave_allmulti(ill); return (0); } - if (!IS_LOOPBACK(ill)) - mld_leavegroup(ilm); + if (!IS_LOOPBACK(ill)) { + if (ill->ill_isv6) + mld_leavegroup(ilm); + else + igmp_leavegroup(ilm); + } ilm_delete(ilm); /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. + * If we have some left then one we should not tell the driver + * to leave. */ - if (ilm_numentries_v6(ill, v6group) != 0) + if (ilm_numentries(ill, &v6group) != 0) return (0); - return (ip_ll_delmulti_v6(ipif, v6group)); -} -/* - * Send a multicast request to the driver for disabling multicast reception - * for v6groupp address. The caller has already checked whether it is - * appropriate to send one or not. - */ -int -ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) -{ - mblk_t *mp; - char group_buf[INET6_ADDRSTRLEN]; - uint32_t addrlen, addroff; + error = ip_ll_multireq(ill, &v6group, DL_DISABMULTI_REQ); + /* We ignore the case when ill_dl_up is not set */ + if (error == ENETDOWN) { + char buf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - /* - * Create a DL_DISABMULTI_REQ. 
- */ - mp = ill_create_dl(ill, DL_DISABMULTI_REQ, - sizeof (dl_disabmulti_req_t), &addrlen, &addroff); - if (!mp) - return (ENOMEM); - - if (IN6_IS_ADDR_V4MAPPED(v6groupp)) { - ipaddr_t v4group; - - IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group); - - ill_multicast_mapping(ill, v4group, - mp->b_rptr + addroff, addrlen); - - ip1dbg(("ip_ll_send_disabmulti_req: IPv4 %s on %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), + ip0dbg(("ip_delmulti: ENETDOWN for %s on %s", + inet_ntop(AF_INET6, &v6group, buf, sizeof (buf)), ill->ill_name)); - ill_dlpi_send(ill, mp); - } else { - ip1dbg(("ip_ll_send_disabmulti_req: IPv6 ndp_mcastreq %s on" - " %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); - return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp)); - } - return (0); -} - -/* - * Send a multicast request to the driver for disabling multicast - * membership for v6group if appropriate. - */ -static int -ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group) -{ - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - return (0); /* Must be IRE_IF_NORESOLVER */ - } - if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ip_ll_delmulti_v6: MULTI_BCAST\n")); - return (0); } - if (!ill->ill_dl_up) { - /* - * Nobody there. All multicast addresses will be re-joined - * when we get the DL_BIND_ACK bringing the interface up. - */ - ip1dbg(("ip_ll_delmulti_v6: nobody up\n")); - return (0); - } - return (ip_ll_send_disabmulti_req(ill, v6group)); + return (error); } /* - * Make the driver pass up all multicast packets. NOTE: to keep callers - * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is - * set on it (rather than the cast ill). + * Make the driver pass up all multicast packets. 
*/ int ill_join_allmulti(ill_t *ill) { - mblk_t *promiscon_mp, *promiscoff_mp; + mblk_t *promiscon_mp, *promiscoff_mp = NULL; uint32_t addrlen, addroff; - ill_t *join_ill = ill; + ill_t *release_ill = NULL; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (!ill->ill_dl_up) { /* * Nobody there. All multicast addresses will be re-joined * when we get the DL_BIND_ACK bringing the interface up. */ - return (0); + return (ENETDOWN); } - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - ASSERT(!join_ill->ill_join_allmulti); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ill_join_allmulti: no cast_ill for %s %d\n", + ill->ill_name, ill->ill_isv6)); + return (0); + } + ill = release_ill; + if (!ill->ill_dl_up) { + ill_refrele(ill); + return (ENETDOWN); + } + } /* * Create a DL_PROMISCON_REQ message and send it directly to the DLPI @@ -1062,19 +960,24 @@ ill_join_allmulti(ill_t *ill) if ((ill->ill_net_type == IRE_IF_RESOLVER) && !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { promiscon_mp = ill_create_dl(ill, DL_PROMISCON_REQ, - sizeof (dl_promiscon_req_t), &addrlen, &addroff); - promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ, - sizeof (dl_promiscoff_req_t), &addrlen, &addroff); - if (promiscon_mp == NULL || promiscoff_mp == NULL) { + &addrlen, &addroff); + if (ill->ill_promiscoff_mp == NULL) + promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ, + &addrlen, &addroff); + if (promiscon_mp == NULL || + (ill->ill_promiscoff_mp == NULL && promiscoff_mp == NULL)) { freemsg(promiscon_mp); freemsg(promiscoff_mp); + if (release_ill != NULL) + ill_refrele(release_ill); return (ENOMEM); } - ill->ill_promiscoff_mp = 
promiscoff_mp; - ill_dlpi_send(ill, promiscon_mp); + if (ill->ill_promiscoff_mp == NULL) + ill->ill_promiscoff_mp = promiscoff_mp; + ill_dlpi_queue(ill, promiscon_mp); } - - join_ill->ill_join_allmulti = B_TRUE; + if (release_ill != NULL) + ill_refrele(release_ill); return (0); } @@ -1085,9 +988,9 @@ void ill_leave_allmulti(ill_t *ill) { mblk_t *promiscoff_mp; - ill_t *leave_ill = ill; + ill_t *release_ill = NULL; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (!ill->ill_dl_up) { /* @@ -1097,105 +1000,130 @@ ill_leave_allmulti(ill_t *ill) return; } - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return; - - ASSERT(leave_ill->ill_join_allmulti); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ill_leave_allmulti: no cast_ill on %s %d\n", + ill->ill_name, ill->ill_isv6)); + return; + } + ill = release_ill; + if (!ill->ill_dl_up) + goto done; + } /* - * Create a DL_PROMISCOFF_REQ message and send it directly to - * the DLPI provider. We don't need to do this for certain - * media types for which we never need to turn promiscuous - * mode on. + * In the case of IPMP and ill_dl_up not being set when we joined + * we didn't allocate a promiscoff_mp. In that case we have + * nothing to do when we leave. 
+ * Ditto for PHYI_MULTI_BCAST */ - if ((ill->ill_net_type == IRE_IF_RESOLVER) && - !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { - promiscoff_mp = ill->ill_promiscoff_mp; - ASSERT(promiscoff_mp != NULL); + promiscoff_mp = ill->ill_promiscoff_mp; + if (promiscoff_mp != NULL) { ill->ill_promiscoff_mp = NULL; - ill_dlpi_send(ill, promiscoff_mp); - } - - leave_ill->ill_join_allmulti = B_FALSE; -} - -static ill_t * -ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - boolean_t in_ipsq; - - ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); - if (ill != NULL) { - if (!ill_waiter_inc(ill)) { - ill_refrele(ill); - return (NULL); - } - ill_refrele(ill); - in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP); - ill_waiter_dcr(ill); - if (!in_ipsq) - ill = NULL; + ill_dlpi_queue(ill, promiscoff_mp); } - return (ill); +done: + if (release_ill != NULL) + ill_refrele(release_ill); } int ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - int ret = 0; + int ret; + ilm_t *ilm; - if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) return (ENODEV); /* - * The ip_addmulti*() functions won't allow IPMP underlying interfaces + * The ip_addmulti() function doesn't allow IPMP underlying interfaces * to join allmulti since only the nominated underlying interface in * the group should receive multicast. We silently succeed to avoid * having to teach IPobs (currently the only caller of this routine) * to ignore failures in this case. 
*/ - if (IS_UNDER_IPMP(ill)) - goto out; + if (IS_UNDER_IPMP(ill)) { + ill_refrele(ill); + return (0); + } + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt > 0) { + /* Already joined */ + ASSERT(ill->ill_ipallmulti_ilm != NULL); + ill->ill_ipallmulti_cnt++; + mutex_exit(&ill->ill_lock); + goto done; + } + mutex_exit(&ill->ill_lock); - if (isv6) { - ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - } else { - ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE, - MODE_IS_EXCLUDE, NULL); + ilm = ip_addmulti(&ipv6_all_zeros, ill, ill->ill_zoneid, &ret); + if (ilm == NULL) { + ASSERT(ret != 0); + ill_refrele(ill); + return (ret); } + + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt > 0) { + /* Another thread added it concurrently */ + (void) ip_delmulti(ilm); + mutex_exit(&ill->ill_lock); + goto done; + } + ASSERT(ill->ill_ipallmulti_ilm == NULL); + ill->ill_ipallmulti_ilm = ilm; ill->ill_ipallmulti_cnt++; -out: - ipsq_exit(ill->ill_phyint->phyint_ipsq); - return (ret); + mutex_exit(&ill->ill_lock); +done: + ill_refrele(ill); + return (0); } - int ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; + ilm_t *ilm; - if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) return (ENODEV); - if (ill->ill_ipallmulti_cnt > 0) { - if (isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, - B_TRUE); - } - ill->ill_ipallmulti_cnt--; + if (IS_UNDER_IPMP(ill)) { + ill_refrele(ill); + return (0); + } + + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt == 0) { + /* ip_purge_allmulti could have removed them all */ + mutex_exit(&ill->ill_lock); + goto done; + } + ill->ill_ipallmulti_cnt--; + if (ill->ill_ipallmulti_cnt == 0) { + /* Last one */ + ilm = ill->ill_ipallmulti_ilm; + 
ill->ill_ipallmulti_ilm = NULL; + } else { + ilm = NULL; } - ipsq_exit(ill->ill_phyint->phyint_ipsq); + mutex_exit(&ill->ill_lock); + if (ilm != NULL) + (void) ip_delmulti(ilm); + +done: + ill_refrele(ill); return (0); } @@ -1206,108 +1134,34 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) void ip_purge_allmulti(ill_t *ill) { - ASSERT(IAM_WRITER_ILL(ill)); - - for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) { - if (ill->ill_isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, - B_TRUE); - } - } -} - -/* - * Copy mp_orig and pass it in as a local message. - */ -void -ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags, - zoneid_t zoneid) -{ - mblk_t *mp; - mblk_t *ipsec_mp; - ipha_t *iph; - ip_stack_t *ipst = ill->ill_ipst; - - if (DB_TYPE(mp_orig) == M_DATA && - ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) { - uint_t hdrsz; - - hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) + - sizeof (udpha_t); - ASSERT(MBLKL(mp_orig) >= hdrsz); - - if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) && - (mp_orig = dupmsg(mp_orig)) != NULL) { - cred_t *cr; - - bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz); - mp->b_wptr += hdrsz; - mp->b_cont = mp_orig; - mp_orig->b_rptr += hdrsz; - if (is_system_labeled() && - (cr = msg_getcred(mp_orig, NULL)) != NULL) - mblk_setcred(mp, cr, NOPID); - if (MBLKL(mp_orig) == 0) { - mp->b_cont = mp_orig->b_cont; - mp_orig->b_cont = NULL; - freeb(mp_orig); - } - } else if (mp != NULL) { - freeb(mp); - mp = NULL; - } - } else { - mp = ip_copymsg(mp_orig); /* No refcnt on ipsec_out netstack */ - } - - if (mp == NULL) - return; - if (DB_TYPE(mp) == M_CTL) { - ipsec_mp = mp; - mp = mp->b_cont; - } else { - ipsec_mp = mp; - } - - iph = (ipha_t *)mp->b_rptr; - - /* - * DTrace this as ip:::send. A blocked packet will fire the send - * probe, but not the receive probe. 
- */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, void_ip_t *, iph, - __dtrace_ipsr_ill_t *, ill, ipha_t *, iph, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, ill, - ipha_t *, iph, mblk_t *, ipsec_mp); + ilm_t *ilm; - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, ill, iph, ipsec_mp, mp, HPE_MULTICAST, ipst); + ASSERT(IAM_WRITER_ILL(ill)); - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); + mutex_enter(&ill->ill_lock); + ilm = ill->ill_ipallmulti_ilm; + ill->ill_ipallmulti_ilm = NULL; + ill->ill_ipallmulti_cnt = 0; + mutex_exit(&ill->ill_lock); - if (ipsec_mp != NULL) - ip_wput_local(q, ill, iph, ipsec_mp, NULL, - fanout_flags, zoneid); + if (ilm != NULL) + (void) ip_delmulti(ilm); } /* - * Create a DLPI message; for DL_{ENAB,DISAB}MULTI_REQ, room is left for - * the hardware address. + * Create a dlpi message with room for phys+sap. Later + * we will strip the sap for those primitives which + * only need a physical address. 
*/ static mblk_t * -ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, +ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t *addr_lenp, uint32_t *addr_offp) { mblk_t *mp; uint32_t hw_addr_length; char *cp; uint32_t offset; + uint32_t length; uint32_t size; *addr_lenp = *addr_offp = 0; @@ -1318,14 +1172,18 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, return (NULL); } - size = length; switch (dl_primitive) { case DL_ENABMULTI_REQ: + length = sizeof (dl_enabmulti_req_t); + size = length + hw_addr_length; + break; case DL_DISABMULTI_REQ: - size += hw_addr_length; + length = sizeof (dl_disabmulti_req_t); + size = length + hw_addr_length; break; case DL_PROMISCON_REQ: case DL_PROMISCOFF_REQ: + size = length = sizeof (dl_promiscon_req_t); break; default: return (NULL); @@ -1373,33 +1231,29 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, } /* - * Rejoin any groups which have been explicitly joined by the application (we - * left all explicitly joined groups as part of ill_leave_multicast() prior to - * bringing the interface down). Note that because groups can be joined and - * left while an interface is down, this may not be the same set of groups - * that we left in ill_leave_multicast(). + * Rejoin any groups for which we have ilms. + * + * This is only needed for IPMP when the cast_ill changes since that + * change is invisible to the ilm. Other interface changes are handled + * by conn_update_ill. */ void ill_recover_multicast(ill_t *ill) { ilm_t *ilm; - ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - ill->ill_need_recover_multicast = 0; - ill_ilm_walker_hold(ill); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* - * Check how many ipif's that have members in this group - - * if more then one we make sure that this entry is first - * in the list. 
+ * If we have more then one ilm for the group (e.g., with + * different zoneid) then we should not tell the driver + * to join unless this is the first ilm for the group. */ - if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, - ALL_ZONES) != ilm) { + if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 && + ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) { continue; } @@ -1414,38 +1268,42 @@ ill_recover_multicast(ill_t *ill) else igmp_joingroup(ilm); - (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr); + (void) ip_ll_multireq(ill, &ilm->ilm_v6addr, + DL_ENABMULTI_REQ); } } - ill_ilm_walker_rele(ill); - + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); } /* * The opposite of ill_recover_multicast() -- leaves all multicast groups * that were explicitly joined. + * + * This is only needed for IPMP when the cast_ill changes since that + * change is invisible to the ilm. Other interface changes are handled + * by conn_update_ill. */ void ill_leave_multicast(ill_t *ill) { ilm_t *ilm; - ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - ill->ill_need_recover_multicast = 1; - ill_ilm_walker_hold(ill); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* - * Check how many ipif's that have members in this group - - * if more then one we make sure that this entry is first - * in the list. + * If we have more then one ilm for the group (e.g., with + * different zoneid) then we should not tell the driver + * to leave unless this is the first ilm for the group. 
*/ - if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, - ALL_ZONES) != ilm) { + if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 && + ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) { continue; } @@ -1460,126 +1318,186 @@ ill_leave_multicast(ill_t *ill) else igmp_leavegroup(ilm); - (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr); + (void) ip_ll_multireq(ill, &ilm->ilm_v6addr, + DL_DISABMULTI_REQ); } } - ill_ilm_walker_rele(ill); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); } -/* Find an ilm for matching the ill */ -ilm_t * -ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid) +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid. + */ +boolean_t +ill_hasmembers_v6(ill_t *ill, const in6_addr_t *v6group) +{ + ilm_t *ilm; + + rw_enter(&ill->ill_mcast_lock, RW_READER); + ilm = ilm_lookup(ill, v6group, ALL_ZONES); + rw_exit(&ill->ill_mcast_lock); + return (ilm != NULL); +} + +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid. + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. + */ +boolean_t +ill_hasmembers_v4(ill_t *ill, ipaddr_t group) { in6_addr_t v6group; - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + return (ill_hasmembers_v6(ill, &v6group)); +} + +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid except skipzone. 
+ */ +boolean_t +ill_hasmembers_otherzones_v6(ill_t *ill, const in6_addr_t *v6group, + zoneid_t skipzone) +{ + ilm_t *ilm; - return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid)); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && + ilm->ilm_zoneid != skipzone) { + rw_exit(&ill->ill_mcast_lock); + return (B_TRUE); + } + } + rw_exit(&ill->ill_mcast_lock); + return (B_FALSE); } /* - * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be - * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match - * against any ill in the group. However, if `restrict_solicited' is set, - * then specifically for IPv6 solicited-node multicast, the match will be - * restricted to the specified `ill'. + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid except skipzone. + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilm_t * -ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, - boolean_t restrict_solicited, zoneid_t zoneid) +boolean_t +ill_hasmembers_otherzones_v4(ill_t *ill, ipaddr_t group, zoneid_t skipzone) { - ilm_t *ilm; - ilm_walker_t ilw; - boolean_t restrict_ill = B_FALSE; + in6_addr_t v6group; - /* - * In general, underlying interfaces cannot have multicast memberships - * and thus lookups always match across the illgrp. However, we must - * allow IPv6 solicited-node multicast memberships on underlying - * interfaces, and thus an IPMP meta-interface and one of its - * underlying ills may have the same solicited-node multicast address. - * In that case, we need to restrict the lookup to the requested ill. 
- * However, we may receive packets on an underlying interface that - * are for the corresponding IPMP interface's solicited-node multicast - * address, and thus in that case we need to match across the group -- - * hence the unfortunate `restrict_solicited' argument. - */ - if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited) - restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill)); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + return (ill_hasmembers_otherzones_v6(ill, &v6group, skipzone)); +} - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) - continue; - if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) - continue; - if (!restrict_ill || ill == (ill->ill_isv6 ? - ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) { - break; +/* + * Interface used by IP input. + * Returns the next numerically larger zoneid that has a member. If none exist + * then returns -1 (ALL_ZONES). + * The normal usage is for the caller to start with a -1 zoneid (ALL_ZONES) + * to find the first zoneid which has a member, and then pass that in for + * subsequent calls until ALL_ZONES is returned. + * + * The implementation of ill_hasmembers_nextzone() assumes the ilms + * are sorted by zoneid for efficiency. + */ +zoneid_t +ill_hasmembers_nextzone_v6(ill_t *ill, const in6_addr_t *v6group, + zoneid_t zoneid) +{ + ilm_t *ilm; + + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && + ilm->ilm_zoneid > zoneid) { + zoneid = ilm->ilm_zoneid; + rw_exit(&ill->ill_mcast_lock); + return (zoneid); } } - ilm_walker_finish(&ilw); - return (ilm); + rw_exit(&ill->ill_mcast_lock); + return (ALL_ZONES); } /* - * Find an ilm for the ipif. Only needed for IPv4 which does - * ipif specific socket options. + * Interface used by IP input. + * Returns the next numerically larger zoneid that has a member. 
If none exist + * then returns -1 (ALL_ZONES). + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilm_t * -ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group) +zoneid_t +ill_hasmembers_nextzone_v4(ill_t *ill, ipaddr_t group, zoneid_t zoneid) { - ilm_t *ilm; - ilm_walker_t ilw; + in6_addr_t v6group; - ilm = ilm_walker_start(&ilw, ipif->ipif_ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group) - break; + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + + return (ill_hasmembers_nextzone_v6(ill, &v6group, zoneid)); +} + +/* + * Find an ilm matching the ill, group, and zoneid. + */ +static ilm_t * +ilm_lookup(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid) +{ + ilm_t *ilm; + + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); + + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) + continue; + if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) + continue; + + ASSERT(ilm->ilm_ill == ill); + return (ilm); } - ilm_walker_finish(&ilw); - return (ilm); + return (NULL); } /* * How many members on this ill? + * Since each shared-IP zone has a separate ilm for the same group/ill + * we can have several. 
*/ -int -ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group) +static int +ilm_numentries(ill_t *ill, const in6_addr_t *v6group) { ilm_t *ilm; int i = 0; - mutex_enter(&ill->ill_lock); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) { i++; } } - mutex_exit(&ill->ill_lock); return (i); } /* Caller guarantees that the group is not already on the list */ static ilm_t * -ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, +ilm_add(ill_t *ill, const in6_addr_t *v6group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid) { - ill_t *ill = ipif->ipif_ill; ilm_t *ilm; ilm_t *ilm_cur; ilm_t **ilm_ptpn; - ASSERT(IAM_WRITER_IPIF(ipif)); - + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); ilm = GETSTRUCT(ilm_t, 1); if (ilm == NULL) return (NULL); @@ -1596,44 +1514,23 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, ilm->ilm_timer = INFINITY; ilm->ilm_rtx.rtx_timer = INFINITY; - /* - * IPv4 Multicast groups are joined using ipif. - * IPv6 Multicast groups are joined using ill. 
- */ - if (ill->ill_isv6) { - ilm->ilm_ill = ill; - ilm->ilm_ipif = NULL; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ill->ill_ilm_cnt++; - } else { - ASSERT(ilm->ilm_zoneid == ipif->ipif_zoneid); - ilm->ilm_ipif = ipif; - ilm->ilm_ill = NULL; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - ipif->ipif_ilm_cnt++; - } + ilm->ilm_ill = ill; + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, + (char *), "ilm", (void *), ilm); + ill->ill_ilm_cnt++; ASSERT(ill->ill_ipst); ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */ - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); + /* The ill/ipif could have just been marked as condemned */ /* - * Grab lock to give consistent view to readers - */ - mutex_enter(&ill->ill_lock); - /* - * All ilms in the same zone are contiguous in the ill_ilm list. - * The loops in ip_proto_input() and ip_wput_local() use this to avoid - * sending duplicates up when two applications in the same zone join the - * same group on different logical interfaces. + * To make ill_hasmembers_nextzone_v6 work we keep the list + * sorted by zoneid. 
*/ ilm_cur = ill->ill_ilm; ilm_ptpn = &ill->ill_ilm; - while (ilm_cur != NULL && ilm_cur->ilm_zoneid != ilm->ilm_zoneid) { + while (ilm_cur != NULL && ilm_cur->ilm_zoneid < ilm->ilm_zoneid) { ilm_ptpn = &ilm_cur->ilm_next; ilm_cur = ilm_cur->ilm_next; } @@ -1653,7 +1550,6 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, ilm->ilm_fmode = MODE_IS_EXCLUDE; } - mutex_exit(&ill->ill_lock); return (ilm); } @@ -1668,118 +1564,40 @@ ilm_inactive(ilm_t *ilm) mi_free((char *)ilm); } -void -ilm_walker_cleanup(ill_t *ill) -{ - ilm_t **ilmp; - ilm_t *ilm; - boolean_t need_wakeup = B_FALSE; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - ASSERT(ill->ill_ilm_walker_cnt == 0); - - ilmp = &ill->ill_ilm; - while (*ilmp != NULL) { - if ((*ilmp)->ilm_flags & ILM_DELETED) { - ilm = *ilmp; - *ilmp = ilm->ilm_next; - /* - * check if there are any pending FREE or unplumb - * operations that need to be restarted. - */ - if (ilm->ilm_ipif != NULL) { - /* - * IPv4 ilms hold a ref on the ipif. - */ - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ilm->ilm_ipif->ipif_ilm_cnt--; - if (IPIF_FREE_OK(ilm->ilm_ipif)) - need_wakeup = B_TRUE; - } else { - /* - * IPv6 ilms hold a ref on the ill. - */ - ASSERT(ilm->ilm_ill == ill); - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ASSERT(ill->ill_ilm_cnt > 0); - ill->ill_ilm_cnt--; - if (ILL_FREE_OK(ill)) - need_wakeup = B_TRUE; - } - ilm_inactive(ilm); /* frees ilm */ - } else { - ilmp = &(*ilmp)->ilm_next; - } - } - ill->ill_ilm_cleanup_reqd = 0; - if (need_wakeup) - ipif_ill_refrele_tail(ill); - else - mutex_exit(&ill->ill_lock); -} - /* * Unlink ilm and free it. 
*/ static void ilm_delete(ilm_t *ilm) { - ill_t *ill; + ill_t *ill = ilm->ilm_ill; ilm_t **ilmp; boolean_t need_wakeup; - - if (ilm->ilm_ipif != NULL) { - ASSERT(IAM_WRITER_IPIF(ilm->ilm_ipif)); - ASSERT(ilm->ilm_ill == NULL); - ill = ilm->ilm_ipif->ipif_ill; - ASSERT(!ill->ill_isv6); - } else { - ASSERT(IAM_WRITER_ILL(ilm->ilm_ill)); - ASSERT(ilm->ilm_ipif == NULL); - ill = ilm->ilm_ill; - ASSERT(ill->ill_isv6); - } /* * Delete under lock protection so that readers don't stumble * on bad ilm_next */ - mutex_enter(&ill->ill_lock); - if (ill->ill_ilm_walker_cnt != 0) { - ilm->ilm_flags |= ILM_DELETED; - ill->ill_ilm_cleanup_reqd = 1; - mutex_exit(&ill->ill_lock); - return; - } + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); for (ilmp = &ill->ill_ilm; *ilmp != ilm; ilmp = &(*ilmp)->ilm_next) - ; + ; + *ilmp = ilm->ilm_next; + mutex_enter(&ill->ill_lock); /* - * if we are the last reference to the ipif (for IPv4 ilms) - * or the ill (for IPv6 ilms), we may need to wakeup any - * pending FREE or unplumb operations. + * if we are the last reference to the ill, we may need to wakeup any + * pending FREE or unplumb operations. This is because conn_update_ill + * bails if there is a ilg_delete_all in progress. 
*/ need_wakeup = B_FALSE; - if (ilm->ilm_ipif != NULL) { - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ilm->ilm_ipif->ipif_ilm_cnt--; - if (IPIF_FREE_OK(ilm->ilm_ipif)) - need_wakeup = B_TRUE; - } else { - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ASSERT(ill->ill_ilm_cnt > 0); - ill->ill_ilm_cnt--; - if (ILL_FREE_OK(ill)) - need_wakeup = B_TRUE; - } + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, + (char *), "ilm", (void *), ilm); + ASSERT(ill->ill_ilm_cnt > 0); + ill->ill_ilm_cnt--; + if (ILL_FREE_OK(ill)) + need_wakeup = B_TRUE; ilm_inactive(ilm); /* frees this ilm */ @@ -1791,185 +1609,103 @@ ilm_delete(ilm_t *ilm) } } -/* Increment the ILM walker count for `ill' */ -static void -ill_ilm_walker_hold(ill_t *ill) -{ - mutex_enter(&ill->ill_lock); - ill->ill_ilm_walker_cnt++; - mutex_exit(&ill->ill_lock); -} - -/* Decrement the ILM walker count for `ill' */ -static void -ill_ilm_walker_rele(ill_t *ill) -{ - mutex_enter(&ill->ill_lock); - ill->ill_ilm_walker_cnt--; - if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd) - ilm_walker_cleanup(ill); /* drops ill_lock */ - else - mutex_exit(&ill->ill_lock); -} - -/* - * Start walking the ILMs associated with `ill'; the first ILM in the walk - * (if any) is returned. State associated with the walk is stored in `ilw'. - * Note that walks associated with interfaces under IPMP also walk the ILMs - * on the associated IPMP interface; this is handled transparently to callers - * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP - * interface; the only exception is to support IPv6 test addresses, which - * require ILMs for their associated solicited-node multicast addresses.) 
- */ -ilm_t * -ilm_walker_start(ilm_walker_t *ilw, ill_t *ill) -{ - ilw->ilw_ill = ill; - if (IS_UNDER_IPMP(ill)) - ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); - else - ilw->ilw_ipmp_ill = NULL; - - ill_ilm_walker_hold(ill); - if (ilw->ilw_ipmp_ill != NULL) - ill_ilm_walker_hold(ilw->ilw_ipmp_ill); - - if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL) - ilw->ilw_walk_ill = ilw->ilw_ipmp_ill; - else - ilw->ilw_walk_ill = ilw->ilw_ill; - - return (ilm_walker_step(ilw, NULL)); -} - /* - * Helper function for ilm_walker_step() that returns the next ILM - * associated with `ilw', regardless of whether it's deleted. + * Lookup an ill based on the group, ifindex, ifaddr, and zoneid. + * Applies to both IPv4 and IPv6, although ifaddr is only used with + * IPv4. + * Returns an error for IS_UNDER_IPMP and VNI interfaces. + * On error it sets *errorp. */ -static ilm_t * -ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm) +static ill_t * +ill_mcast_lookup(const in6_addr_t *group, ipaddr_t ifaddr, uint_t ifindex, + zoneid_t zoneid, ip_stack_t *ipst, int *errorp) { - if (ilm == NULL) - return (ilw->ilw_walk_ill->ill_ilm); + ill_t *ill; + ipaddr_t v4group; - if (ilm->ilm_next != NULL) - return (ilm->ilm_next); + if (IN6_IS_ADDR_V4MAPPED(group)) { + IN6_V4MAPPED_TO_IPADDR(group, v4group); - if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) { - ilw->ilw_walk_ill = ilw->ilw_ill; - /* - * It's possible that ilw_ill left the group during our walk, - * so we can't ASSERT() that it's under IPMP. Callers that - * care will be writer on the IPSQ anyway. - */ - return (ilw->ilw_walk_ill->ill_ilm); - } - return (NULL); -} + if (ifindex != 0) { + ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, + B_FALSE, ipst); + } else if (ifaddr != INADDR_ANY) { + ipif_t *ipif; -/* - * Step to the next ILM associated with `ilw'. 
- */ -ilm_t * -ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm) -{ - while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) { - if (!(ilm->ilm_flags & ILM_DELETED)) - break; - } - return (ilm); -} - -/* - * Finish the ILM walk associated with `ilw'. - */ -void -ilm_walker_finish(ilm_walker_t *ilw) -{ - ill_ilm_walker_rele(ilw->ilw_ill); - if (ilw->ilw_ipmp_ill != NULL) { - ill_ilm_walker_rele(ilw->ilw_ipmp_ill); - ill_refrele(ilw->ilw_ipmp_ill); + ipif = ipif_lookup_addr(ifaddr, NULL, zoneid, ipst); + if (ipif == NULL) { + ill = NULL; + } else { + ill = ipif->ipif_ill; + ill_refhold(ill); + ipif_refrele(ipif); + } + } else { + ill = ill_lookup_group_v4(v4group, zoneid, ipst, NULL, + NULL); + } + } else { + if (ifindex != 0) { + ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, + B_TRUE, ipst); + } else { + ill = ill_lookup_group_v6(group, zoneid, ipst, NULL, + NULL); + } } - bzero(&ilw, sizeof (ilw)); -} - -/* - * Looks up the appropriate ipif given a v4 multicast group and interface - * address. On success, returns 0, with *ipifpp pointing to the found - * struct. On failure, returns an errno and *ipifpp is NULL. 
- */ -int -ip_opt_check(conn_t *connp, ipaddr_t group, ipaddr_t src, ipaddr_t ifaddr, - uint_t *ifindexp, mblk_t *first_mp, ipsq_func_t func, ipif_t **ipifpp) -{ - ipif_t *ipif; - int err = 0; - zoneid_t zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - if (!CLASSD(group) || CLASSD(src)) { - return (EINVAL); + if (ill == NULL) { + if (ifindex != 0) + *errorp = ENXIO; + else + *errorp = EADDRNOTAVAIL; + return (NULL); } - *ipifpp = NULL; - - zoneid = IPCL_ZONEID(connp); - - ASSERT(!(ifaddr != INADDR_ANY && ifindexp != NULL && *ifindexp != 0)); - if (ifaddr != INADDR_ANY) { - ipif = ipif_lookup_addr(ifaddr, NULL, zoneid, - CONNP_TO_WQ(connp), first_mp, func, &err, ipst); - if (err != 0 && err != EINPROGRESS) - err = EADDRNOTAVAIL; - } else if (ifindexp != NULL && *ifindexp != 0) { - ipif = ipif_lookup_on_ifindex(*ifindexp, B_FALSE, zoneid, - CONNP_TO_WQ(connp), first_mp, func, &err, ipst); - } else { - ipif = ipif_lookup_group(group, zoneid, ipst); - if (ipif == NULL) - return (EADDRNOTAVAIL); + /* operation not supported on the virtual network interface */ + if (IS_UNDER_IPMP(ill) || IS_VNI(ill)) { + ill_refrele(ill); + *errorp = EINVAL; + return (NULL); } - if (ipif == NULL) - return (err); - - *ipifpp = ipif; - return (0); + return (ill); } /* - * Looks up the appropriate ill (or ipif if v4mapped) given an interface - * index and IPv6 multicast group. On success, returns 0, with *illpp (or - * *ipifpp if v4mapped) pointing to the found struct. On failure, returns - * an errno and *illpp and *ipifpp are undefined. + * Looks up the appropriate ill given an interface index (or interface address) + * and multicast group. On success, returns 0, with *illpp pointing to the + * found struct. On failure, returns an errno and *illpp is set to NULL. + * + * Returns an error for IS_UNDER_IPMP and VNI interfaces. + * + * Handles both IPv4 and IPv6. The ifaddr argument only applies in the + * case of IPv4. 
*/ int -ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group, - const in6_addr_t *v6src, ipaddr_t *v4src, boolean_t *isv6, int ifindex, - mblk_t *first_mp, ipsq_func_t func, ill_t **illpp, ipif_t **ipifpp) +ip_opt_check(conn_t *connp, const in6_addr_t *v6group, + const in6_addr_t *v6src, ipaddr_t ifaddr, uint_t ifindex, ill_t **illpp) { boolean_t src_unspec; ill_t *ill = NULL; - ipif_t *ipif = NULL; - int err; - zoneid_t zoneid = connp->conn_zoneid; - queue_t *wq = CONNP_TO_WQ(connp); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error = 0; + + *illpp = NULL; src_unspec = IN6_IS_ADDR_UNSPECIFIED(v6src); if (IN6_IS_ADDR_V4MAPPED(v6group)) { + ipaddr_t v4group; + ipaddr_t v4src; + if (!IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec) return (EINVAL); - IN6_V4MAPPED_TO_IPADDR(v6group, *v4group); + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); if (src_unspec) { - *v4src = INADDR_ANY; + v4src = INADDR_ANY; } else { - IN6_V4MAPPED_TO_IPADDR(v6src, *v4src); + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); } - if (!CLASSD(*v4group) || CLASSD(*v4src)) + if (!CLASSD(v4group) || CLASSD(v4src)) return (EINVAL); - *ipifpp = NULL; - *isv6 = B_FALSE; } else { if (IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec) return (EINVAL); @@ -1977,43 +1713,17 @@ ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group, IN6_IS_ADDR_MULTICAST(v6src)) { return (EINVAL); } - *illpp = NULL; - *isv6 = B_TRUE; } - if (ifindex == 0) { - if (*isv6) - ill = ill_lookup_group_v6(v6group, zoneid, ipst); - else - ipif = ipif_lookup_group(*v4group, zoneid, ipst); - if (ill == NULL && ipif == NULL) - return (EADDRNOTAVAIL); - } else { - if (*isv6) { - ill = ill_lookup_on_ifindex(ifindex, B_TRUE, - wq, first_mp, func, &err, ipst); - if (ill != NULL && - !ipif_lookup_zoneid(ill, zoneid, 0, NULL)) { - ill_refrele(ill); - ill = NULL; - err = EADDRNOTAVAIL; - } - } else { - ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, - zoneid, wq, first_mp, func, &err, ipst); - } - if (ill 
== NULL && ipif == NULL) - return (err); - } - - *ipifpp = ipif; + ill = ill_mcast_lookup(v6group, ifaddr, ifindex, IPCL_ZONEID(connp), + ipst, &error); *illpp = ill; - return (0); + return (error); } static int ip_get_srcfilter(conn_t *connp, struct group_filter *gf, - struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped) + struct ip_msfilter *imsf, const struct in6_addr *group, boolean_t issin6) { ilg_t *ilg; int i, numsrc, fmode, outsrcs; @@ -2022,24 +1732,30 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, struct in_addr *addrp; slist_t *fp; boolean_t is_v4only_api; - - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ipif(connp, grp, ipif); - if (ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } + ipaddr_t ifaddr; + uint_t ifindex; if (gf == NULL) { ASSERT(imsf != NULL); - ASSERT(!isv4mapped); + ASSERT(!issin6); is_v4only_api = B_TRUE; outsrcs = imsf->imsf_numsrc; + ifaddr = imsf->imsf_interface.s_addr; + ifindex = 0; } else { ASSERT(imsf == NULL); is_v4only_api = B_FALSE; outsrcs = gf->gf_numsrc; + ifaddr = INADDR_ANY; + ifindex = gf->gf_interface; + } + + /* No need to use ill_mcast_serializer for the reader */ + rw_enter(&connp->conn_ilg_lock, RW_READER); + ilg = ilg_lookup(connp, group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (EADDRNOTAVAIL); } /* @@ -2055,7 +1771,7 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, for (i = 0; i < outsrcs; i++) { if (i == fp->sl_numsrc) break; - if (isv4mapped) { + if (issin6) { sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i]; sin6->sin6_family = AF_INET6; sin6->sin6_addr = fp->sl_addr[i]; @@ -2082,57 +1798,18 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, gf->gf_fmode = fmode; } - mutex_exit(&connp->conn_lock); - - return (0); -} - -static int -ip_get_srcfilter_v6(conn_t *connp, struct group_filter *gf, - const struct in6_addr *grp, ill_t *ill) -{ - ilg_t *ilg; - int i; - struct sockaddr_storage 
*sl; - struct sockaddr_in6 *sin6; - slist_t *fp; - - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ill_v6(connp, grp, ill); - if (ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } - - /* - * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE - * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE. - * So we need to translate here. - */ - gf->gf_fmode = (ilg->ilg_fmode == MODE_IS_INCLUDE) ? - MCAST_INCLUDE : MCAST_EXCLUDE; - if ((fp = ilg->ilg_filter) == NULL) { - gf->gf_numsrc = 0; - } else { - for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) { - if (i == fp->sl_numsrc) - break; - sin6 = (struct sockaddr_in6 *)sl; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = fp->sl_addr[i]; - } - gf->gf_numsrc = fp->sl_numsrc; - } - - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (0); } +/* + * Common for IPv4 and IPv6. + */ static int ip_set_srcfilter(conn_t *connp, struct group_filter *gf, - struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped) + struct ip_msfilter *imsf, const struct in6_addr *group, ill_t *ill, + boolean_t issin6) { ilg_t *ilg; int i, err, infmode, new_fmode; @@ -2143,20 +1820,27 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, slist_t *orig_filter = NULL; slist_t *new_filter = NULL; mcast_record_t orig_fmode; - boolean_t leave_grp, is_v4only_api; + boolean_t leave_group, is_v4only_api; ilg_stat_t ilgstat; + ilm_t *ilm; + ipaddr_t ifaddr; + uint_t ifindex; if (gf == NULL) { ASSERT(imsf != NULL); - ASSERT(!isv4mapped); + ASSERT(!issin6); is_v4only_api = B_TRUE; insrcs = imsf->imsf_numsrc; infmode = imsf->imsf_fmode; + ifaddr = imsf->imsf_interface.s_addr; + ifindex = 0; } else { ASSERT(imsf == NULL); is_v4only_api = B_FALSE; insrcs = gf->gf_numsrc; infmode = gf->gf_fmode; + ifaddr = INADDR_ANY; + ifindex = gf->gf_interface; } /* Make sure we can handle the source list */ @@ -2167,32 +1851,52 @@ ip_set_srcfilter(conn_t 
*connp, struct group_filter *gf, * setting the filter to (INCLUDE, NULL) is treated * as a request to leave the group. */ - leave_grp = (infmode == MCAST_INCLUDE && insrcs == 0); - - ASSERT(IAM_WRITER_IPIF(ipif)); + leave_group = (infmode == MCAST_INCLUDE && insrcs == 0); - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ipif(connp, grp, ipif); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, group, ifaddr, ifindex); if (ilg == NULL) { /* * if the request was actually to leave, and we * didn't find an ilg, there's nothing to do. */ - if (!leave_grp) - ilg = conn_ilg_alloc(connp, &err); - if (leave_grp || ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : err); + if (leave_group) { + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + return (0); + } + ilg = conn_ilg_alloc(connp, &err); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + return (err); } ilgstat = ILGSTAT_NEW; - IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group); - ilg->ilg_ipif = ipif; - ilg->ilg_ill = NULL; - } else if (leave_grp) { + ilg->ilg_v6group = *group; + ilg->ilg_ill = ill; + ilg->ilg_ifaddr = ifaddr; + ilg->ilg_ifindex = ifindex; + } else if (leave_group) { + /* + * Make sure we have the correct serializer. The ill argument + * might not match ilg_ill. 
+ */ + ilg_refhold(ilg); + mutex_exit(&ill->ill_mcast_serializer); + ill = ilg->ilg_ill; + rw_exit(&connp->conn_ilg_lock); + + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti(grp, ipif, B_FALSE, B_TRUE); + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + mutex_exit(&ill->ill_mcast_serializer); return (0); } else { ilgstat = ILGSTAT_CHANGE; @@ -2203,7 +1907,8 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, } else { orig_filter = l_alloc_copy(ilg->ilg_filter); if (orig_filter == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); return (ENOMEM); } } @@ -2214,7 +1919,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, * we make any changes, so we can bail if it fails. */ if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); err = ENOMEM; goto free_and_exit; } @@ -2228,7 +1933,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, if (fp == NULL) { if (ilgstat == ILGSTAT_NEW) ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); err = ENOMEM; goto free_and_exit; } @@ -2236,7 +1941,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, fp = ilg->ilg_filter; } for (i = 0; i < insrcs; i++) { - if (isv4mapped) { + if (issin6) { sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i]; fp->sl_addr[i] = sin6->sin6_addr; } else { @@ -2263,177 +1968,70 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, /* * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. + * so we can release conn_ilg_lock now. 
*/ new_fmode = ilg->ilg_fmode; l_copy(ilg->ilg_filter, new_filter); - mutex_exit(&connp->conn_lock); - - err = ip_addmulti(grp, ipif, ilgstat, new_fmode, new_filter); - if (err != 0) { - /* - * Restore the original filter state, or delete the - * newly-created ilg. We need to look up the ilg - * again, though, since we've not been holding the - * conn_lock. - */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, grp, ipif); - ASSERT(ilg != NULL); - if (ilgstat == ILGSTAT_NEW) { - ilg_delete(connp, ilg, NULL); - } else { - ilg->ilg_fmode = orig_fmode; - if (SLIST_IS_EMPTY(orig_filter)) { - CLEAR_SLIST(ilg->ilg_filter); - } else { - /* - * We didn't free the filter, even if we - * were trying to make the source list empty; - * so if orig_filter isn't empty, the ilg - * must still have a filter alloc'd. - */ - l_copy(orig_filter, ilg->ilg_filter); - } - } - mutex_exit(&connp->conn_lock); - } - -free_and_exit: - l_free(orig_filter); - l_free(new_filter); - - return (err); -} - -static int -ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, - const struct in6_addr *grp, ill_t *ill) -{ - ilg_t *ilg; - int i, orig_fmode, new_fmode, err; - slist_t *orig_filter = NULL; - slist_t *new_filter = NULL; - struct sockaddr_storage *sl; - struct sockaddr_in6 *sin6; - boolean_t leave_grp; - ilg_stat_t ilgstat; - - /* Make sure we can handle the source list */ - if (gf->gf_numsrc > MAX_FILTER_SIZE) - return (ENOBUFS); + rw_exit(&connp->conn_ilg_lock); /* - * setting the filter to (INCLUDE, NULL) is treated - * as a request to leave the group. + * Now update the ill. We wait to do this until after the ilg + * has been updated because we need to update the src filter + * info for the ill, which involves looking at the status of + * all the ilgs associated with this group/interface pair. 
*/ - leave_grp = (gf->gf_fmode == MCAST_INCLUDE && gf->gf_numsrc == 0); - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, grp, ill); - if (ilg == NULL) { - /* - * if the request was actually to leave, and we - * didn't find an ilg, there's nothing to do. - */ - if (!leave_grp) - ilg = conn_ilg_alloc(connp, &err); - if (leave_grp || ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : err); - } - ilgstat = ILGSTAT_NEW; - ilg->ilg_v6group = *grp; - ilg->ilg_ipif = NULL; - ilg->ilg_ill = ill; - } else if (leave_grp) { - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE, - B_TRUE); - return (0); - } else { - ilgstat = ILGSTAT_CHANGE; - /* preserve existing state in case ip_addmulti() fails */ - orig_fmode = ilg->ilg_fmode; - if (ilg->ilg_filter == NULL) { - orig_filter = NULL; - } else { - orig_filter = l_alloc_copy(ilg->ilg_filter); - if (orig_filter == NULL) { - mutex_exit(&connp->conn_lock); - return (ENOMEM); - } - } - } + ilm = ip_addmulti_serial(group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &err); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); /* - * Alloc buffer to copy new state into (see below) before - * we make any changes, so we can bail if it fails. + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. 
*/ - if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); - err = ENOMEM; + ilg = ilg_lookup(connp, group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) { + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + } + err = ENXIO; goto free_and_exit; } - if (gf->gf_numsrc == 0) { - CLEAR_SLIST(ilg->ilg_filter); - } else { - slist_t *fp; - if (ilg->ilg_filter == NULL) { - fp = l_alloc(); - if (fp == NULL) { - if (ilgstat == ILGSTAT_NEW) - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - err = ENOMEM; - goto free_and_exit; - } + if (ilm != NULL) { + /* Succeeded. Update the ilg to point at the ilm */ + if (ilgstat == ILGSTAT_NEW) { + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ } else { - fp = ilg->ilg_filter; - } - for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) { - sin6 = (struct sockaddr_in6 *)sl; - fp->sl_addr[i] = sin6->sin6_addr; + /* + * ip_addmulti didn't get a held ilm for + * ILGSTAT_CHANGE; ilm_refcnt was unchanged. + */ + ASSERT(ilg->ilg_ilm == ilm); } - fp->sl_numsrc = gf->gf_numsrc; - ilg->ilg_filter = fp; - } - /* - * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE - * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE. - * So we need to translate here. - */ - ilg->ilg_fmode = (gf->gf_fmode == MCAST_INCLUDE) ? - MODE_IS_INCLUDE : MODE_IS_EXCLUDE; - - /* - * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. - */ - new_fmode = ilg->ilg_fmode; - l_copy(ilg->ilg_filter, new_filter); - - mutex_exit(&connp->conn_lock); - - err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode, - new_filter); - if (err != 0) { + } else { + ASSERT(err != 0); /* + * Failed to allocate the ilm. * Restore the original filter state, or delete the - * newly-created ilg. 
We need to look up the ilg - * again, though, since we've not been holding the - * conn_lock. + * newly-created ilg. + * If ENETDOWN just clear ill_ilg since so that we + * will rejoin when the ill comes back; don't report ENETDOWN + * to application. */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, grp, ill); - ASSERT(ilg != NULL); if (ilgstat == ILGSTAT_NEW) { - ilg_delete(connp, ilg, NULL); + if (err == ENETDOWN) { + ilg->ilg_ill = NULL; + err = 0; + } else { + ilg_delete(connp, ilg, NULL); + } } else { ilg->ilg_fmode = orig_fmode; if (SLIST_IS_EMPTY(orig_filter)) { @@ -2448,10 +2046,11 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, l_copy(orig_filter, ilg->ilg_filter); } } - mutex_exit(&connp->conn_lock); } + rw_exit(&connp->conn_ilg_lock); free_and_exit: + mutex_exit(&ill->ill_mcast_serializer); l_free(orig_filter); l_free(new_filter); @@ -2475,11 +2074,17 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t isv6, is_v4only_api, getcmd; struct sockaddr_in *gsin; struct sockaddr_in6 *gsin6; - ipaddr_t v4grp; - in6_addr_t v6grp; + ipaddr_t v4group; + in6_addr_t v6group; struct group_filter *gf = NULL; struct ip_msfilter *imsf = NULL; mblk_t *ndp; + ill_t *ill; + + connp = Q_TO_CONN(q); + err = ip_msfilter_ill(connp, mp, ipip, &ill); + if (err != 0) + return (err); if (data_mp->b_cont != NULL) { if ((ndp = msgpullup(data_mp, -1)) == NULL) @@ -2519,132 +2124,119 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (datalen < expsize) return (EINVAL); - connp = Q_TO_CONN(q); - - /* operation not supported on the virtual network interface */ - if (IS_VNI(ipif->ipif_ill)) - return (EINVAL); - if (isv6) { - ill_t *ill = ipif->ipif_ill; - ill_refhold(ill); - gsin6 = (struct sockaddr_in6 *)&gf->gf_group; - v6grp = gsin6->sin6_addr; - if (getcmd) - err = ip_get_srcfilter_v6(connp, gf, &v6grp, ill); - else - err = ip_set_srcfilter_v6(connp, gf, &v6grp, ill); - - 
ill_refrele(ill); + v6group = gsin6->sin6_addr; + if (getcmd) { + err = ip_get_srcfilter(connp, gf, NULL, &v6group, + B_TRUE); + } else { + err = ip_set_srcfilter(connp, gf, NULL, &v6group, ill, + B_TRUE); + } } else { - boolean_t isv4mapped = B_FALSE; + boolean_t issin6 = B_FALSE; if (is_v4only_api) { - v4grp = (ipaddr_t)imsf->imsf_multiaddr.s_addr; + v4group = (ipaddr_t)imsf->imsf_multiaddr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { if (gf->gf_group.ss_family == AF_INET) { gsin = (struct sockaddr_in *)&gf->gf_group; - v4grp = (ipaddr_t)gsin->sin_addr.s_addr; + v4group = (ipaddr_t)gsin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { gsin6 = (struct sockaddr_in6 *)&gf->gf_group; IN6_V4MAPPED_TO_IPADDR(&gsin6->sin6_addr, - v4grp); - isv4mapped = B_TRUE; + v4group); + issin6 = B_TRUE; } } - if (getcmd) - err = ip_get_srcfilter(connp, gf, imsf, v4grp, ipif, - isv4mapped); + /* + * INADDR_ANY is represented as the IPv6 unspecifed addr. + */ + if (v4group == INADDR_ANY) + v6group = ipv6_all_zeros; else - err = ip_set_srcfilter(connp, gf, imsf, v4grp, ipif, - isv4mapped); + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); + + if (getcmd) { + err = ip_get_srcfilter(connp, gf, imsf, &v6group, + issin6); + } else { + err = ip_set_srcfilter(connp, gf, imsf, &v6group, ill, + issin6); + } } + ill_refrele(ill); return (err); } /* - * Finds the ipif based on information in the ioctl headers. Needed to make - * ip_process_ioctl() happy (it needs to know the ipif for IPI_WR-flagged - * ioctls prior to calling the ioctl's handler function). + * Determine the ill for the SIOC*MSFILTER ioctls + * + * Returns an error for IS_UNDER_IPMP interfaces. + * + * Finds the ill based on information in the ioctl headers. 
*/ -int -ip_extract_msfilter(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) +static int +ip_msfilter_ill(conn_t *connp, mblk_t *mp, const ip_ioctl_cmd_t *ipip, + ill_t **illp) { int cmd = ipip->ipi_cmd; int err = 0; - conn_t *connp; - ipif_t *ipif; + ill_t *ill; /* caller has verified this mblk exists */ char *dbuf = (char *)mp->b_cont->b_cont->b_rptr; struct ip_msfilter *imsf; struct group_filter *gf; - ipaddr_t v4addr, v4grp; - in6_addr_t v6grp; + ipaddr_t v4addr, v4group; + in6_addr_t v6group; uint32_t index; - zoneid_t zoneid; ip_stack_t *ipst; - connp = Q_TO_CONN(q); - zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; + *illp = NULL; + /* don't allow multicast operations on a tcp conn */ if (IPCL_IS_TCP(connp)) return (ENOPROTOOPT); if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) { /* don't allow v4-specific ioctls on v6 socket */ - if (connp->conn_af_isv6) + if (connp->conn_family == AF_INET6) return (EAFNOSUPPORT); imsf = (struct ip_msfilter *)dbuf; v4addr = imsf->imsf_interface.s_addr; - v4grp = imsf->imsf_multiaddr.s_addr; - if (v4addr == INADDR_ANY) { - ipif = ipif_lookup_group(v4grp, zoneid, ipst); - if (ipif == NULL) - err = EADDRNOTAVAIL; - } else { - ipif = ipif_lookup_addr(v4addr, NULL, zoneid, q, mp, - func, &err, ipst); - } + v4group = imsf->imsf_multiaddr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); + ill = ill_mcast_lookup(&v6group, v4addr, 0, IPCL_ZONEID(connp), + ipst, &err); + if (ill == NULL && v4addr != INADDR_ANY) + err = ENXIO; } else { - boolean_t isv6 = B_FALSE; gf = (struct group_filter *)dbuf; index = gf->gf_interface; if (gf->gf_group.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; + sin6 = (struct sockaddr_in6 *)&gf->gf_group; - v6grp = sin6->sin6_addr; - if (IN6_IS_ADDR_V4MAPPED(&v6grp)) - IN6_V4MAPPED_TO_IPADDR(&v6grp, v4grp); - else - isv6 = B_TRUE; + v6group = sin6->sin6_addr; } else if (gf->gf_group.ss_family == AF_INET) { struct sockaddr_in *sin; + 
sin = (struct sockaddr_in *)&gf->gf_group; - v4grp = sin->sin_addr.s_addr; + v4group = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { return (EAFNOSUPPORT); } - if (index == 0) { - if (isv6) { - ipif = ipif_lookup_group_v6(&v6grp, zoneid, - ipst); - } else { - ipif = ipif_lookup_group(v4grp, zoneid, ipst); - } - if (ipif == NULL) - err = EADDRNOTAVAIL; - } else { - ipif = ipif_lookup_on_ifindex(index, isv6, zoneid, - q, mp, func, &err, ipst); - } + ill = ill_mcast_lookup(&v6group, INADDR_ANY, index, + IPCL_ZONEID(connp), ipst, &err); } - - ci->ci_ipif = ipif; + *illp = ill; return (err); } @@ -2695,6 +2287,7 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp) /* * Handle the following optmgmt: * IP_ADD_MEMBERSHIP must not have joined already + * IPV6_JOIN_GROUP must not have joined already * MCAST_JOIN_GROUP must not have joined already * IP_BLOCK_SOURCE must have joined already * MCAST_BLOCK_SOURCE must have joined already @@ -2702,91 +2295,15 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp) * MCAST_JOIN_SOURCE_GROUP may have joined already * * fmode and src parameters may be used to determine which option is - * being set, as follows (the IP_* and MCAST_* versions of each option - * are functionally equivalent): - * opt fmode src - * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE INADDR_ANY - * MCAST_JOIN_GROUP MODE_IS_EXCLUDE INADDR_ANY - * IP_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * - * Changing the filter mode is not allowed; if a matching ilg already - * exists and fmode != ilg->ilg_fmode, EINVAL is returned. - * - * Verifies that there is a source address of appropriate scope for - * the group; if not, EADDRNOTAVAIL is returned. - * - * The interface to be used may be identified by an address or by an - * index. A pointer to the index is passed; if it is NULL, use the - * address, otherwise, use the index. 
- */ -int -ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, - ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) -{ - ipif_t *ipif; - ipsq_t *ipsq; - int err = 0; - ill_t *ill; - - err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp, - ip_restart_optmgmt, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_add_group: no ipif for group 0x%x, " - "ifaddr 0x%x, ifindex %d\n", ntohl(group), - ntohl(ifaddr), (ifindexp == NULL) ? 0 : *ifindexp)); - } - return (err); - } - ASSERT(ipif != NULL); - - ill = ipif->ipif_ill; - /* Operation not supported on a virtual network interface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - - if (checkonly) { - /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. - */ - ipif_refrele(ipif); - return (0); - } - - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq, - NEW_OP); - - /* unspecified source addr => no source filtering */ - err = ilg_add(connp, group, ipif, fmode, src); - - IPSQ_EXIT(ipsq); - - ipif_refrele(ipif); - return (err); -} - -/* - * Handle the following optmgmt: - * IPV6_JOIN_GROUP must not have joined already - * MCAST_JOIN_GROUP must not have joined already - * MCAST_BLOCK_SOURCE must have joined already - * MCAST_JOIN_SOURCE_GROUP may have joined already - * - * fmode and src parameters may be used to determine which option is * being set, as follows (IPV6_JOIN_GROUP and MCAST_JOIN_GROUP options * are functionally equivalent): * opt fmode v6src + * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE unspecified * IPV6_JOIN_GROUP MODE_IS_EXCLUDE unspecified * MCAST_JOIN_GROUP MODE_IS_EXCLUDE unspecified + * IP_BLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr * MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v6 addr + * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr * MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v6 addr 
* * Changing the filter mode is not allowed; if a matching ilg already @@ -2795,47 +2312,29 @@ ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, * Verifies that there is a source address of appropriate scope for * the group; if not, EADDRNOTAVAIL is returned. * + * The interface to be used may be identified by an IPv4 address or by an + * interface index. + * * Handles IPv4-mapped IPv6 multicast addresses by associating them - * with the link-local ipif. Assumes that if v6group is v4-mapped, + * with the IPv4 address. Assumes that if v6group is v4-mapped, * v6src is also v4-mapped. */ int -ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly, - const in6_addr_t *v6group, int ifindex, mcast_record_t fmode, - const in6_addr_t *v6src, mblk_t *first_mp) +ip_opt_add_group(conn_t *connp, boolean_t checkonly, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src) { ill_t *ill; - ipif_t *ipif; char buf[INET6_ADDRSTRLEN]; - ipaddr_t v4group, v4src; - boolean_t isv6; - ipsq_t *ipsq; int err; - err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6, - ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif); + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, &ill); if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_add_group_v6: no ill for group %s/" - "index %d\n", inet_ntop(AF_INET6, v6group, buf, - sizeof (buf)), ifindex)); - } + ip1dbg(("ip_opt_add_group: no ill for group %s/" + "index %d\n", inet_ntop(AF_INET6, v6group, buf, + sizeof (buf)), ifindex)); return (err); } - ASSERT((!isv6 && ipif != NULL) || (isv6 && ill != NULL)); - - /* operation is not supported on the virtual network interface */ - if (isv6) { - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - } else { - if (IS_VNI(ipif->ipif_ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } if (checkonly) { /* @@ -2843,104 +2342,70 @@ ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly, * 
semantics. The error return case above if encountered * considered a good enough "check" here. */ - if (isv6) - ill_refrele(ill); - else - ipif_refrele(ipif); - return (0); - } - - if (!isv6) { - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ilg_add(connp, v4group, ipif, fmode, v4src); - IPSQ_EXIT(ipsq); - ipif_refrele(ipif); - } else { - IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ilg_add_v6(connp, v6group, ill, fmode, v6src); - IPSQ_EXIT(ipsq); ill_refrele(ill); + return (0); } + mutex_enter(&ill->ill_mcast_serializer); + err = ilg_add(connp, v6group, ifaddr, ifindex, ill, fmode, v6src); + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); return (err); } +/* + * Common for IPv6 and IPv4. + * Here we handle ilgs that are still attached to their original ill + * (the one ifaddr/ifindex points at), as well as detached ones. + * The detached ones might have been attached to some other ill. + */ static int -ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, ipif_t *ipif, - mcast_record_t fmode, ipaddr_t src) +ip_opt_delete_group_excl(conn_t *connp, const in6_addr_t *v6group, + ipaddr_t ifaddr, uint_t ifindex, mcast_record_t fmode, + const in6_addr_t *v6src) { ilg_t *ilg; - in6_addr_t v6src; - boolean_t leaving = B_FALSE; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* - * The ilg is valid only while we hold the conn lock. Once we drop - * the lock, another thread can locate another ilg on this connp, - * but on a different ipif, and delete it, and cause the ilg array - * to be reallocated and copied. Hence do the ilg_delete before - * dropping the lock. 
- */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } + boolean_t leaving; + ilm_t *ilm; + ill_t *ill; + int err = 0; - /* - * Decide if we're actually deleting the ilg or just removing a - * source filter address; if just removing an addr, make sure we - * aren't trying to change the filter mode, and that the addr is - * actually in our filter list already. If we're removing the - * last src in an include list, just delete the ilg. - */ - if (src == INADDR_ANY) { - v6src = ipv6_all_zeros; - leaving = B_TRUE; - } else { - int err = 0; - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - if (fmode != ilg->ilg_fmode) - err = EINVAL; - else if (ilg->ilg_filter == NULL || - !list_has_addr(ilg->ilg_filter, &v6src)) +retry: + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + /* + * Since we didn't have any ilg we now do the error checks + * to determine the best errno. 
+ */ + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, + &ill); + if (ill != NULL) { + /* The only error was a missing ilg for the group */ + ill_refrele(ill); err = EADDRNOTAVAIL; - if (err != 0) { - mutex_exit(&connp->conn_lock); - return (err); - } - if (fmode == MODE_IS_INCLUDE && - ilg->ilg_filter->sl_numsrc == 1) { - v6src = ipv6_all_zeros; - leaving = B_TRUE; } + return (err); } - ilg_delete(connp, ilg, &v6src); - mutex_exit(&connp->conn_lock); - - (void) ip_delmulti(group, ipif, B_FALSE, leaving); - return (0); -} - -static int -ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, - ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) -{ - ilg_t *ilg; - boolean_t leaving = B_TRUE; - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, v6group, ill); - if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); + /* If the ilg is attached then we serialize using that ill */ + ill = ilg->ilg_ill; + if (ill != NULL) { + /* Prevent the ill and ilg from being freed */ + ill_refhold(ill); + ilg_refhold(ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + if (ilg->ilg_condemned) { + /* Disappeared */ + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); + goto retry; + } } /* @@ -2950,198 +2415,107 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, * actually in our filter list already. If we're removing the * last src in an include list, just delete the ilg. 
*/ - if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { - int err = 0; + if (IN6_IS_ADDR_UNSPECIFIED(v6src)) { + leaving = B_TRUE; + } else { if (fmode != ilg->ilg_fmode) err = EINVAL; else if (ilg->ilg_filter == NULL || !list_has_addr(ilg->ilg_filter, v6src)) err = EADDRNOTAVAIL; if (err != 0) { - mutex_exit(&connp->conn_lock); - return (err); + if (ill != NULL) + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + goto done; } if (fmode == MODE_IS_INCLUDE && - ilg->ilg_filter->sl_numsrc == 1) + ilg->ilg_filter->sl_numsrc == 1) { + leaving = B_TRUE; v6src = NULL; - else + } else { leaving = B_FALSE; + } } + ilm = ilg->ilg_ilm; + if (leaving) + ilg->ilg_ilm = NULL; ilg_delete(connp, ilg, v6src); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE, - leaving); - - return (0); -} - -/* - * Handle the following optmgmt: - * IP_DROP_MEMBERSHIP will leave - * MCAST_LEAVE_GROUP will leave - * IP_UNBLOCK_SOURCE will not leave - * MCAST_UNBLOCK_SOURCE will not leave - * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source) - * MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source) - * - * fmode and src parameters may be used to determine which option is - * being set, as follows (the IP_* and MCAST_* versions of each option - * are functionally equivalent): - * opt fmode src - * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE INADDR_ANY - * MCAST_LEAVE_GROUP MODE_IS_INCLUDE INADDR_ANY - * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * - * Changing the filter mode is not allowed; if a matching ilg already - * exists and fmode != ilg->ilg_fmode, EINVAL is returned. - * - * The interface to be used may be identified by an address or by an - * index. A pointer to the index is passed; if it is NULL, use the - * address, otherwise, use the index. 
- */ -int -ip_opt_delete_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, - ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) -{ - ipif_t *ipif; - ipsq_t *ipsq; - int err; - ill_t *ill; - - err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp, - ip_restart_optmgmt, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_delete_group: no ipif for group " - "0x%x, ifaddr 0x%x\n", - (int)ntohl(group), (int)ntohl(ifaddr))); - } - return (err); - } - ASSERT(ipif != NULL); + if (ill != NULL) + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); - ill = ipif->ipif_ill; - /* Operation not supported on a virtual network interface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); + if (ilm != NULL) { + ASSERT(ill != NULL); + (void) ip_delmulti_serial(ilm, B_FALSE, leaving); } - - if (checkonly) { - /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. 
- */ - ipif_refrele(ipif); - return (0); +done: + if (ill != NULL) { + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); } - - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq, - NEW_OP); - err = ip_opt_delete_group_excl(connp, group, ipif, fmode, src); - IPSQ_EXIT(ipsq); - - ipif_refrele(ipif); return (err); } /* * Handle the following optmgmt: + * IP_DROP_MEMBERSHIP will leave * IPV6_LEAVE_GROUP will leave * MCAST_LEAVE_GROUP will leave + * IP_UNBLOCK_SOURCE will not leave * MCAST_UNBLOCK_SOURCE will not leave + * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source) * MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source) * * fmode and src parameters may be used to determine which option is - * being set, as follows (IPV6_LEAVE_GROUP and MCAST_LEAVE_GROUP options - * are functionally equivalent): + * being set, as follows: * opt fmode v6src + * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE unspecified * IPV6_LEAVE_GROUP MODE_IS_INCLUDE unspecified * MCAST_LEAVE_GROUP MODE_IS_INCLUDE unspecified + * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr * MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v6 addr + * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr * MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v6 addr * * Changing the filter mode is not allowed; if a matching ilg already * exists and fmode != ilg->ilg_fmode, EINVAL is returned. * + * The interface to be used may be identified by an IPv4 address or by an + * interface index. + * * Handles IPv4-mapped IPv6 multicast addresses by associating them - * with the link-local ipif. Assumes that if v6group is v4-mapped, + * with the IPv4 address. Assumes that if v6group is v4-mapped, * v6src is also v4-mapped. 
*/ int -ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly, - const in6_addr_t *v6group, int ifindex, mcast_record_t fmode, - const in6_addr_t *v6src, mblk_t *first_mp) +ip_opt_delete_group(conn_t *connp, boolean_t checkonly, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src) { - ill_t *ill; - ipif_t *ipif; - char buf[INET6_ADDRSTRLEN]; - ipaddr_t v4group, v4src; - boolean_t isv6; - ipsq_t *ipsq; - int err; - - err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6, - ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_delete_group_v6: no ill for group %s/" - "index %d\n", inet_ntop(AF_INET6, v6group, buf, - sizeof (buf)), ifindex)); - } - return (err); - } - ASSERT((isv6 && ill != NULL) || (!isv6 && ipif != NULL)); - - /* operation is not supported on the virtual network interface */ - if (isv6) { - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - } else { - if (IS_VNI(ipif->ipif_ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } + /* + * In the normal case below we don't check for the ill existing. + * Instead we look for an existing ilg in _excl. + * If checkonly we sanity check the arguments + */ if (checkonly) { + ill_t *ill; + int err; + + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, + &ill); /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. + * do not do operation, just pretend to - new T_CHECK semantics. + * ip_opt_check is considered a good enough "check" here. 
*/ - if (isv6) + if (ill != NULL) ill_refrele(ill); - else - ipif_refrele(ipif); - return (0); - } - - if (!isv6) { - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ip_opt_delete_group_excl(connp, v4group, ipif, fmode, - v4src); - IPSQ_EXIT(ipsq); - ipif_refrele(ipif); - } else { - IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ip_opt_delete_group_excl_v6(connp, v6group, ill, fmode, - v6src); - IPSQ_EXIT(ipsq); - ill_refrele(ill); + return (err); } - - return (err); + return (ip_opt_delete_group_excl(connp, v6group, ifaddr, ifindex, + fmode, v6src)); } /* @@ -3155,185 +2529,26 @@ ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly, /* * Add a group to an upper conn group data structure and pass things down * to the interface multicast list (and DLPI) + * Common for IPv4 and IPv6; for IPv4 we can have an ifaddr. */ static int -ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, - ipaddr_t src) -{ - int error = 0; - ill_t *ill; - ilg_t *ilg; - ilg_stat_t ilgstat; - slist_t *new_filter = NULL; - int new_fmode; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - ill = ipif->ipif_ill; - - if (!(ill->ill_flags & ILLF_MULTICAST)) - return (EADDRNOTAVAIL); - - /* - * conn_ilg[] is protected by conn_lock. Need to hold the conn_lock - * to walk the conn_ilg[] list in ilg_lookup_ipif(); also needed to - * serialize 2 threads doing join (sock, group1, hme0:0) and - * (sock, group2, hme1:0) where hme0 and hme1 map to different ipsqs, - * but both operations happen on the same conn. - */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - - /* - * Depending on the option we're handling, may or may not be okay - * if group has already been added. Figure out our rules based - * on fmode and src params. Also make sure there's enough room - * in the filter if we're adding a source to an existing filter. 
- */ - if (src == INADDR_ANY) { - /* we're joining for all sources, must not have joined */ - if (ilg != NULL) - error = EADDRINUSE; - } else { - if (fmode == MODE_IS_EXCLUDE) { - /* (excl {addr}) => block source, must have joined */ - if (ilg == NULL) - error = EADDRNOTAVAIL; - } - /* (incl {addr}) => join source, may have joined */ - - if (ilg != NULL && - SLIST_CNT(ilg->ilg_filter) == MAX_FILTER_SIZE) - error = ENOBUFS; - } - if (error != 0) { - mutex_exit(&connp->conn_lock); - return (error); - } - - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); - - /* - * Alloc buffer to copy new state into (see below) before - * we make any changes, so we can bail if it fails. - */ - if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); - return (ENOMEM); - } - - if (ilg == NULL) { - ilgstat = ILGSTAT_NEW; - if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); - } - if (src != INADDR_ANY) { - ilg->ilg_filter = l_alloc(); - if (ilg->ilg_filter == NULL) { - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (ENOMEM); - } - ilg->ilg_filter->sl_numsrc = 1; - IN6_IPADDR_TO_V4MAPPED(src, - &ilg->ilg_filter->sl_addr[0]); - } - if (group == INADDR_ANY) { - ilg->ilg_v6group = ipv6_all_zeros; - } else { - IN6_IPADDR_TO_V4MAPPED(group, &ilg->ilg_v6group); - } - ilg->ilg_ipif = ipif; - ilg->ilg_ill = NULL; - ilg->ilg_fmode = fmode; - } else { - int index; - in6_addr_t v6src; - ilgstat = ILGSTAT_CHANGE; - if (ilg->ilg_fmode != fmode || src == INADDR_ANY) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (EINVAL); - } - if (ilg->ilg_filter == NULL) { - ilg->ilg_filter = l_alloc(); - if (ilg->ilg_filter == NULL) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (ENOMEM); - } - } - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - if (list_has_addr(ilg->ilg_filter, &v6src)) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - 
return (EADDRNOTAVAIL); - } - index = ilg->ilg_filter->sl_numsrc++; - ilg->ilg_filter->sl_addr[index] = v6src; - } - - /* - * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. - */ - new_fmode = ilg->ilg_fmode; - l_copy(ilg->ilg_filter, new_filter); - - mutex_exit(&connp->conn_lock); - - error = ip_addmulti(group, ipif, ilgstat, new_fmode, new_filter); - if (error != 0) { - /* - * Need to undo what we did before calling ip_addmulti()! - * Must look up the ilg again since we've not been holding - * conn_lock. - */ - in6_addr_t v6src; - if (ilgstat == ILGSTAT_NEW) - v6src = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - ASSERT(ilg != NULL); - ilg_delete(connp, ilg, &v6src); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); - } - - l_free(new_filter); - return (0); -} - -static int -ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, - mcast_record_t fmode, const in6_addr_t *v6src) +ilg_add(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr, + uint_t ifindex, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { int error = 0; ilg_t *ilg; ilg_stat_t ilgstat; slist_t *new_filter = NULL; int new_fmode; - - ASSERT(IAM_WRITER_ILL(ill)); + ilm_t *ilm; if (!(ill->ill_flags & ILLF_MULTICAST)) return (EADDRNOTAVAIL); - /* - * conn_lock protects the ilg list. Serializes 2 threads doing - * join (sock, group1, hme0) and (sock, group2, hme1) where hme0 - * and hme1 map to different ipsq's, but both operations happen - * on the same conn. - */ - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ill_v6(connp, v6group, ill); + /* conn_ilg_lock protects the ilg list. 
*/ + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); /* * Depending on the option we're handling, may or may not be okay @@ -3358,7 +2573,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, error = ENOBUFS; } if (error != 0) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (error); } @@ -3367,21 +2582,23 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * we make any changes, so we can bail if it fails. */ if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (ENOMEM); } if (ilg == NULL) { if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (error); } + ilg->ilg_ifindex = ifindex; + ilg->ilg_ifaddr = ifaddr; if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { ilg->ilg_filter = l_alloc(); if (ilg->ilg_filter == NULL) { ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (ENOMEM); } @@ -3391,25 +2608,24 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilgstat = ILGSTAT_NEW; ilg->ilg_v6group = *v6group; ilg->ilg_fmode = fmode; - ilg->ilg_ipif = NULL; ilg->ilg_ill = ill; } else { int index; if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (EINVAL); } if (ilg->ilg_filter == NULL) { ilg->ilg_filter = l_alloc(); if (ilg->ilg_filter == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (ENOMEM); } } if (list_has_addr(ilg->ilg_filter, v6src)) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (EADDRNOTAVAIL); } @@ -3420,12 +2636,12 @@ ilg_add_v6(conn_t *connp, const in6_addr_t 
*v6group, ill_t *ill, /* * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. + * so we can release conn_ilg_lock now. */ new_fmode = ilg->ilg_fmode; l_copy(ilg->ilg_filter, new_filter); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); /* * Now update the ill. We wait to do this until after the ilg @@ -3433,72 +2649,105 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * info for the ill, which involves looking at the status of * all the ilgs associated with this group/interface pair. */ - error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat, - new_fmode, new_filter); - if (error != 0) { + ilm = ip_addmulti_serial(v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &error); + + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. + */ + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) { + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + } + error = ENXIO; + goto free_and_exit; + } + + if (ilm != NULL) { + /* Succeeded. Update the ilg to point at the ilm */ + if (ilgstat == ILGSTAT_NEW) { + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ + } else { + /* + * ip_addmulti didn't get a held ilm for + * ILGSTAT_CHANGE; ilm_refcnt was unchanged. + */ + ASSERT(ilg->ilg_ilm == ilm); + } + } else { + ASSERT(error != 0); /* - * But because we waited, we have to undo the ilg update - * if ip_addmulti_v6() fails. We also must lookup ilg - * again, since we've not been holding conn_lock. + * Failed to allocate the ilm. 
+ * Need to undo what we did before calling ip_addmulti() + * If ENETDOWN just clear ill_ilg since so that we + * will rejoin when the ill comes back; don't report ENETDOWN + * to application. */ - in6_addr_t delsrc = - (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, v6group, ill); - ASSERT(ilg != NULL); - ilg_delete(connp, ilg, &delsrc); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); + if (ilgstat == ILGSTAT_NEW && error == ENETDOWN) { + ilg->ilg_ill = NULL; + error = 0; + } else { + in6_addr_t delsrc = + (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; + + ilg_delete(connp, ilg, &delsrc); + } } + rw_exit(&connp->conn_ilg_lock); +free_and_exit: l_free(new_filter); - - return (0); + return (error); } /* - * Find an IPv4 ilg matching group, ill and source + * Find an IPv4 ilg matching group, ill and source. + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilg_t * -ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) +boolean_t +conn_hasmembers_ill_withsrc_v4(conn_t *connp, ipaddr_t group, ipaddr_t src, + ill_t *ill) { in6_addr_t v6group, v6src; int i; boolean_t isinlist; ilg_t *ilg; - ipif_t *ipif; - ill_t *ilg_ill; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. 
- */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + rw_enter(&connp->conn_ilg_lock, RW_READER); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ipif = ilg->ilg_ipif) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) + /* ilg_ill could be NULL if an add is in progress */ + if (ilg->ilg_ill != ill) continue; - ASSERT(ilg->ilg_ill == NULL); - ilg_ill = ipif->ipif_ill; - ASSERT(!ilg_ill->ill_isv6); - if (IS_ON_SAME_LAN(ilg_ill, ill) && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { + + /* The callers use upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ - return (ilg); + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } break; } } - if (i == connp->conn_ilg_inuse) - return (NULL); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); + } /* * we have an ilg with matching ill and group; but @@ -3514,44 +2763,49 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) } if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) || - (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) - return (ilg); - - return (NULL); + (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) { + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); + } + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); } /* * Find an IPv6 ilg matching group, ill, and source */ -ilg_t * -ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, +boolean_t +conn_hasmembers_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, const in6_addr_t *v6src, ill_t *ill) { int i; boolean_t isinlist; ilg_t *ilg; - ill_t *ilg_ill; - ASSERT(MUTEX_HELD(&connp->conn_lock)); + 
rw_enter(&connp->conn_ilg_lock, RW_READER); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ilg_ill = ilg->ilg_ill) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) + /* ilg_ill could be NULL if an add is in progress */ + if (ilg->ilg_ill != ill) continue; - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(ilg_ill->ill_isv6); - if (IS_ON_SAME_LAN(ilg_ill, ill) && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { + + /* The callers use upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ - return (ilg); + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } break; } } - if (i == connp->conn_ilg_inuse) - return (NULL); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); + } /* * we have an ilg with matching ill and group; but @@ -3566,61 +2820,34 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, } if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) || - (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) - return (ilg); - - return (NULL); -} - -/* - * Find an IPv6 ilg matching group and ill - */ -ilg_t * -ilg_lookup_ill_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill) -{ - ilg_t *ilg; - int i; - ill_t *mem_ill; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((mem_ill = ilg->ilg_ill) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) - continue; - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(mem_ill->ill_isv6); - if (mem_ill == ill && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) - return (ilg); + (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) { + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } - return (NULL); + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); } 
/* - * Find an IPv4 ilg matching group and ipif + * Find an ilg matching group and ifaddr/ifindex. + * We check both ifaddr and ifindex even though at most one of them + * will be non-zero; that way we always find the right one. */ static ilg_t * -ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif) +ilg_lookup(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr, + uint_t ifindex) { - in6_addr_t v6group; - int i; ilg_t *ilg; - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(!ipif->ipif_ill->ill_isv6); + ASSERT(RW_LOCK_HELD(&connp->conn_ilg_lock)); - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_flags & ILG_DELETED) == 0 && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group) && - ilg->ilg_ipif == ipif) + if (ilg->ilg_ifaddr == ifaddr && + ilg->ilg_ifindex == ifindex && + IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) return (ilg); } return (NULL); @@ -3634,363 +2861,479 @@ ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif) static void ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src) { - int i; - - ASSERT((ilg->ilg_ipif != NULL) ^ (ilg->ilg_ill != NULL)); - ASSERT(ilg->ilg_ipif == NULL || IAM_WRITER_IPIF(ilg->ilg_ipif)); - ASSERT(ilg->ilg_ill == NULL || IAM_WRITER_ILL(ilg->ilg_ill)); - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(!(ilg->ilg_flags & ILG_DELETED)); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); + ASSERT(ilg->ilg_ptpn != NULL); + ASSERT(!ilg->ilg_condemned); if (src == NULL || IN6_IS_ADDR_UNSPECIFIED(src)) { - if (connp->conn_ilg_walker_cnt != 0) { - ilg->ilg_flags |= ILG_DELETED; - return; - } - FREE_SLIST(ilg->ilg_filter); + ilg->ilg_filter = NULL; - i = ilg - &connp->conn_ilg[0]; - ASSERT(i >= 0 && i < connp->conn_ilg_inuse); - - /* Move other entries up one step 
*/ - connp->conn_ilg_inuse--; - for (; i < connp->conn_ilg_inuse; i++) - connp->conn_ilg[i] = connp->conn_ilg[i+1]; + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ill = NULL; + ilg->ilg_condemned = B_TRUE; - if (connp->conn_ilg_inuse == 0) { - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = NULL; - cv_broadcast(&connp->conn_refcv); - } + /* ilg_inactive will unlink from the list */ + ilg_refrele(ilg); } else { l_remove(ilg->ilg_filter, src); } } /* - * Called from conn close. No new ilg can be added or removed. + * Called from conn close. No new ilg can be added or removed * because CONN_CLOSING has been set by ip_close. ilg_add / ilg_delete * will return error if conn has started closing. + * + * We handle locking as follows. + * Under conn_ilg_lock we get the first ilg. As we drop the conn_ilg_lock to + * proceed with the ilm part of the delete we hold a reference on both the ill + * and the ilg. This doesn't prevent changes to the ilg, but prevents it from + * being deleted. + * + * Since the ilg_add code path uses two locks (conn_ilg_lock for the ilg part, + * and ill_mcast_lock for the ip_addmulti part) we can run at a point between + * the two. At that point ilg_ill is set, but ilg_ilm hasn't yet been set. In + * that case we delete the ilg here, which makes ilg_add discover that the ilg + * has disappeared when ip_addmulti returns, so it will discard the ilm it just + * added. */ void ilg_delete_all(conn_t *connp) { - int i; - ipif_t *ipif = NULL; - ill_t *ill = NULL; - ilg_t *ilg; - in6_addr_t v6group; - boolean_t success; - ipsq_t *ipsq; + ilg_t *ilg, *next_ilg, *held_ilg; + ilm_t *ilm; + ill_t *ill; + boolean_t need_refrele; + /* + * Can not run if there is a conn_update_ill already running. + * Wait for it to complete. Caller should have already set CONN_CLOSING + * which prevents any new threads to run in conn_update_ill. 
+ */ mutex_enter(&connp->conn_lock); -retry: - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - /* - * Since this walk is not atomic (we drop the - * conn_lock and wait in ipsq_enter) we need - * to check for the ILG_DELETED flag. - */ - if (ilg->ilg_flags & ILG_DELETED) - continue; - - if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) { - ipif = ilg->ilg_ipif; - ill = ipif->ipif_ill; - } else { - ipif = NULL; - ill = ilg->ilg_ill; - } + ASSERT(connp->conn_state_flags & CONN_CLOSING); + while (connp->conn_state_flags & CONN_UPDATE_ILL) + cv_wait(&connp->conn_cv, &connp->conn_lock); + mutex_exit(&connp->conn_lock); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - * If we are unable to do even that, then the ill is closing, - * in which case the unplumb thread will handle the cleanup, - * and we move on to the next ilg. - */ - if (!ill_waiter_inc(ill)) + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = connp->conn_ilg; + held_ilg = NULL; + while (ilg != NULL) { + if (ilg->ilg_condemned) { + ilg = ilg->ilg_next; continue; - - mutex_exit(&connp->conn_lock); - /* - * To prevent deadlock between ill close which waits inside - * the perimeter, and conn close, ipsq_enter returns error, - * the moment ILL_CONDEMNED is set, in which case ill close - * takes responsibility to cleanup the ilgs. Note that we - * have not yet set condemned flag, otherwise the conn can't - * be refheld for cleanup by those routines and it would be - * a mutual deadlock. 
- */ - success = ipsq_enter(ill, B_FALSE, NEW_OP); - ipsq = ill->ill_phyint->phyint_ipsq; - ill_waiter_dcr(ill); - mutex_enter(&connp->conn_lock); - if (!success) + } + /* If the ilg is detached then no need to serialize */ + if (ilg->ilg_ilm == NULL) { + next_ilg = ilg->ilg_next; + ilg_delete(connp, ilg, NULL); + ilg = next_ilg; continue; + } + ill = ilg->ilg_ilm->ilm_ill; /* - * Move on if the ilg was deleted while conn_lock was dropped. + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock and then + * check that we still have an ilm. */ - if (ilg->ilg_flags & ILG_DELETED) { - mutex_exit(&connp->conn_lock); - ipsq_exit(ipsq); - mutex_enter(&connp->conn_lock); - continue; + need_refrele = B_FALSE; + if (!mutex_tryenter(&ill->ill_mcast_serializer)) { + ill_refhold(ill); + need_refrele = B_TRUE; + ilg_refhold(ilg); + if (held_ilg != NULL) + ilg_refrele(held_ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + if (ilg->ilg_condemned) { + ilg = ilg->ilg_next; + goto next; + } } - v6group = ilg->ilg_v6group; + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + next_ilg = ilg->ilg_next; ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + ilg = next_ilg; + rw_exit(&connp->conn_ilg_lock); - if (ipif != NULL) { - (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif, - B_FALSE, B_TRUE); - } else { - (void) ip_delmulti_v6(&v6group, ill, - connp->conn_zoneid, B_FALSE, B_TRUE); - } - ipsq_exit(ipsq); - mutex_enter(&connp->conn_lock); - } - ILG_WALKER_RELE(connp); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); - /* If any ill was skipped above wait and retry */ - if (connp->conn_ilg_inuse != 0) { - cv_wait(&connp->conn_refcv, &connp->conn_lock); - goto retry; + next: + mutex_exit(&ill->ill_mcast_serializer); + if (need_refrele) { + /* Drop ill reference while we hold no locks */ + ill_refrele(ill); + } + 
rw_enter(&connp->conn_ilg_lock, RW_WRITER); } - mutex_exit(&connp->conn_lock); + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); } /* - * Called from ill close by ipcl_walk for clearing conn_ilg and - * conn_multicast_ipif for a given ipif. conn is held by caller. - * Note that ipcl_walk only walks conns that are not yet condemned. - * condemned conns can't be refheld. For this reason, conn must become clean - * first, i.e. it must not refer to any ill/ire/ipif and then only set - * condemned flag. + * Attach the ilg to an ilm on the ill. If it fails we leave ilg_ill as NULL so + * that a subsequent attempt can attach it. + * Drops and reacquires conn_ilg_lock. */ static void -conn_delete_ipif(conn_t *connp, caddr_t arg) +ilg_attach(conn_t *connp, ilg_t *ilg, ill_t *ill) { - ipif_t *ipif = (ipif_t *)arg; - int i; - char group_buf1[INET6_ADDRSTRLEN]; - char group_buf2[INET6_ADDRSTRLEN]; - ipaddr_t group; - ilg_t *ilg; + ilg_stat_t ilgstat; + slist_t *new_filter; + int new_fmode; + in6_addr_t v6group; + ipaddr_t ifaddr; + uint_t ifindex; + ilm_t *ilm; + int error = 0; + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); /* - * Even though conn_ilg_inuse can change while we are in this loop, - * i.e.ilgs can be created or deleted on this connp, no new ilgs can - * be created or deleted for this connp, on this ill, since this ill - * is the perimeter. So we won't miss any ilg in this cleanup. + * Alloc buffer to copy new state into (see below) before + * we make any changes, so we can bail if it fails. */ - mutex_enter(&connp->conn_lock); + if ((new_filter = l_alloc()) == NULL) + return; /* - * Increment the walker count, so that ilg repacking does not - * occur while we are in the loop. + * Save copy of ilg's filter state to pass to other functions, so + * we can release conn_ilg_lock now. + * Set ilg_ill so that an unplumb can find us. 
*/ - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_ipif != ipif || (ilg->ilg_flags & ILG_DELETED)) - continue; - /* - * ip_close cannot be cleaning this ilg at the same time. - * since it also has to execute in this ill's perimeter which - * we are now holding. Only a clean conn can be condemned. - */ - ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED)); - - /* Blow away the membership */ - ip1dbg(("conn_delete_ilg_ipif: %s on %s (%s)\n", - inet_ntop(AF_INET6, &connp->conn_ilg[i].ilg_v6group, - group_buf1, sizeof (group_buf1)), - inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, - group_buf2, sizeof (group_buf2)), - ipif->ipif_ill->ill_name)); - - /* ilg_ipif is NULL for V6, so we won't be here */ - ASSERT(IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)); + new_fmode = ilg->ilg_fmode; + l_copy(ilg->ilg_filter, new_filter); + v6group = ilg->ilg_v6group; + ifaddr = ilg->ilg_ifaddr; + ifindex = ilg->ilg_ifindex; + ilgstat = ILGSTAT_NEW; - group = V4_PART_OF_V6(ilg->ilg_v6group); - ilg_delete(connp, &connp->conn_ilg[i], NULL); - mutex_exit(&connp->conn_lock); + ilg->ilg_ill = ill; + ASSERT(ilg->ilg_ilm == NULL); + rw_exit(&connp->conn_ilg_lock); - (void) ip_delmulti(group, ipif, B_FALSE, B_TRUE); - mutex_enter(&connp->conn_lock); - } + ilm = ip_addmulti_serial(&v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &error); + l_free(new_filter); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); /* - * If we are the last walker, need to physically delete the - * ilgs and repack. + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. 
*/ - ILG_WALKER_RELE(connp); - - if (connp->conn_multicast_ipif == ipif) { - /* Revert to late binding */ - connp->conn_multicast_ipif = NULL; + ilg = ilg_lookup(connp, &v6group, ifaddr, ifindex); + if (ilg == NULL) { + if (ilm != NULL) { + rw_exit(&connp->conn_ilg_lock); + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + return; } - mutex_exit(&connp->conn_lock); - - conn_delete_ire(connp, (caddr_t)ipif); + if (ilm == NULL) { + ilg->ilg_ill = NULL; + return; + } + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ } /* - * Called from ill close by ipcl_walk for clearing conn_ilg and - * conn_multicast_ill for a given ill. conn is held by caller. + * Called when an ill is unplumbed to make sure that there are no + * dangling conn references to that ill. In that case ill is non-NULL and + * we make sure we remove all references to it. + * Also called when we should revisit the ilg_ill used for multicast + * memberships, in which case ill is NULL. + * + * conn is held by caller. + * * Note that ipcl_walk only walks conns that are not yet condemned. * condemned conns can't be refheld. For this reason, conn must become clean - * first, i.e. it must not refer to any ill/ire/ipif and then only set + * first, i.e. it must not refer to any ill/ire and then only set * condemned flag. + * + * We leave ixa_multicast_ifindex in place. We prefer dropping + * packets instead of sending them out the wrong interface. + * + * We keep the ilg around in a detached state (with ilg_ill and ilg_ilm being + * NULL) so that the application can leave it later. Also, if ilg_ifaddr and + * ilg_ifindex are zero, indicating that the system should pick the interface, + * then we attempt to reselect the ill and join on it. + * + * Locking notes: + * Under conn_ilg_lock we get the first ilg. 
As we drop the conn_ilg_lock to + * proceed with the ilm part of the delete we hold a reference on both the ill + * and the ilg. This doesn't prevent changes to the ilg, but prevents it from + * being deleted. + * + * Note: if this function is called when new ill/ipif's arrive or change status + * (SIOCSLIFINDEX, SIOCSLIFADDR) then we will attempt to attach any ilgs with + * a NULL ilg_ill to an ill/ilm. */ static void -conn_delete_ill(conn_t *connp, caddr_t arg) +conn_update_ill(conn_t *connp, caddr_t arg) { ill_t *ill = (ill_t *)arg; - int i; - char group_buf[INET6_ADDRSTRLEN]; - in6_addr_t v6group; - ilg_t *ilg; /* - * Even though conn_ilg_inuse can change while we are in this loop, - * no new ilgs can be created/deleted for this connp, on this - * ill, since this ill is the perimeter. So we won't miss any ilg - * in this cleanup. + * We have to prevent ip_close/ilg_delete_all from running at + * the same time. ip_close sets CONN_CLOSING before doing the ilg_delete + * all, and we set CONN_UPDATE_ILL. That ensures that only one of + * ilg_delete_all and conn_update_ill run at a time for a given conn. + * If ilg_delete_all got here first, then we have nothing to do. */ mutex_enter(&connp->conn_lock); + if (connp->conn_state_flags & (CONN_CLOSING|CONN_UPDATE_ILL)) { + /* Caller has to wait for ill_ilm_cnt to drop to zero */ + mutex_exit(&connp->conn_lock); + return; + } + connp->conn_state_flags |= CONN_UPDATE_ILL; + mutex_exit(&connp->conn_lock); - /* - * Increment the walker count, so that ilg repacking does not - * occur while we are in the loop. - */ - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == ill) && !(ilg->ilg_flags & ILG_DELETED)) { - /* - * ip_close cannot be cleaning this ilg at the same - * time, since it also has to execute in this ill's - * perimeter which we are now holding. Only a clean - * conn can be condemned. 
- */ - ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED)); - - /* Blow away the membership */ - ip1dbg(("conn_delete_ilg_ill: %s on %s\n", - inet_ntop(AF_INET6, &ilg->ilg_v6group, - group_buf, sizeof (group_buf)), - ill->ill_name)); + if (ill != NULL) + ilg_check_detach(connp, ill); - v6group = ilg->ilg_v6group; - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + ilg_check_reattach(connp); - (void) ip_delmulti_v6(&v6group, ill, - connp->conn_zoneid, B_FALSE, B_TRUE); - mutex_enter(&connp->conn_lock); - } - } - /* - * If we are the last walker, need to physically delete the - * ilgs and repack. - */ - ILG_WALKER_RELE(connp); - - if (connp->conn_multicast_ill == ill) { - /* Revert to late binding */ - connp->conn_multicast_ill = NULL; - } + /* Do we need to wake up a thread in ilg_delete_all? */ + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_UPDATE_ILL; + if (connp->conn_state_flags & CONN_CLOSING) + cv_broadcast(&connp->conn_cv); mutex_exit(&connp->conn_lock); } -/* - * Called when an ipif is unplumbed to make sure that there are no - * dangling conn references to that ipif. - * Handles ilg_ipif and conn_multicast_ipif - */ -void -reset_conn_ipif(ipif) - ipif_t *ipif; +/* Detach from an ill that is going away */ +static void +ilg_check_detach(conn_t *connp, ill_t *ill) { - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + char group_buf[INET6_ADDRSTRLEN]; + ilg_t *ilg, *held_ilg; + ilm_t *ilm; - ipcl_walk(conn_delete_ipif, (caddr_t)ipif, ipst); -} + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + held_ilg = NULL; + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; -/* - * Called when an ill is unplumbed to make sure that there are no - * dangling conn references to that ill. - * Handles ilg_ill, conn_multicast_ill. 
- */ -void -reset_conn_ill(ill_t *ill) -{ - ip_stack_t *ipst = ill->ill_ipst; + if (ilg->ilg_ill != ill) + continue; + + /* Detach from current ill */ + ip1dbg(("ilg_check_detach: detach %s on %s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ilg->ilg_ill->ill_name)); + + /* Detach this ilg from the ill/ilm */ + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + ilg->ilg_ill = NULL; + if (ilm == NULL) + continue; - ipcl_walk(conn_delete_ill, (caddr_t)ill, ipst); + /* Prevent ilg from disappearing */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); } -#ifdef DEBUG /* - * Walk functions walk all the interfaces in the system to make - * sure that there is no refernece to the ipif or ill that is - * going away. + * Check if there is a place to attach the conn_ilgs. We do this for both + * detached ilgs and attached ones, since for the latter there could be + * a better ill to attach them to. 
*/ -int -ilm_walk_ill(ill_t *ill) +static void +ilg_check_reattach(conn_t *connp) { - int cnt = 0; - ill_t *till; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill->ill_ipst; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - till = ILL_START_WALK_ALL(&ctx, ipst); - for (; till != NULL; till = ill_next(&ctx, till)) { - mutex_enter(&till->ill_lock); - for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_ill == ill) { - cnt++; + ill_t *ill; + char group_buf[INET6_ADDRSTRLEN]; + ilg_t *ilg, *held_ilg; + ilm_t *ilm; + zoneid_t zoneid = IPCL_ZONEID(connp); + int error; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + held_ilg = NULL; + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; + + /* Check if the conn_ill matches what we would pick now */ + ill = ill_mcast_lookup(&ilg->ilg_v6group, ilg->ilg_ifaddr, + ilg->ilg_ifindex, zoneid, ipst, &error); + + /* + * Make sure the ill is usable for multicast and that + * we can send the DL_ADDMULTI_REQ before we create an + * ilm. + */ + if (ill != NULL && + (!(ill->ill_flags & ILLF_MULTICAST) || !ill->ill_dl_up)) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + ill = NULL; + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + + /* Is the ill unchanged, even if both are NULL? 
*/ + if (ill == ilg->ilg_ill) { + if (ill != NULL) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); } + continue; } - mutex_exit(&till->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - return (cnt); + /* Something changed; detach from old first if needed */ + if (ilg->ilg_ill != NULL) { + ill_t *ill2 = ilg->ilg_ill; + boolean_t need_refrele = B_FALSE; + + /* + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock. + */ + if (!mutex_tryenter(&ill2->ill_mcast_serializer)) { + ill_refhold(ill2); + need_refrele = B_TRUE; + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill2->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + /* + * Check that nobody else re-attached the ilg while we + * dropped the lock. + */ + if (ilg->ilg_ill == ill2) { + ASSERT(!ilg->ilg_condemned); + /* Detach from current ill */ + ip1dbg(("conn_check_reattach: detach %s/%s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ill2->ill_name)); + + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + ilg->ilg_ill = NULL; + } else { + ilm = NULL; + } + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + mutex_exit(&ill2->ill_mcast_serializer); + if (need_refrele) { + /* Drop ill reference while we hold no locks */ + ill_refrele(ill2); + } + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* + * While we dropped conn_ilg_lock some other thread + * could have attached this ilg, thus we check again. 
+ */ + if (ilg->ilg_ill != NULL) { + if (ill != NULL) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, + RW_WRITER); + } + continue; + } + } + if (ill != NULL) { + /* + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock. + */ + if (!mutex_tryenter(&ill->ill_mcast_serializer)) { + /* Already have a refhold on ill */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + + /* + * Check that nobody else attached the ilg and that + * it wasn't condemned while we dropped the lock. + */ + if (ilg->ilg_ill == NULL && !ilg->ilg_condemned) { + /* + * Attach to the new ill. Can fail in which + * case ilg_ill will remain NULL. ilg_attach + * drops and reacquires conn_ilg_lock. + */ + ip1dbg(("conn_check_reattach: attach %s/%s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ill->ill_name)); + ilg_attach(connp, ilg, ill); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); + } + mutex_exit(&ill->ill_mcast_serializer); + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + } + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); } /* - * This function is called before the ipif is freed. + * Called when an ill is unplumbed to make sure that there are no + * dangling conn references to that ill. In that case ill is non-NULL and + * we make sure we remove all references to it. + * Also called when we should revisit the ilg_ill used for multicast + * memberships, in which case ill is NULL. 
*/ -int -ilm_walk_ipif(ipif_t *ipif) +void +update_conn_ill(ill_t *ill, ip_stack_t *ipst) { - int cnt = 0; - ill_t *till; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - till = ILL_START_WALK_ALL(&ctx, ipst); - for (; till != NULL; till = ill_next(&ctx, till)) { - mutex_enter(&till->ill_lock); - for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - cnt++; - } - } - mutex_exit(&till->ill_lock); - } - return (cnt); + ipcl_walk(conn_update_ill, (caddr_t)ill, ipst); } -#endif diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index 35f9d541e8..97096bea99 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -40,6 +40,7 @@ #include <sys/zone.h> #include <sys/ethernet.h> #include <sys/sdt.h> +#include <sys/mac.h> #include <net/if.h> #include <net/if_types.h> @@ -61,53 +62,93 @@ #include <inet/ip_rts.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> -#include <inet/ipsec_impl.h> -#include <inet/ipsec_info.h> #include <inet/sctp_ip.h> +#include <inet/ip_arp.h> #include <inet/ip2mac_impl.h> +#define ANNOUNCE_INTERVAL(isv6) \ + (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ + ipst->ips_ip_arp_publish_interval) + +#define DEFENSE_INTERVAL(isv6) \ + (isv6 ? ipst->ips_ndp_defend_interval : \ + ipst->ips_arp_defend_interval) + +/* Non-tunable probe interval, based on link capabilities */ +#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) + +/* + * The IPv4 Link Local address space is special; we do extra duplicate checking + * there, as the entire assignment mechanism rests on random numbers. + */ +#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ + ((uchar_t *)ptr)[1] == 254) + +/* + * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed + * in to the ncec*add* functions. 
+ * + * NCE_F_AUTHORITY means that we ignore any incoming adverts for that + * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means + * that we will respond to requests for the protocol address. + */ +#define NCE_EXTERNAL_FLAGS_MASK \ + (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ + NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ + NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) + /* * Function names with nce_ prefix are static while function * names with ndp_ prefix are used by rest of the IP. * * Lock ordering: * - * ndp_g_lock -> ill_lock -> nce_lock + * ndp_g_lock -> ill_lock -> ncec_lock * * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and - * nce_next. Nce_lock protects the contents of the NCE (particularly - * nce_refcnt). - */ - -static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, - uint32_t ll_addr_len); -static void nce_ire_delete(nce_t *nce); -static void nce_ire_delete1(ire_t *ire, char *nce_arg); -static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); -static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *, - nce_t *); -static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); -static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, - uchar_t *addr); -static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); -static void nce_queue_mp(nce_t *nce, mblk_t *mp); -static mblk_t *nce_udreq_alloc(ill_t *ill); -static void nce_update(nce_t *nce, uint16_t new_state, - uchar_t *new_ll_addr); -static uint32_t nce_solicit(nce_t *nce, in6_addr_t src); -static boolean_t nce_xmit(ill_t *ill, uint8_t type, - boolean_t use_lla_addr, const in6_addr_t *sender, + * ncec_next. ncec_lock protects the contents of the NCE (particularly + * ncec_refcnt). 
+ */ + +static void nce_cleanup_list(ncec_t *ncec); +static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); +static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, + ncec_t *); +static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); +static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, + uint16_t ncec_flags, nce_t **newnce); +static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, + uint16_t ncec_flags, nce_t **newnce); +static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, + uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, const in6_addr_t *target, int flag); -static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, - const in6_addr_t *target, uint_t flags); -static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, - const in6_addr_t *src, uint_t flags); -static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, - nce_t **, nce_t *); -static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill); +static void ncec_refhold_locked(ncec_t *); +static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); +static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); +static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, + uint16_t, uint16_t, nce_t **); +static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); +static nce_t *nce_add(ill_t *, ncec_t *); +static void nce_inactive(nce_t *); +extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); +static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); +static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, + uint16_t, uint16_t, nce_t **); +static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, + uint16_t, uint16_t, nce_t **); +static int nce_add_v6_postprocess(nce_t *); +static int nce_add_v4_postprocess(nce_t *); +static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); +static clock_t nce_fuzz_interval(clock_t, boolean_t); +static void nce_resolv_ipmp_ok(ncec_t *); 
+static void nce_walk_common(ill_t *, pfi_t, void *); +static void nce_start_timer(ncec_t *, uint_t); +static nce_t *nce_fastpath_create(ill_t *, ncec_t *); +static void nce_fastpath_trigger(nce_t *); +static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); #ifdef DEBUG -static void nce_trace_cleanup(const nce_t *); +static void ncec_trace_cleanup(const ncec_t *); #endif #define NCE_HASH_PTR_V4(ipst, addr) \ @@ -117,233 +158,245 @@ static void nce_trace_cleanup(const nce_t *); (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) -/* Non-tunable probe interval, based on link capabilities */ -#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) +extern kmem_cache_t *ncec_cache; +extern kmem_cache_t *nce_cache; + +/* + * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe + * If src_ill is not null, the ncec_addr is bound to src_ill. The + * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where + * the probe is sent on the ncec_ill (in the non-IPMP case) or the + * IPMP cast_ill (in the IPMP case). + * + * Note that the probe interval is based on ncec->ncec_ill which + * may be the ipmp_ill. + */ +static void +nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) +{ + boolean_t dropped; + uint32_t probe_interval; + + ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); + ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); + if (ncec->ncec_ipversion == IPV6_VERSION) { + dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); + probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill); + } else { + /* IPv4 DAD delay the initial probe. 
*/ + if (send_probe) + dropped = arp_probe(ncec); + else + dropped = B_TRUE; + probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, + !send_probe); + } + if (!dropped) { + mutex_enter(&ncec->ncec_lock); + ncec->ncec_pcnt--; + mutex_exit(&ncec->ncec_lock); + } + nce_restart_timer(ncec, probe_interval); +} + +/* + * Compute default flags to use for an advertisement of this ncec's address. + */ +static int +nce_advert_flags(const ncec_t *ncec) +{ + int flag = 0; + + if (ncec->ncec_flags & NCE_F_ISROUTER) + flag |= NDP_ISROUTER; + if (!(ncec->ncec_flags & NCE_F_ANYCAST)) + flag |= NDP_ORIDE; + + return (flag); +} /* * NDP Cache Entry creation routine. * Mapped entries will never do NUD . * This routine must always be called with ndp6->ndp_g_lock held. - * Prior to return, nce_refcnt is incremented. */ int -ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *extract_mask, - uint32_t hw_extract_start, uint16_t flags, uint16_t state, - nce_t **newnce) +nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - static nce_t nce_nil; - nce_t *nce; - mblk_t *mp; - mblk_t *template; - nce_t **ncep; int err; - boolean_t dropped = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); ASSERT(ill != NULL && ill->ill_isv6); - if (IN6_IS_ADDR_UNSPECIFIED(addr)) { - ip0dbg(("ndp_add_v6: no addr\n")); - return (EINVAL); - } - if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { - ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags)); - return (EINVAL); - } - if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) && - (flags & NCE_F_MAPPING)) { - ip0dbg(("ndp_add_v6: extract mask zero for mapping")); - return (EINVAL); - } - /* - * Allocate the mblk to hold the nce. - * - * XXX This can come out of a separate cache - nce_cache. 
- * We don't need the mp anymore as there are no more - * "qwriter"s - */ - mp = allocb(sizeof (nce_t), BPRI_MED); - if (mp == NULL) - return (ENOMEM); - nce = (nce_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&nce[1]; - *nce = nce_nil; - - /* - * This one holds link layer address - */ - if (ill->ill_net_type == IRE_IF_RESOLVER) { - template = nce_udreq_alloc(ill); - } else { - if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && - ill->ill_mactype != DL_IPV6) { - /* - * We create a nce_res_mp with the IP nexthop address - * as the destination address if the physical length - * is exactly 16 bytes for point-to-multipoint links - * that do their own resolution from IP to link-layer - * address. - */ - template = ill_dlur_gen((uchar_t *)addr, - ill->ill_phys_addr_length, ill->ill_sap, - ill->ill_sap_length); - } else { - if (ill->ill_resolver_mp == NULL) { - freeb(mp); - return (EINVAL); - } - ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER)); - template = copyb(ill->ill_resolver_mp); - } - } - if (template == NULL) { - freeb(mp); - return (ENOMEM); - } - nce->nce_ill = ill; - nce->nce_ipversion = IPV6_VERSION; - nce->nce_flags = flags; - nce->nce_state = state; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - nce->nce_rcnt = ill->ill_xmit_count; - nce->nce_addr = *addr; - nce->nce_mask = *mask; - nce->nce_extract_mask = *extract_mask; - nce->nce_ll_extract_start = hw_extract_start; - nce->nce_fp_mp = NULL; - nce->nce_res_mp = template; - if (state == ND_REACHABLE) - nce->nce_last = TICK_TO_MSEC(lbolt64); - else - nce->nce_last = 0; - nce->nce_qd_mp = NULL; - nce->nce_mp = mp; - if (hw_addr != NULL) - nce_set_ll(nce, hw_addr); - /* This one is for nce getting created */ - nce->nce_refcnt = 1; - mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); - if (nce->nce_flags & NCE_F_MAPPING) { - ASSERT(IN6_IS_ADDR_MULTICAST(addr)); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask)); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); - ncep = &ipst->ips_ndp6->nce_mask_entries; 
- } else { - ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - } + err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, + &nce); + if (err != 0) + return (err); + ASSERT(newnce != NULL); + *newnce = nce; + return (err); +} - nce->nce_trace_disable = B_FALSE; +/* + * Post-processing routine to be executed after nce_add_v6(). This function + * triggers fastpath (if appropriate) and DAD on the newly added nce entry + * and must be called without any locks held. + */ +int +nce_add_v6_postprocess(nce_t *nce) +{ + ncec_t *ncec = nce->nce_common; + boolean_t dropped = B_FALSE; + uchar_t *hw_addr = ncec->ncec_lladdr; + uint_t hw_addr_len = ncec->ncec_lladdr_length; + ill_t *ill = ncec->ncec_ill; + int err = 0; + uint16_t flags = ncec->ncec_flags; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t trigger_fastpath = B_TRUE; - list_create(&nce->nce_cb, sizeof (nce_cb_t), - offsetof(nce_cb_t, nce_cb_node)); /* - * Atomically ensure that the ill is not CONDEMNED, before - * adding the NCE. + * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then + * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
+ * We call nce_fastpath from nce_update if the link layer address of + * the peer changes from nce_update */ - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - freeb(mp); - freeb(template); - return (EINVAL); - } - if ((nce->nce_next = *ncep) != NULL) - nce->nce_next->nce_ptpn = &nce->nce_next; - *ncep = nce; - nce->nce_ptpn = ncep; - *newnce = nce; - /* This one is for nce being used by an active thread */ - NCE_REFHOLD(*newnce); - - /* Bump up the number of nce's referencing this ill */ - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt++; - mutex_exit(&ill->ill_lock); - - err = 0; - if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) { - mutex_enter(&nce->nce_lock); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || + (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) + trigger_fastpath = B_FALSE; + + if (trigger_fastpath) + nce_fastpath_trigger(nce); + if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { + ill_t *hwaddr_ill; + /* + * Unicast entry that needs DAD. + */ + if (IS_IPMP(ill)) { + hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, + hw_addr, hw_addr_len); + } else { + hwaddr_ill = ill; } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce_dad(ncec, hwaddr_ill, B_TRUE); err = EINPROGRESS; } else if (flags & NCE_F_UNSOL_ADV) { /* * We account for the transmit below by assigning one * less than the ndd variable. Subsequent decrements - * are done in ndp_timer. + * are done in nce_timer. 
*/ - mutex_enter(&nce->nce_lock); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast, - 0); - mutex_enter(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + ncec->ncec_unsolicit_count = + ipst->ips_ip_ndp_unsolicit_count - 1; + mutex_exit(&ncec->ncec_lock); + dropped = ndp_xmit(ill, + ND_NEIGHBOR_ADVERT, + hw_addr, + hw_addr_len, + &ncec->ncec_addr, /* Source and target of the adv */ + &ipv6_all_hosts_mcast, /* Destination of the packet */ + nce_advert_flags(ncec)); + mutex_enter(&ncec->ncec_lock); if (dropped) - nce->nce_unsolicit_count++; - if (nce->nce_unsolicit_count != 0) { - ASSERT(nce->nce_timeout_id == 0); - nce->nce_timeout_id = timeout(ndp_timer, nce, - MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = ddi_get_lbolt(); + if (ncec->ncec_unsolicit_count != 0) { + nce_start_timer(ncec, + ipst->ips_ip_ndp_unsolicit_interval); } - mutex_exit(&nce->nce_lock); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + mutex_exit(&ncec->ncec_lock); } - - /* - * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then - * we call nce_fastpath as soon as the nce is resolved in ndp_process. - * We call nce_fastpath from nce_update if the link layer address of - * the peer changes from nce_update - */ - if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) - nce_fastpath(nce); return (err); } +/* + * Atomically lookup and add (if needed) Neighbor Cache information for + * an address. + * + * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses + * are always added pointing at the ipmp_ill. Thus, when the ill passed + * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t + * entries will be created, both pointing at the same ncec_t. 
The nce_t + * entries will have their nce_ill set to the ipmp_ill and the under_ill + * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. + * Local addresses are always created on the ill passed to nce_add_v6. + */ int -ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, - const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, - uint16_t state, nce_t **newnce) +nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - int err = 0; - nce_t *nce; + int err = 0; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce, *upper_nce = NULL; + ill_t *in_ill = ill; + boolean_t need_ill_refrele = B_FALSE; + if (flags & NCE_F_MCAST) { + /* + * hw_addr will be figured out in nce_set_multicast_v6; + * caller has to select the cast_ill + */ + ASSERT(hw_addr == NULL); + ASSERT(!IS_IPMP(ill)); + err = nce_set_multicast_v6(ill, addr, flags, newnce); + return (err); + } ASSERT(ill->ill_isv6); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ENXIO); + need_ill_refrele = B_TRUE; + } - /* Get head of v6 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, match_illgrp, addr, nce); + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce = nce_lookup_addr(ill, addr); if (nce == NULL) { - err = ndp_add_v6(ill, - hw_addr, - addr, - mask, - extract_mask, - hw_extract_start, - flags, - state, - newnce); + err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, + &nce); } else { - *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + if (err == 0) + err = nce_add_v6_postprocess(nce); + if (in_ill != ill && nce != NULL) { + nce_t *under_nce; + + /* + * in_ill was the under_ill. Try to create the under_nce. 
+ * Hold the ill_g_lock to prevent changes to group membership + * until we are done. + */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(in_ill, ill)) { + under_nce = nce_fastpath_create(in_ill, + nce->nce_common); + upper_nce = nce; + if ((nce = under_nce) == NULL) + err = EINVAL; + } + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) + nce_fastpath_trigger(under_nce); + } + if (nce != NULL) { + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + } + /* nce_refrele is deferred until the lock is dropped */ + if (upper_nce != NULL) + nce_refrele(upper_nce); + if (need_ill_refrele) + ill_refrele(ill); return (err); } @@ -351,53 +404,51 @@ ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, * Remove all the CONDEMNED nces from the appropriate hash table. * We create a private list of NCEs, these may have ires pointing * to them, so the list will be passed through to clean up dependent - * ires and only then we can do NCE_REFRELE which can make NCE inactive. + * ires and only then we can do ncec_refrele() which can make NCE inactive. 
*/ static void -nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list) +nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) { - nce_t *nce1; - nce_t **ptpn; + ncec_t *ncec1; + ncec_t **ptpn; ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); ASSERT(ndp->ndp_g_walker == 0); - for (; nce; nce = nce1) { - nce1 = nce->nce_next; - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { - ptpn = nce->nce_ptpn; - nce1 = nce->nce_next; - if (nce1 != NULL) - nce1->nce_ptpn = ptpn; - *ptpn = nce1; - nce->nce_ptpn = NULL; - nce->nce_next = NULL; - nce->nce_next = *free_nce_list; - *free_nce_list = nce; + for (; ncec; ncec = ncec1) { + ncec1 = ncec->ncec_next; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { + ptpn = ncec->ncec_ptpn; + ncec1 = ncec->ncec_next; + if (ncec1 != NULL) + ncec1->ncec_ptpn = ptpn; + *ptpn = ncec1; + ncec->ncec_ptpn = NULL; + ncec->ncec_next = NULL; + ncec->ncec_next = *free_nce_list; + *free_nce_list = ncec; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } } /* - * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup() - * will return this NCE. Also no new IREs will be created that - * point to this NCE (See ire_add_v6). Also no new timeouts will - * be started (See NDP_RESTART_TIMER). + * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup() + * will return this NCE. Also no new timeouts will + * be started (See nce_restart_timer). * 2. Cancel any currently running timeouts. * 3. If there is an ndp walker, return. The walker will do the cleanup. * This ensures that walkers see a consistent list of NCEs while walking. * 4. Otherwise remove the NCE from the list of NCEs - * 5. Delete all IREs pointing to this NCE. 
*/ void -ndp_delete(nce_t *nce) +ncec_delete(ncec_t *ncec) { - nce_t **ptpn; - nce_t *nce1; - int ipversion = nce->nce_ipversion; + ncec_t **ptpn; + ncec_t *ncec1; + int ipversion = ncec->ncec_ipversion; ndp_g_t *ndp; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + ip_stack_t *ipst = ncec->ncec_ipst; if (ipversion == IPV4_VERSION) ndp = ipst->ips_ndp4; @@ -405,40 +456,42 @@ ndp_delete(nce_t *nce) ndp = ipst->ips_ndp6; /* Serialize deletes */ - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { /* Some other thread is doing the delete */ - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return; } /* * Caller has a refhold. Also 1 ref for being in the list. Thus * refcnt has to be >= 2 */ - ASSERT(nce->nce_refcnt >= 2); - nce->nce_flags |= NCE_F_CONDEMNED; - mutex_exit(&nce->nce_lock); + ASSERT(ncec->ncec_refcnt >= 2); + ncec->ncec_flags |= NCE_F_CONDEMNED; + mutex_exit(&ncec->ncec_lock); - nce_fastpath_list_delete(nce); + /* Count how many condemned ires for kmem_cache callback */ + atomic_add_32(&ipst->ips_num_nce_condemned, 1); + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); /* Complete any waiting callbacks */ - nce_cb_dispatch(nce); + ncec_cb_dispatch(ncec); /* * Cancel any running timer. Timeout can't be restarted - * since CONDEMNED is set. Can't hold nce_lock across untimeout. + * since CONDEMNED is set. Can't hold ncec_lock across untimeout. * Passing invalid timeout id is fine. 
*/ - if (nce->nce_timeout_id != 0) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + if (ncec->ncec_timeout_id != 0) { + (void) untimeout(ncec->ncec_timeout_id); + ncec->ncec_timeout_id = 0; } mutex_enter(&ndp->ndp_g_lock); - if (nce->nce_ptpn == NULL) { + if (ncec->ncec_ptpn == NULL) { /* - * The last ndp walker has already removed this nce from - * the list after we marked the nce CONDEMNED and before + * The last ndp walker has already removed this ncec from + * the list after we marked the ncec CONDEMNED and before * we grabbed the global lock. */ mutex_exit(&ndp->ndp_g_lock); @@ -454,62 +507,68 @@ ndp_delete(nce_t *nce) } /* - * Now remove the nce from the list. NDP_RESTART_TIMER won't restart + * Now remove the ncec from the list. nce_restart_timer won't restart * the timer since it is marked CONDEMNED. */ - ptpn = nce->nce_ptpn; - nce1 = nce->nce_next; - if (nce1 != NULL) - nce1->nce_ptpn = ptpn; - *ptpn = nce1; - nce->nce_ptpn = NULL; - nce->nce_next = NULL; + ptpn = ncec->ncec_ptpn; + ncec1 = ncec->ncec_next; + if (ncec1 != NULL) + ncec1->ncec_ptpn = ptpn; + *ptpn = ncec1; + ncec->ncec_ptpn = NULL; + ncec->ncec_next = NULL; mutex_exit(&ndp->ndp_g_lock); - nce_ire_delete(nce); + /* Removed from ncec_ptpn/ncec_next list */ + ncec_refrele_notr(ncec); } void -ndp_inactive(nce_t *nce) +ncec_inactive(ncec_t *ncec) { mblk_t **mpp; - ill_t *ill; + ill_t *ill = ncec->ncec_ill; + ip_stack_t *ipst = ncec->ncec_ipst; - ASSERT(nce->nce_refcnt == 0); - ASSERT(MUTEX_HELD(&nce->nce_lock)); - ASSERT(nce->nce_fastpath == NULL); + ASSERT(ncec->ncec_refcnt == 0); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - /* Free all nce allocated messages */ - mpp = &nce->nce_first_mp_to_free; - do { - while (*mpp != NULL) { - mblk_t *mp; + /* Count how many condemned nces for kmem_cache callback */ + if (NCE_ISCONDEMNED(ncec)) + atomic_add_32(&ipst->ips_num_nce_condemned, -1); - mp = *mpp; - *mpp = mp->b_next; + /* Free all allocated messages */ + mpp = 
&ncec->ncec_qd_mp; + while (*mpp != NULL) { + mblk_t *mp; - inet_freemsg(mp); - } - } while (mpp++ != &nce->nce_last_mp_to_free); + mp = *mpp; + *mpp = mp->b_next; - if (nce->nce_ipversion == IPV6_VERSION) { - /* - * must have been cleaned up in nce_delete - */ - ASSERT(list_is_empty(&nce->nce_cb)); - list_destroy(&nce->nce_cb); + inet_freemsg(mp); } + /* + * must have been cleaned up in ncec_delete + */ + ASSERT(list_is_empty(&ncec->ncec_cb)); + list_destroy(&ncec->ncec_cb); + /* + * free the ncec_lladdr if one was allocated in nce_add_common() + */ + if (ncec->ncec_lladdr_length > 0) + kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); + #ifdef DEBUG - nce_trace_cleanup(nce); + ncec_trace_cleanup(ncec); #endif - ill = nce->nce_ill; mutex_enter(&ill->ill_lock); DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt--; + (char *), "ncec", (void *), ncec); + ill->ill_ncec_cnt--; + ncec->ncec_ill = NULL; /* - * If the number of nce's associated with this ill have dropped + * If the number of ncec's associated with this ill have dropped * to zero, check whether we need to restart any operation that * is waiting for this to happen. */ @@ -519,104 +578,59 @@ ndp_inactive(nce_t *nce) } else { mutex_exit(&ill->ill_lock); } - mutex_destroy(&nce->nce_lock); - if (nce->nce_mp != NULL) - inet_freemsg(nce->nce_mp); + + mutex_destroy(&ncec->ncec_lock); + kmem_cache_free(ncec_cache, ncec); } /* - * ndp_walk routine. Delete the nce if it is associated with the ill + * ncec_walk routine. Delete the ncec if it is associated with the ill * that is going away. Always called as a writer. */ void -ndp_delete_per_ill(nce_t *nce, uchar_t *arg) +ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) { - if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) { - ndp_delete(nce); + if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { + ncec_delete(ncec); } } /* - * Walk a list of to be inactive NCEs and blow away all the ires. 
+ * Neighbor Cache cleanup logic for a list of ncec_t entries. */ static void -nce_ire_delete_list(nce_t *nce) +nce_cleanup_list(ncec_t *ncec) { - nce_t *nce_next; + ncec_t *ncec_next; - ASSERT(nce != NULL); - while (nce != NULL) { - nce_next = nce->nce_next; - nce->nce_next = NULL; + ASSERT(ncec != NULL); + while (ncec != NULL) { + ncec_next = ncec->ncec_next; + ncec->ncec_next = NULL; /* * It is possible for the last ndp walker (this thread) - * to come here after ndp_delete has marked the nce CONDEMNED - * and before it has removed the nce from the fastpath list + * to come here after ncec_delete has marked the ncec CONDEMNED + * and before it has removed the ncec from the fastpath list * or called untimeout. So we need to do it here. It is safe - * for both ndp_delete and this thread to do it twice or + * for both ncec_delete and this thread to do it twice or * even simultaneously since each of the threads has a - * reference on the nce. + * reference on the ncec. */ - nce_fastpath_list_delete(nce); + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); /* * Cancel any running timer. Timeout can't be restarted - * since CONDEMNED is set. Can't hold nce_lock across untimeout. - * Passing invalid timeout id is fine. + * since CONDEMNED is set. The ncec_lock can't be + * held across untimeout though passing invalid timeout + * id is fine. 
*/ - if (nce->nce_timeout_id != 0) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + if (ncec->ncec_timeout_id != 0) { + (void) untimeout(ncec->ncec_timeout_id); + ncec->ncec_timeout_id = 0; } - /* - * We might hit this func thus in the v4 case: - * ipif_down->ipif_ndp_down->ndp_walk - */ - - if (nce->nce_ipversion == IPV4_VERSION) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); - } else { - ASSERT(nce->nce_ipversion == IPV6_VERSION); - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); - } - NCE_REFRELE_NOTR(nce); - nce = nce_next; - } -} - -/* - * Delete an ire when the nce goes away. - */ -/* ARGSUSED */ -static void -nce_ire_delete(nce_t *nce) -{ - if (nce->nce_ipversion == IPV6_VERSION) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - nce_ire_delete1, (char *)nce, nce->nce_ill); - NCE_REFRELE_NOTR(nce); - } else { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - nce_ire_delete1, (char *)nce, nce->nce_ill); - NCE_REFRELE_NOTR(nce); - } -} - -/* - * ire_walk routine used to delete every IRE that shares this nce - */ -static void -nce_ire_delete1(ire_t *ire, char *nce_arg) -{ - nce_t *nce = (nce_t *)nce_arg; - - ASSERT(ire->ire_type == IRE_CACHE); - - if (ire->ire_nce == nce) { - ASSERT(ire->ire_ipversion == nce->nce_ipversion); - ire_delete(ire); + /* Removed from ncec_ptpn/ncec_next list */ + ncec_refrele_notr(ncec); + ncec = ncec_next; } } @@ -624,100 +638,97 @@ nce_ire_delete1(ire_t *ire, char *nce_arg) * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
*/ boolean_t -ndp_restart_dad(nce_t *nce) +nce_restart_dad(ncec_t *ncec) { boolean_t started; - boolean_t dropped; + ill_t *ill, *hwaddr_ill; - if (nce == NULL) + if (ncec == NULL) return (B_FALSE); - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_PROBE) { - mutex_exit(&nce->nce_lock); + ill = ncec->ncec_ill; + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_state == ND_PROBE) { + mutex_exit(&ncec->ncec_lock); started = B_TRUE; - } else if (nce->nce_state == ND_REACHABLE) { - nce->nce_state = ND_PROBE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + } else if (ncec->ncec_state == ND_REACHABLE) { + ASSERT(ncec->ncec_lladdr != NULL); + ncec->ncec_state = ND_PROBE; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + /* + * Slight cheat here: we don't use the initial probe delay + * for IPv4 in this obscure case. + */ + mutex_exit(&ncec->ncec_lock); + if (IS_IPMP(ill)) { + hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, + ncec->ncec_lladdr, ncec->ncec_lladdr_length); + } else { + hwaddr_ill = ill; } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill)); + nce_dad(ncec, hwaddr_ill, B_TRUE); started = B_TRUE; } else { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); started = B_FALSE; } return (started); } /* - * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed. - * If one is found, the refcnt on the nce will be incremented. + * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. + * If one is found, the refcnt on the ncec will be incremented. 
*/ -nce_t * -ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, - boolean_t caller_holds_lock) +ncec_t * +ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) { - nce_t *nce; - ip_stack_t *ipst = ill->ill_ipst; + ncec_t *ncec; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_isv6); - if (!caller_holds_lock) - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* Get head of v6 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, match_illgrp, addr, nce); - if (nce == NULL) - nce = nce_lookup_mapping(ill, addr); - if (!caller_holds_lock) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - return (nce); + ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); + ncec = ncec_lookup_illgrp(ill, addr, ncec); + mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ncec); } /* - * IPv4 Cache entry lookup. Try to find an nce matching the parameters passed. - * If one is found, the refcnt on the nce will be incremented. - * Since multicast mappings are handled in arp, there are no nce_mcast_entries - * so we skip the nce_lookup_mapping call. - * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL + * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. + * If one is found, the refcnt on the ncec will be incremented. 
*/ -nce_t * -ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) +ncec_t * +ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) { - nce_t *nce; + ncec_t *ncec = NULL; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; - if (!caller_holds_lock) - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); /* Get head of v4 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); + ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - /* - * NOTE: IPv4 never matches across the illgrp since the NCE's we're - * looking up have fastpath headers that are inherently per-ill. - */ - nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); - if (!caller_holds_lock) - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - return (nce); + ncec = ncec_lookup_illgrp(ill, &addr6, ncec); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ncec); } /* - * Cache entry lookup. Try to find an nce matching the parameters passed. - * Look only for exact entries (no mappings). If an nce is found, increment - * the hold count on that nce. The caller passes in the start of the - * appropriate hash table, and must be holding the appropriate global - * lock (ndp_g_lock). + * Cache entry lookup. Try to find an ncec matching the parameters passed. + * If an ncec is found, increment the hold count on that ncec. + * The caller passes in the start of the appropriate hash table, and must + * be holding the appropriate global lock (ndp_g_lock). In addition, since + * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock + * must be held as reader. + * + * This function always matches across the ipmp group. 
*/ -static nce_t * -nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, - nce_t *nce) +ncec_t * +ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; @@ -727,348 +738,246 @@ nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, else ndp = ipst->ips_ndp4; + ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); - for (; nce != NULL; nce = nce->nce_next) { - if (nce->nce_ill == ill || - match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) { - if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, - &ipv6_all_ones)) { - mutex_enter(&nce->nce_lock); - if (!(nce->nce_flags & NCE_F_CONDEMNED)) { - NCE_REFHOLD_LOCKED(nce); - mutex_exit(&nce->nce_lock); + for (; ncec != NULL; ncec = ncec->ncec_next) { + if (ncec->ncec_ill == ill || + IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { + if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { + mutex_enter(&ncec->ncec_lock); + if (!NCE_ISCONDEMNED(ncec)) { + ncec_refhold_locked(ncec); + mutex_exit(&ncec->ncec_lock); break; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } } } + return (ncec); +} + +/* + * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t + * entries for ill only, i.e., when ill is part of an ipmp group, + * nce_lookup_v4 will never try to match across the group. + */ +nce_t * +nce_lookup_v4(ill_t *ill, const in_addr_t *addr) +{ + nce_t *nce; + in6_addr_t addr6; + ip_stack_t *ipst = ill->ill_ipst; + + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); + nce = nce_lookup_addr(ill, &addr6); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); } /* - * Cache entry lookup. Try to find an nce matching the parameters passed. - * Look only for mappings. + * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t + * entries for ill only, i.e., when ill is part of an ipmp group, + * nce_lookup_v6 will never try to match across the group. */ +nce_t * +nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) +{ + nce_t *nce; + ip_stack_t *ipst = ill->ill_ipst; + + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce = nce_lookup_addr(ill, addr6); + mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + return (nce); +} + static nce_t * -nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) +nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) { - nce_t *nce; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(ill != NULL && ill->ill_isv6); - ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); - if (!IN6_IS_ADDR_MULTICAST(addr)) - return (NULL); - nce = ipst->ips_ndp6->nce_mask_entries; - for (; nce != NULL; nce = nce->nce_next) - if (nce->nce_ill == ill && - (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) { - mutex_enter(&nce->nce_lock); - if (!(nce->nce_flags & NCE_F_CONDEMNED)) { - NCE_REFHOLD_LOCKED(nce); - mutex_exit(&nce->nce_lock); - break; - } - mutex_exit(&nce->nce_lock); - } + ASSERT(ill != NULL); +#ifdef DEBUG + if (ill->ill_isv6) + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); + else + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); +#endif + mutex_enter(&ill->ill_lock); + nce = nce_lookup(ill, addr); + mutex_exit(&ill->ill_lock); return (nce); } + +/* + * Router turned to host. We need to make sure that cached copies of the ncec + * are not used for forwarding packets if they were derived from the default + * route, and that the default route itself is removed, as required by + * section 7.2.5 of RFC 2461. + * + * Note that the ncec itself probably has valid link-layer information for the + * nexthop, so that there is no reason to delete the ncec, as long as the + * ISROUTER flag is turned off. 
+ */ +static void +ncec_router_to_host(ncec_t *ncec) +{ + ire_t *ire; + ip_stack_t *ipst = ncec->ncec_ipst; + + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags &= ~NCE_F_ISROUTER; + mutex_exit(&ncec->ncec_lock); + + ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, + &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, + MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); + if (ire != NULL) { + ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); + ire_delete(ire); + ire_refrele(ire); + } +} + /* * Process passed in parameters either from an incoming packet or via * user ioctl. */ -static void -nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +void +nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { - ill_t *ill = nce->nce_ill; - uint32_t hw_addr_len = ill->ill_nd_lla_len; - mblk_t *mp; + ill_t *ill = ncec->ncec_ill; + uint32_t hw_addr_len = ill->ill_phys_addr_length; boolean_t ll_updated = B_FALSE; boolean_t ll_changed; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(nce->nce_ipversion == IPV6_VERSION); + ASSERT(ncec->ncec_ipversion == IPV6_VERSION); /* * No updates of link layer address or the neighbor state is * allowed, when the cache is in NONUD state. This still * allows for responding to reachability solicitation. */ - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_INCOMPLETE) { + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_state == ND_INCOMPLETE) { if (hw_addr == NULL) { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return; } - nce_set_ll(nce, hw_addr); + nce_set_ll(ncec, hw_addr); /* - * Update nce state and send the queued packets + * Update ncec state and send the queued packets * back to ip this time ire will be added. 
*/ if (flag & ND_NA_FLAG_SOLICITED) { - nce_update(nce, ND_REACHABLE, NULL); + nce_update(ncec, ND_REACHABLE, NULL); } else { - nce_update(nce, ND_STALE, NULL); - } - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - nce_cb_dispatch(nce); /* complete callbacks */ - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - while (mp != NULL) { - mblk_t *nxt_mp, *data_mp; - - nxt_mp = mp->b_next; - mp->b_next = NULL; - - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - if (data_mp->b_prev != NULL) { - ill_t *inbound_ill; - queue_t *fwdq = NULL; - uint_t ifindex; - - ifindex = (uint_t)(uintptr_t)data_mp->b_prev; - inbound_ill = ill_lookup_on_ifindex(ifindex, - B_TRUE, NULL, NULL, NULL, NULL, ipst); - if (inbound_ill == NULL) { - data_mp->b_prev = NULL; - freemsg(mp); - return; - } else { - fwdq = inbound_ill->ill_rq; - } - data_mp->b_prev = NULL; - /* - * Send a forwarded packet back into ip_rput_v6 - * just as in ire_send_v6(). - * Extract the queue from b_prev (set in - * ip_rput_data_v6). - */ - if (fwdq != NULL) { - /* - * Forwarded packets hop count will - * get decremented in ip_rput_data_v6 - */ - if (data_mp != mp) - freeb(mp); - put(fwdq, data_mp); - } else { - /* - * Send locally originated packets back - * into ip_wput_v6. 
- */ - put(ill->ill_wq, mp); - } - ill_refrele(inbound_ill); - } else { - put(ill->ill_wq, mp); - } - mp = nxt_mp; + nce_update(ncec, ND_STALE, NULL); } + mutex_exit(&ncec->ncec_lock); + nce = nce_fastpath(ncec, B_TRUE, NULL); + nce_resolv_ok(ncec); + if (nce != NULL) + nce_refrele(nce); return; } - ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len); + ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); if (!is_adv) { /* If this is a SOLICITATION request only */ if (ll_changed) - nce_update(nce, ND_STALE, hw_addr); - mutex_exit(&nce->nce_lock); - nce_cb_dispatch(nce); + nce_update(ncec, ND_STALE, hw_addr); + mutex_exit(&ncec->ncec_lock); + ncec_cb_dispatch(ncec); return; } if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { /* If in any other state than REACHABLE, ignore */ - if (nce->nce_state == ND_REACHABLE) { - nce_update(nce, ND_STALE, NULL); + if (ncec->ncec_state == ND_REACHABLE) { + nce_update(ncec, ND_STALE, NULL); } - mutex_exit(&nce->nce_lock); - nce_cb_dispatch(nce); + mutex_exit(&ncec->ncec_lock); + ncec_cb_dispatch(ncec); return; } else { if (ll_changed) { - nce_update(nce, ND_UNCHANGED, hw_addr); + nce_update(ncec, ND_UNCHANGED, hw_addr); ll_updated = B_TRUE; } if (flag & ND_NA_FLAG_SOLICITED) { - nce_update(nce, ND_REACHABLE, NULL); + nce_update(ncec, ND_REACHABLE, NULL); } else { if (ll_updated) { - nce_update(nce, ND_STALE, NULL); + nce_update(ncec, ND_STALE, NULL); } } - mutex_exit(&nce->nce_lock); - if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags & + mutex_exit(&ncec->ncec_lock); + if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & NCE_F_ISROUTER)) { - ire_t *ire; - - /* - * Router turned to host. We need to remove the - * entry as well as any default route that may be - * using this as a next hop. This is required by - * section 7.2.5 of RFC 2461. 
- */ - ire = ire_ftable_lookup_v6(&ipv6_all_zeros, - &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT, - nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW | - MATCH_IRE_DEFAULT, ipst); - if (ire != NULL) { - ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); - ire_delete(ire); - ire_refrele(ire); - } - ndp_delete(nce); /* will do nce_cb_dispatch */ + ncec_router_to_host(ncec); } else { - nce_cb_dispatch(nce); + ncec_cb_dispatch(ncec); } } } /* - * Walker state structure used by ndp_process() / ndp_process_entry(). - */ -typedef struct ndp_process_data { - ill_t *np_ill; /* ill/illgrp to match against */ - const in6_addr_t *np_addr; /* IPv6 address to match */ - uchar_t *np_hw_addr; /* passed to nce_process() */ - uint32_t np_flag; /* passed to nce_process() */ - boolean_t np_is_adv; /* passed to nce_process() */ -} ndp_process_data_t; - -/* - * Walker callback used by ndp_process() for IPMP groups: calls nce_process() - * for each NCE with a matching address that's in the same IPMP group. - */ -static void -ndp_process_entry(nce_t *nce, void *arg) -{ - ndp_process_data_t *npp = arg; - - if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) && - IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { - nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv); - } -} - -/* - * Wrapper around nce_process() that handles IPMP. In particular, for IPMP, - * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have - * more than one NCE for a given IPv6 address to tend to. In that case, we - * need to walk all NCEs and callback nce_process() for each one. Since this - * is expensive, in the non-IPMP case we just directly call nce_process(). - * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP - * interfaces in an IPMP group share the same NCEs -- at which point this - * function can be removed entirely. 
- */ -void -ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) -{ - ill_t *ill = nce->nce_ill; - struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6; - ndp_process_data_t np; - - if (ill->ill_grp == NULL) { - nce_process(nce, hw_addr, flag, is_adv); - return; - } - - /* IPMP case: walk all NCEs */ - np.np_ill = ill; - np.np_addr = &nce->nce_addr; - np.np_flag = flag; - np.np_is_adv = is_adv; - np.np_hw_addr = hw_addr; - - ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES); -} - -/* - * Pass arg1 to the pfi supplied, along with each nce in existence. - * ndp_walk() places a REFHOLD on the nce and drops the lock when + * Pass arg1 to the pfi supplied, along with each ncec in existence. + * ncec_walk() places a REFHOLD on the ncec and drops the lock when * walking the hash list. */ void -ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, +ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { - nce_t *nce; - nce_t *nce1; - nce_t **ncep; - nce_t *free_nce_list = NULL; + ncec_t *ncec; + ncec_t *ncec1; + ncec_t **ncep; + ncec_t *free_nce_list = NULL; mutex_enter(&ndp->ndp_g_lock); - /* Prevent ndp_delete from unlink and free of NCE */ + /* Prevent ncec_delete from unlink and free of NCE */ ndp->ndp_g_walker++; mutex_exit(&ndp->ndp_g_lock); for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { - for (nce = *ncep; nce != NULL; nce = nce1) { - nce1 = nce->nce_next; - if (ill == NULL || nce->nce_ill == ill) { + for (ncec = *ncep; ncec != NULL; ncec = ncec1) { + ncec1 = ncec->ncec_next; + if (ill == NULL || ncec->ncec_ill == ill) { if (trace) { - NCE_REFHOLD(nce); - (*pfi)(nce, arg1); - NCE_REFRELE(nce); + ncec_refhold(ncec); + (*pfi)(ncec, arg1); + ncec_refrele(ncec); } else { - NCE_REFHOLD_NOTR(nce); - (*pfi)(nce, arg1); - NCE_REFRELE_NOTR(nce); + ncec_refhold_notr(ncec); + (*pfi)(ncec, arg1); + ncec_refrele_notr(ncec); } } } } - for (nce = ndp->nce_mask_entries; nce != NULL; 
nce = nce1) { - nce1 = nce->nce_next; - if (ill == NULL || nce->nce_ill == ill) { - if (trace) { - NCE_REFHOLD(nce); - (*pfi)(nce, arg1); - NCE_REFRELE(nce); - } else { - NCE_REFHOLD_NOTR(nce); - (*pfi)(nce, arg1); - NCE_REFRELE_NOTR(nce); - } - } - } mutex_enter(&ndp->ndp_g_lock); ndp->ndp_g_walker--; - /* - * While NCE's are removed from global list they are placed - * in a private list, to be passed to nce_ire_delete_list(). - * The reason is, there may be ires pointing to this nce - * which needs to cleaned up. - */ if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { /* Time to delete condemned entries */ for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { - nce = *ncep; - if (nce != NULL) { - nce_remove(ndp, nce, &free_nce_list); + ncec = *ncep; + if (ncec != NULL) { + nce_remove(ndp, ncec, &free_nce_list); } } - nce = ndp->nce_mask_entries; - if (nce != NULL) { - nce_remove(ndp, nce, &free_nce_list); - } ndp->ndp_g_walker_cleanup = B_FALSE; } mutex_exit(&ndp->ndp_g_lock); if (free_nce_list != NULL) { - nce_ire_delete_list(free_nce_list); + nce_cleanup_list(free_nce_list); } } @@ -1077,198 +986,10 @@ ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, * Note that ill can be NULL hence can't derive the ipst from it. */ void -ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) -{ - ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); - ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); -} - -/* - * Process resolve requests. Handles both mapped entries - * as well as cases that needs to be send out on the wire. - * Lookup a NCE for a given IRE. Regardless of whether one exists - * or one is created, we defer making ire point to nce until the - * ire is actually added at which point the nce_refcnt on the nce is - * incremented. This is done primarily to have symmetry between ire_add() - * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 
- */ -int -ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) -{ - nce_t *nce, *hw_nce = NULL; - int err; - ill_t *ipmp_ill; - uint16_t nce_flags; - mblk_t *mp_nce = NULL; - ip_stack_t *ipst = ill->ill_ipst; - uchar_t *hwaddr = NULL; - - ASSERT(ill->ill_isv6); - - if (IN6_IS_ADDR_MULTICAST(dst)) - return (nce_set_multicast(ill, dst)); - - nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; - - /* - * If `ill' is under IPMP, then first check to see if there's an NCE - * for `dst' on the IPMP meta-interface (e.g., because an application - * explicitly did an SIOCLIFSETND to tie a hardware address to `dst'). - * If so, we use that hardware address when creating the NCE below. - * Note that we don't yet have a mechanism to remove these NCEs if the - * NCE for `dst' on the IPMP meta-interface is subsequently removed -- - * but rather than build such a beast, we should fix NCEs so that they - * can be properly shared across an IPMP group. - */ - if (IS_UNDER_IPMP(ill)) { - if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { - hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE); - if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) { - hwaddr = hw_nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ipmp_ill); - nce_flags |= hw_nce->nce_flags; - } - ill_refrele(ipmp_ill); - } - } - - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* NCE fastpath is per ill; don't match across group */ - hwaddr, - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - nce_flags, - hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE, - &nce); - - if (hw_nce != NULL) - NCE_REFRELE(hw_nce); - - switch (err) { - case 0: - /* - * New cache entry was created. Make sure that the state - * is not ND_INCOMPLETE. It can be in some other state - * even before we send out the solicitation as we could - * get un-solicited advertisements. - * - * If this is an XRESOLV interface, simply return 0, - * since we don't want to solicit just yet. 
- */ - if (ill->ill_flags & ILLF_XRESOLV) { - NCE_REFRELE(nce); - return (0); - } - - mutex_enter(&nce->nce_lock); - if (nce->nce_state != ND_INCOMPLETE) { - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - return (0); - } - if (nce->nce_rcnt == 0) { - /* The caller will free mp */ - mutex_exit(&nce->nce_lock); - ndp_delete(nce); - NCE_REFRELE(nce); - return (ESRCH); - } - mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); - if (mp_nce == NULL) { - /* The caller will free mp */ - mutex_exit(&nce->nce_lock); - ndp_delete(nce); - NCE_REFRELE(nce); - return (ENOMEM); - } - nce_queue_mp(nce, mp_nce); - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - return (EINPROGRESS); - case EEXIST: - /* Resolution in progress just queue the packet */ - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_INCOMPLETE) { - mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); - if (mp_nce == NULL) { - err = ENOMEM; - } else { - nce_queue_mp(nce, mp_nce); - err = EINPROGRESS; - } - } else { - /* - * Any other state implies we have - * a nce but IRE needs to be added ... - * ire_add_v6() will take care of the - * the case when the nce becomes CONDEMNED - * before the ire is added to the table. - */ - err = 0; - } - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - break; - default: - ip1dbg(("ndp_resolver: Can't create NCE %d\n", err)); - break; - } - return (err); -} - -/* - * When there is no resolver, the link layer template is passed in - * the IRE. - * Lookup a NCE for a given IRE. Regardless of whether one exists - * or one is created, we defer making ire point to nce until the - * ire is actually added at which point the nce_refcnt on the nce is - * incremented. This is done primarily to have symmetry between ire_add() - * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 
- */ -int -ndp_noresolver(ill_t *ill, const in6_addr_t *dst) +ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) { - nce_t *nce; - int err = 0; - - ASSERT(ill != NULL); - ASSERT(ill->ill_isv6); - if (IN6_IS_ADDR_MULTICAST(dst)) { - err = nce_set_multicast(ill, dst); - return (err); - } - - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* NCE fastpath is per ill; don't match across group */ - ill->ill_dest_addr, /* hardware address is NULL in most cases */ - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, - ND_REACHABLE, - &nce); - - switch (err) { - case 0: - /* - * Cache entry with a proper resolver cookie was - * created. - */ - NCE_REFRELE(nce); - break; - case EEXIST: - err = 0; - NCE_REFRELE(nce); - break; - default: - ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err)); - break; - } - return (err); + ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); + ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); } /* @@ -1277,83 +998,73 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) * multicast destination. */ static int -nce_set_multicast(ill_t *ill, const in6_addr_t *dst) +nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, + uint16_t flags, nce_t **newnce) { - nce_t *mnce; /* Multicast mapping entry */ - nce_t *nce; - uchar_t *hw_addr = NULL; + uchar_t *hw_addr; int err = 0; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; ASSERT(ill != NULL); ASSERT(ill->ill_isv6); ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); - nce = nce_lookup_addr(ill, B_FALSE, dst, nce); + nce = nce_lookup_addr(ill, dst); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - NCE_REFRELE(nce); - return (0); - } - /* No entry, now lookup for a mapping this should never fail */ - mnce = nce_lookup_mapping(ill, dst); - if (mnce == NULL) { - /* Something broken for the interface. 
*/ - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - return (ESRCH); + goto done; } - ASSERT(mnce->nce_flags & NCE_F_MAPPING); if (ill->ill_net_type == IRE_IF_RESOLVER) { /* * For IRE_IF_RESOLVER a hardware mapping can be - * generated, for IRE_IF_NORESOLVER, resolution cookie - * in the ill is copied in ndp_add_v6(). + * generated. */ hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); if (hw_addr == NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - NCE_REFRELE(mnce); return (ENOMEM); } - nce_make_mapping(mnce, hw_addr, (uchar_t *)dst); + ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); + } else { + /* + * So no hw_addr is needed for IRE_IF_NORESOLVER. + */ + hw_addr = NULL; } - NCE_REFRELE(mnce); - /* - * IRE_IF_NORESOLVER type simply copies the resolution - * cookie passed in. So no hw_addr is needed. - */ - err = ndp_add_v6(ill, - hw_addr, - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - NCE_F_NONUD, - ND_REACHABLE, - &nce); + ASSERT((flags & NCE_F_MCAST) != 0); + ASSERT((flags & NCE_F_NONUD) != 0); + /* nce_state will be computed by nce_add_common() */ + err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, + ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + if (err == 0) + err = nce_add_v6_postprocess(nce); if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_nd_lla_len); if (err != 0) { - ip1dbg(("nce_set_multicast: create failed" "%d\n", err)); + ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); return (err); } - NCE_REFRELE(nce); +done: + ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); return (0); } /* - * Return the link layer address, and any flags of a nce. + * Return the link layer address, and any flags of a ncec. 
*/ int ndp_query(ill_t *ill, struct lif_nd_req *lnr) { - nce_t *nce; + ncec_t *ncec; in6_addr_t *addr; sin6_t *sin6; - dl_unitdata_req_t *dl; ASSERT(ill != NULL && ill->ill_isv6); sin6 = (sin6_t *)&lnr->lnr_addr; @@ -1363,158 +1074,135 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr) * NOTE: if the ill is an IPMP interface, then match against the whole * illgrp. This e.g. allows in.ndpd to retrieve the link layer * addresses for the data addresses on an IPMP interface even though - * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill. + * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. */ - nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE); - if (nce == NULL) + ncec = ncec_lookup_illgrp_v6(ill, addr); + if (ncec == NULL) return (ESRCH); - /* If in INCOMPLETE state, no link layer address is available yet */ - if (!NCE_ISREACHABLE(nce)) { - NCE_REFRELE(nce); + /* If no link layer address is available yet, return ESRCH */ + if (!NCE_ISREACHABLE(ncec)) { + ncec_refrele(ncec); return (ESRCH); } - dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr; - if (ill->ill_flags & ILLF_XRESOLV) - lnr->lnr_hdw_len = dl->dl_dest_addr_length; - else - lnr->lnr_hdw_len = ill->ill_nd_lla_len; - ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <= - sizeof (lnr->lnr_hdw_addr)); - bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), - (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len); - if (nce->nce_flags & NCE_F_ISROUTER) + lnr->lnr_hdw_len = ill->ill_phys_addr_length; + bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, + lnr->lnr_hdw_len); + if (ncec->ncec_flags & NCE_F_ISROUTER) lnr->lnr_flags = NDF_ISROUTER_ON; - if (nce->nce_flags & NCE_F_ANYCAST) + if (ncec->ncec_flags & NCE_F_ANYCAST) lnr->lnr_flags |= NDF_ANYCAST_ON; - NCE_REFRELE(nce); + ncec_refrele(ncec); return (0); } /* - * Send Enable/Disable multicast reqs to driver. + * Finish setting up the Enable/Disable multicast for the driver. 
*/ -int -ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, +mblk_t * +ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, uint32_t hw_addr_offset, mblk_t *mp) { - nce_t *nce; uchar_t *hw_addr; - ip_stack_t *ipst = ill->ill_ipst; + ipaddr_t v4group; + uchar_t *addr; - ASSERT(ill != NULL && ill->ill_isv6); ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); - hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); - if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) { - freemsg(mp); - return (EINVAL); + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); + + ASSERT(CLASSD(v4group)); + ASSERT(!(ill->ill_isv6)); + + addr = (uchar_t *)&v4group; + } else { + ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); + ASSERT(ill->ill_isv6); + + addr = (uchar_t *)v6group; } - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - nce = nce_lookup_mapping(ill, addr); - if (nce == NULL) { - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); + if (hw_addr == NULL) { + ip0dbg(("ndp_mcastreq NULL hw_addr\n")); freemsg(mp); - return (ESRCH); - } - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - /* - * Update dl_addr_length and dl_addr_offset for primitives that - * have physical addresses as opposed to full saps - */ - switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { - case DL_ENABMULTI_REQ: - /* Track the state if this is the first enabmulti */ - if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) - ill->ill_dlpi_multicast_state = IDS_INPROGRESS; - ip1dbg(("ndp_mcastreq: ENABMULTI\n")); - break; - case DL_DISABMULTI_REQ: - ip1dbg(("ndp_mcastreq: DISABMULTI\n")); - break; - default: - NCE_REFRELE(nce); - ip1dbg(("ndp_mcastreq: default\n")); - return (EINVAL); + return (NULL); } - nce_make_mapping(nce, hw_addr, (uchar_t *)addr); - NCE_REFRELE(nce); - ill_dlpi_send(ill, mp); - return (0); -} + ip_mcast_mapping(ill, addr, hw_addr); + return (mp); +} -/* - * Send out a NS for 
resolving the ip address in nce. - */ void -ip_ndp_resolve(nce_t *nce) +ip_ndp_resolve(ncec_t *ncec) { + in_addr_t sender4 = INADDR_ANY; in6_addr_t sender6 = ipv6_all_zeros; + ill_t *src_ill; uint32_t ms; - mblk_t *mp; - ip6_t *ip6h; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - /* - * Pick the src from outgoing packet, if one is available. - * Otherwise let nce_xmit figure out the src. - */ - if ((mp = nce->nce_qd_mp) != NULL) { - /* Handle ip_newroute_v6 giving us IPSEC packets */ - if (mp->b_datap->db_type == M_CTL) - mp = mp->b_cont; - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because - * the message could be from the nce_qd_mp which could - * have b_next/b_prev non-NULL. - */ - ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); - ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); - } - sender6 = ip6h->ip6_src; + src_ill = nce_resolve_src(ncec, &sender6); + if (src_ill == NULL) { + /* Make sure we try again later */ + ms = ncec->ncec_ill->ill_reachable_retrans_time; + nce_restart_timer(ncec, (clock_t)ms); + return; } - ms = nce_solicit(nce, sender6); - mutex_exit(&nce->nce_lock); + if (ncec->ncec_ipversion == IPV4_VERSION) + IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_ipversion == IPV6_VERSION) + ms = ndp_solicit(ncec, sender6, src_ill); + else + ms = arp_request(ncec, sender4, src_ill); + mutex_exit(&ncec->ncec_lock); if (ms == 0) { - if (nce->nce_state != ND_REACHABLE) { - nce_resolv_failed(nce); - ndp_delete(nce); + if (ncec->ncec_state != ND_REACHABLE) { + if (ncec->ncec_ipversion == IPV6_VERSION) + ndp_resolv_failed(ncec); + else + arp_resolv_failed(ncec); + ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); + nce_make_unreachable(ncec); + ncec_delete(ncec); } } else { - NDP_RESTART_TIMER(nce, (clock_t)ms); + nce_restart_timer(ncec, (clock_t)ms); } - mutex_enter(&nce->nce_lock); +done: + 
ill_refrele(src_ill); } /* - * Send a neighbor solicitation. + * Send an IPv6 neighbor solicitation. * Returns number of milliseconds after which we should either rexmit or abort. * Return of zero means we should abort. - * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt. + * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. + * The optional source address is used as a hint to ndp_solicit for + * which source to use in the packet. * - * NOTE: This routine drops nce_lock (and later reacquires it) when sending + * NOTE: This routine drops ncec_lock (and later reacquires it) when sending * the packet. */ uint32_t -nce_solicit(nce_t *nce, in6_addr_t sender) +ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) { - boolean_t dropped; + in6_addr_t dst; + boolean_t dropped = B_FALSE; - ASSERT(nce->nce_ipversion == IPV6_VERSION); - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(ncec->ncec_ipversion == IPV6_VERSION); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (nce->nce_rcnt == 0) + if (ncec->ncec_rcnt == 0) return (0); - nce->nce_rcnt--; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0); - mutex_enter(&nce->nce_lock); + dst = ncec->ncec_addr; + ncec->ncec_rcnt--; + mutex_exit(&ncec->ncec_lock); + dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, + ill->ill_phys_addr_length, &src, &dst, 0); + mutex_enter(&ncec->ncec_lock); if (dropped) - nce->nce_rcnt++; - return (nce->nce_ill->ill_reachable_retrans_time); + ncec->ncec_rcnt++; + return (ncec->ncec_ill->ill_reachable_retrans_time); } /* @@ -1528,23 +1216,30 @@ nce_solicit(nce_t *nce, in6_addr_t sender) * ip_ndp_excl. 
*/ /* ARGSUSED */ -static void -ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +void +ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - in6_addr_t *addr = (in6_addr_t *)mp->b_rptr; + in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; + in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; + boolean_t addr_equal; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* * We do not support recovery of proxy ARP'd interfaces, * because the system lacks a complete proxy ARP mechanism. */ - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) { - continue; + if (ill->ill_isv6) { + addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + addr6); + } else { + addr_equal = (ipif->ipif_lcl_addr == *addr4); } + if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) + continue; + /* * If we have already recovered or if the interface is going * away, then ignore. @@ -1561,13 +1256,20 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; - VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); - (void) ipif_up_done_v6(ipif); + if (ill->ill_isv6) { + VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); + (void) ipif_up_done_v6(ipif); + } else { + VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != + EINPROGRESS); + (void) ipif_up_done(ipif); + } } freeb(mp); } /* + * * Attempt to recover an IPv6 interface that's been shut down as a duplicate. * As long as someone else holds the address, the interface will stay down. * When that conflict goes away, the interface is brought back up. This is @@ -1579,8 +1281,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) * * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 
*/ -static void -ipif6_dup_recovery(void *arg) +void +ipif_dup_recovery(void *arg) { ipif_t *ipif = arg; @@ -1598,7 +1300,7 @@ ipif6_dup_recovery(void *arg) if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) return; - ndp_do_recovery(ipif); + ipif_do_recovery(ipif); } /* @@ -1608,18 +1310,24 @@ ipif6_dup_recovery(void *arg) * Called both by recovery timer expiry and link-up notification. */ void -ndp_do_recovery(ipif_t *ipif) +ipif_do_recovery(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *mp; ip_stack_t *ipst = ill->ill_ipst; + size_t mp_size; - mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED); + if (ipif->ipif_isv6) + mp_size = sizeof (ipif->ipif_v6lcl_addr); + else + mp_size = sizeof (ipif->ipif_lcl_addr); + mp = allocb(mp_size, BPRI_MED); if (mp == NULL) { mutex_enter(&ill->ill_lock); - if (ipif->ipif_recovery_id == 0 && + if (ipst->ips_ip_dup_recovery > 0 && + ipif->ipif_recovery_id == 0 && !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { - ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); @@ -1632,10 +1340,15 @@ ndp_do_recovery(ipif_t *ipif) (void) untimeout(ipif->ipif_recovery_id); ipif->ipif_recovery_id = 0; - bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, - sizeof (ipif->ipif_v6lcl_addr)); + if (ipif->ipif_isv6) { + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, + sizeof (ipif->ipif_v6lcl_addr)); + } else { + bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, + sizeof (ipif->ipif_lcl_addr)); + } ill_refhold(ill); - qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP, + qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, B_FALSE); } } @@ -1644,80 +1357,19 @@ ndp_do_recovery(ipif_t *ipif) * Find the MAC and IP addresses in an NA/NS message. 
*/ static void -ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp, - uchar_t **haddr, uint_t *haddrlenp) +ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, + in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) { - ip6_t *ip6h = (ip6_t *)mp->b_rptr; icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); - nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; uchar_t *addr; - int alen = 0; + int alen; - if (dl_mp == NULL) { - nd_opt_hdr_t *opt = NULL; - int len; - - /* - * If it's from the fast-path, then it can't be a probe - * message, and thus must include a linkaddr option. - * Extract that here. - */ - switch (icmp6->icmp6_type) { - case ND_NEIGHBOR_SOLICIT: - len = mp->b_wptr - (uchar_t *)ns; - if ((len -= sizeof (*ns)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), - len, ND_OPT_SOURCE_LINKADDR); - } - break; - case ND_NEIGHBOR_ADVERT: - len = mp->b_wptr - (uchar_t *)na; - if ((len -= sizeof (*na)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(na + 1), - len, ND_OPT_TARGET_LINKADDR); - } - break; - } - - if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= - ill->ill_nd_lla_len) { - addr = (uchar_t *)(opt + 1); - alen = ill->ill_nd_lla_len; - } - - /* - * We cheat a bit here for the sake of printing usable log - * messages in the rare case where the reply we got was unicast - * without a source linkaddr option, and the interface is in - * fastpath mode. (Sigh.) 
- */ - if (alen == 0 && ill->ill_type == IFT_ETHER && - MBLKHEAD(mp) >= sizeof (struct ether_header)) { - struct ether_header *pether; - - pether = (struct ether_header *)((char *)ip6h - - sizeof (*pether)); - addr = pether->ether_shost.ether_addr_octet; - alen = ETHERADDRL; - } - } else { - dl_unitdata_ind_t *dlu; - - dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr; - alen = dlu->dl_src_addr_length; - if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) && - dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) { - addr = dl_mp->b_rptr + dlu->dl_src_addr_offset; - if (ill->ill_sap_length < 0) { - alen += ill->ill_sap_length; - } else { - addr += ill->ill_sap_length; - alen -= ill->ill_sap_length; - } - } - } + /* icmp_inbound_v6 ensures this */ + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + addr = ira->ira_l2src; + alen = ill->ill_phys_addr_length; if (alen > 0) { *haddr = addr; *haddrlenp = alen; @@ -1740,35 +1392,58 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - mblk_t *dl_mp = NULL; uchar_t *haddr; uint_t haddrlen; ip_stack_t *ipst = ill->ill_ipst; in6_addr_t targ; - - if (DB_TYPE(mp) != M_DATA) { - dl_mp = mp; - mp = mp->b_cont; + ip_recv_attr_t iras; + mblk_t *attrmp; + + attrmp = mp; + mp = mp->b_cont; + attrmp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(attrmp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_recv_attr_from_mblk", mp, ill); + freemsg(mp); + ira_cleanup(&iras, B_TRUE); + return; } - ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); + ASSERT(ill == iras.ira_rill); + + ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { /* * Ignore conflicts generated by misbehaving switches that * just reflect our own messages back to us. For IPMP, we may * see reflections across any ill in the illgrp. 
+ * + * RFC2462 and revisions tried to detect both the case + * when a statically configured IPv6 address is a duplicate, + * and the case when the L2 address itself is a duplicate. The + * latter is important because, with stateless address autoconf, + * if the L2 address is a duplicate, the resulting IPv6 + * address(es) would also be duplicates. We rely on DAD of the + * IPv6 address itself to detect the latter case. */ + /* For an under ill, ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || IS_UNDER_IPMP(ill) && - ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL) + ipmp_illgrp_find_ill(ill->ill_grp, haddr, + haddrlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); goto ignore_conflict; + } + rw_exit(&ipst->ips_ill_g_lock); } /* * Look up the appropriate ipif. */ - ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL, - NULL, ipst); + ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); if (ipif == NULL) goto ignore_conflict;
Note - * that tearing down the ipif also means deleting the nce through ipif_down, so - * it's not possible to do recovery by just restarting the nce timer. Instead, + * that tearing down the ipif also means deleting the ncec through ipif_down, so + * it's not possible to do recovery by just restarting the ncec timer. Instead, * we start a timer on the ipif. + * Caller has to free mp; */ static void -ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) { + const uchar_t *haddr; + ill_t *ill = ira->ira_rill; + + /* + * Ignore conflicts generated by misbehaving switches that just + * reflect our own messages back to us. + */ + + /* icmp_inbound_v6 ensures this */ + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + haddr = ira->ira_l2src; + if (haddr != NULL && + bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + return; + } + if ((mp = copymsg(mp)) != NULL) { - if (dl_mp == NULL) - dl_mp = mp; - else if ((dl_mp = copyb(dl_mp)) != NULL) - dl_mp->b_cont = mp; - if (dl_mp == NULL) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); } else { + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + mp = attrmp; ill_refhold(ill); - qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP, + qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, B_FALSE); } } @@ -1848,20 +1544,39 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * Handle a discovered conflict: some other system is advertising that it owns * one of our IP addresses. We need to defend ourselves, or just shut down the * interface. 
+ * + * Handles both IPv4 and IPv6 */ -static void -ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +boolean_t +ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) { - ipif_t *ipif; - uint32_t now; - uint_t maxdefense; - uint_t defs; - ip_stack_t *ipst = ill->ill_ipst; + ipif_t *ipif; + clock_t now; + uint_t maxdefense; + uint_t defs; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint32_t elapsed; + boolean_t isv6 = ill->ill_isv6; + ipaddr_t ncec_addr; - ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, - NULL, NULL, ipst); + if (isv6) { + ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, + ipst); + } else { + if (arp_no_defense) { + /* + * Yes, there is a conflict, but no, we do not + * defend ourself. + */ + return (B_TRUE); + } + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, + ipst); + } if (ipif == NULL) - return; + return (B_FALSE); /* * First, figure out if this address is disposable. @@ -1875,50 +1590,51 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) * Now figure out how many times we've defended ourselves. Ignore * defenses that happened long in the past. */ - now = gethrestime_sec(); - mutex_enter(&nce->nce_lock); - if ((defs = nce->nce_defense_count) > 0 && - now - nce->nce_defense_time > ipst->ips_ip_defend_interval) { - nce->nce_defense_count = defs = 0; + now = ddi_get_lbolt(); + elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; + mutex_enter(&ncec->ncec_lock); + if ((defs = ncec->ncec_defense_count) > 0 && + elapsed > ipst->ips_ip_defend_interval) { + /* + * ip_defend_interval has elapsed. + * reset the defense count. 
+ */ + ncec->ncec_defense_count = defs = 0; } - nce->nce_defense_count++; - nce->nce_defense_time = now; - mutex_exit(&nce->nce_lock); + ncec->ncec_defense_count++; + ncec->ncec_last_time_defended = now; + mutex_exit(&ncec->ncec_lock); ipif_refrele(ipif); /* * If we've defended ourselves too many times already, then give up and - * tear down the interface(s) using this address. Otherwise, defend by - * sending out an unsolicited Neighbor Advertisement. + * tear down the interface(s) using this address. + * Otherwise, caller has to defend by sending out an announce. */ if (defs >= maxdefense) { - ip_ndp_failure(ill, mp, dl_mp); + if (isv6) + ndp_failure(mp, ira); + else + arp_failure(mp, ira); } else { - char hbuf[MAC_STR_LEN]; - char sbuf[INET6_ADDRSTRLEN]; - uchar_t *haddr; - uint_t haddrlen; - in6_addr_t targ; - - ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); - cmn_err(CE_WARN, "node %s is using our IP address %s on %s", - mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)), - inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), - ill->ill_name); - - (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0); + return (B_TRUE); /* caller must defend this address */ } + return (B_FALSE); } +/* + * Handle reception of Neighbor Solicitation messages. 
+ */ static void -ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill, *under_ill; nd_neighbor_solicit_t *ns; - uint32_t hlen = ill->ill_nd_lla_len; + uint32_t hlen = ill->ill_phys_addr_length; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; - nce_t *our_nce = NULL; + ncec_t *our_ncec = NULL; in6_addr_t target; in6_addr_t src; int len; @@ -1926,6 +1642,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) nd_opt_hdr_t *opt = NULL; boolean_t bad_solicit = B_FALSE; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; + boolean_t need_ill_refrele = B_FALSE; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); @@ -1951,7 +1668,6 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) bad_solicit = B_TRUE; goto done; } - } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ @@ -1974,20 +1690,20 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * e.g. the IPMP ill's data link-local. So we match across the illgrp * to ensure we find the associated NCE. */ - our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE); + our_ncec = ncec_lookup_illgrp_v6(ill, &target); /* - * If this is a valid Solicitation, a permanent - * entry should exist in the cache + * If this is a valid Solicitation for an address we are publishing, + * then a PUBLISH entry should exist in the cache */ - if (our_nce == NULL || - !(our_nce->nce_flags & NCE_F_PERMANENT)) { + if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 
"ifname=%s ", ill->ill_name)); if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg(" dst %s\n", AF_INET6, &target); } - bad_solicit = B_TRUE; + if (our_ncec == NULL) + bad_solicit = B_TRUE; goto done; } @@ -1998,7 +1714,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { - ip1dbg(("ndp_input_solicit: bad SLLA\n")); + ip1dbg(("ndp_input_advert: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } @@ -2010,7 +1726,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) flag |= NDP_UNICAST; /* - * Create/update the entry for the soliciting node. + * Create/update the entry for the soliciting node on the ipmp_ill. * or respond to outstanding queries, don't if * the source is unspecified address. */ @@ -2035,7 +1751,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * process of verifying the address, then don't respond at all * and don't keep track of the sender. */ - if (our_nce->nce_state == ND_PROBE) + if (our_ncec->ncec_state == ND_PROBE) goto done; /* @@ -2048,27 +1764,37 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) if (haddr == NULL) goto no_source; - err = ndp_lookup_then_add_v6(ill, - B_FALSE, - haddr, + under_ill = ill; + if (IS_UNDER_IPMP(under_ill)) { + ill = ipmp_ill_hold_ipmp_ill(under_ill); + if (ill == NULL) + ill = under_ill; + else + need_ill_refrele = B_TRUE; + } + err = nce_lookup_then_add_v6(ill, + haddr, hlen, &src, /* Soliciting nodes address */ - &ipv6_all_ones, - &ipv6_all_zeros, - 0, 0, ND_STALE, &nnce); + + if (need_ill_refrele) { + ill_refrele(ill); + ill = under_ill; + need_ill_refrele = B_FALSE; + } switch (err) { case 0: /* done with this entry */ - NCE_REFRELE(nnce); + nce_refrele(nnce); break; case EEXIST: /* * B_FALSE indicates this is not an an advertisement. 
*/ - ndp_process(nnce, haddr, 0, B_FALSE); - NCE_REFRELE(nnce); + nce_process(nnce->nce_common, haddr, 0, B_FALSE); + nce_refrele(nnce); break; default: ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", @@ -2088,19 +1814,18 @@ no_source: bad_solicit = B_TRUE; goto done; } - if (our_nce->nce_state == ND_PROBE) { + if (our_ncec->ncec_state == ND_PROBE) { /* - * Internally looped-back probes won't have DLPI - * attached to them. External ones (which are sent by - * multicast) always will. Just ignore our own + * Internally looped-back probes will have + * IRAF_L2SRC_LOOPBACK set so we can ignore our own * transmissions. */ - if (dl_mp != NULL) { + if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { /* * If someone else is probing our address, then * we've crossed wires. Declare failure. */ - ip_ndp_failure(ill, mp, dl_mp); + ndp_failure(mp, ira); } goto done; } @@ -2110,24 +1835,34 @@ no_source: */ src = ipv6_all_hosts_mcast; } - /* Response to a solicitation */ - (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag); + flag |= nce_advert_flags(our_ncec); + (void) ndp_xmit(ill, + ND_NEIGHBOR_ADVERT, + our_ncec->ncec_lladdr, + our_ncec->ncec_lladdr_length, + &target, /* Source and target of the advertisement pkt */ + &src, /* IP Destination (source of original pkt) */ + flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); - if (our_nce != NULL) - NCE_REFRELE(our_nce); + if (our_ncec != NULL) + ncec_refrele(our_ncec); } +/* + * Handle reception of Neighbor Advertisement messages + */ void -ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill; nd_neighbor_advert_t *na; - uint32_t hlen = ill->ill_nd_lla_len; + uint32_t hlen = ill->ill_phys_addr_length; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; - nce_t *dst_nce = NULL; + ncec_t *dst_ncec = NULL; in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; @@ -2138,6 +1873,7 @@ ndp_input_advert(ill_t *ill, mblk_t
*mp, mblk_t *dl_mp) icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; na = (nd_neighbor_advert_t *)icmp_nd; + if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { ip1dbg(("ndp_input_advert: Target is multicast but the " @@ -2179,17 +1915,25 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * our local addresses, and those are spread across all the active * ills in the group. */ - if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL) + if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) return; - if (dst_nce->nce_flags & NCE_F_PERMANENT) { + if (NCE_PUBLISH(dst_ncec)) { /* - * Someone just advertised one of our local addresses. First, + * Someone just advertised an address that we publish. First, * check if it was us -- if so, we can safely ignore it. + * We don't get the haddr from the ira_l2src because, in the + * case that the packet originated from us, on an IPMP group, + * the ira_l2src may be the link-layer address of the + * cast_ill used to send the packet, which may not be the same + * as the dst_ncec->ncec_lladdr of the address. */ if (haddr != NULL) { - if (!nce_cmp_ll_addr(dst_nce, haddr, hlen)) - goto out; /* from us -- no conflict */ + if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) + goto out; + + if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) + goto out; /* from us -- no conflict */ /* * If we're in an IPMP group, check if this is an echo @@ -2209,59 +1953,96 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } /* - * Our own (looped-back) unsolicited neighbor advertisements - * will get here with dl_mp == NULL. (These will usually be - * filtered by the `haddr' checks above, but point-to-point - * links have no hardware address and thus make it here.) - */ - if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE) - goto out; - - /* * This appears to be a real conflict.
If we're trying to * configure this NCE (ND_PROBE), then shut it down. * Otherwise, handle the discovered conflict. - * - * In the ND_PROBE case, dl_mp might be NULL if we're getting - * a unicast reply. This isn't typically done (multicast is - * the norm in response to a probe), but we can handle it. */ - if (dst_nce->nce_state == ND_PROBE) - ip_ndp_failure(ill, mp, dl_mp); - else - ip_ndp_conflict(ill, mp, dl_mp, dst_nce); + if (dst_ncec->ncec_state == ND_PROBE) { + ndp_failure(mp, ira); + } else { + if (ip_nce_conflict(mp, ira, dst_ncec)) { + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + + cmn_err(CE_WARN, + "node '%s' is using %s on %s", + inet_ntop(AF_INET6, &target, sbuf, + sizeof (sbuf)), + haddr == NULL ? "<none>" : + mac_colon_addr(haddr, hlen, hbuf, + sizeof (hbuf)), ill->ill_name); + /* + * RFC 4862, Section 5.4.4 does not mandate + * any specific behavior when an NA matches + * a non-tentative address assigned to the + * receiver. We make the choice of defending + * our address, based on the assumption that + * the sender has not detected the Duplicate. + * + * ncec_last_time_defended has been adjusted + * in ip_nce_conflict() + */ + (void) ndp_announce(dst_ncec); + } + } } else { if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) - dst_nce->nce_flags |= NCE_F_ISROUTER; + dst_ncec->ncec_flags |= NCE_F_ISROUTER; /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); + nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); } out: - NCE_REFRELE(dst_nce); + ncec_refrele(dst_ncec); } /* * Process NDP neighbor solicitation/advertisement messages. * The checksum has already checked o.k before reaching here. + * Information about the datalink header is contained in ira_l2src, but + * that should be ignored for loopback packets. 
*/ void -ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_rill; icmp6_t *icmp_nd; ip6_t *ip6h; int len; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; + ill_t *orig_ill = NULL; - + /* + * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill + * and make it be the IPMP upper so avoid being confused by a packet + * addressed to a unicast address on a different ill. + */ + if (IS_UNDER_IPMP(ill)) { + orig_ill = ill; + ill = ipmp_ill_hold_ipmp_ill(orig_ill); + if (ill == NULL) { + ill = orig_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - IPMP ill", + mp, ill); + freemsg(mp); + return; + } + ASSERT(ill != orig_ill); + orig_ill = ira->ira_ill; + ira->ira_ill = ill; + mib = ill->ill_icmp6_mib; + } if (!pullupmsg(mp, -1)) { ip1dbg(("ndp_input: pullupmsg failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); goto done; } ip6h = (ip6_t *)mp->b_rptr; if (ip6h->ip6_hops != IPV6_MAX_HOPS) { ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); + ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); goto done; } @@ -2275,6 +2056,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { ip1dbg(("ndp_input: Wrong next header 0x%x\n", ip6h->ip6_nxt)); + ip_drop_input("Wrong next header", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } @@ -2283,6 +2065,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); if (icmp_nd->icmp6_code != 0) { ip1dbg(("ndp_input: icmp6 code != 0 \n")); + ip_drop_input("code non-zero", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } @@ -2293,54 +2076,25 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) */ if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { ip1dbg(("ndp_input: packet too short\n")); + 
ip_drop_input("packet too short", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { - ndp_input_solicit(ill, mp, dl_mp); + ndp_input_solicit(mp, ira); } else { - ndp_input_advert(ill, mp, dl_mp); + ndp_input_advert(mp, ira); } done: freemsg(mp); + if (orig_ill != NULL) { + ill_refrele(ill); + ira->ira_ill = orig_ill; + } } /* - * Utility routine to send an advertisement. Assumes that the NCE cannot - * go away (e.g., because it's refheld). - */ -static boolean_t -nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target, - uint_t flags) -{ - ASSERT((flags & NDP_PROBE) == 0); - - if (nce->nce_flags & NCE_F_ISROUTER) - flags |= NDP_ISROUTER; - if (!(nce->nce_flags & NCE_F_ANYCAST)) - flags |= NDP_ORIDE; - - return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla, - &nce->nce_addr, target, flags)); -} - -/* - * Utility routine to send a solicitation. Assumes that the NCE cannot - * go away (e.g., because it's refheld). - */ -static boolean_t -nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, - uint_t flags) -{ - if (flags & NDP_PROBE) - sender = &ipv6_all_zeros; - - return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla, - sender, &nce->nce_addr, flags)); -} - -/* - * nce_xmit is called to form and transmit a ND solicitation or + * ndp_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * * If the source address is unspecified and this isn't a probe (used for @@ -2353,112 +2107,123 @@ nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, * corresponding ill's ill_wq otherwise returns B_TRUE. 
*/ static boolean_t -nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, +ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, const in6_addr_t *sender, const in6_addr_t *target, int flag) { - ill_t *hwaddr_ill; uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; - uint_t plen, maxplen; - ip6i_t *ip6i; - ipif_t *src_ipif = NULL; - uint8_t *hw_addr; + uint_t plen; zoneid_t zoneid = GLOBAL_ZONEID; - char buf[INET6_ADDRSTRLEN]; + ill_t *hwaddr_ill = ill; + ip_xmit_attr_t ixas; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t need_refrele = B_FALSE; + boolean_t probe = B_FALSE; - ASSERT(!IS_IPMP(ill)); + if (IS_UNDER_IPMP(ill)) { + probe = ipif_lookup_testaddr_v6(ill, sender, NULL); + /* + * We send non-probe packets on the upper IPMP interface. + * ip_output_simple() will use cast_ill for sending any + * multicast packets. Note that we can't follow the same + * logic for probe packets because all interfaces in the ipmp + * group may have failed, so that we really want to only try + * to send the ND packet on the ill corresponding to the src + * address. + */ + if (!probe) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill != NULL) + need_refrele = B_TRUE; + else + ill = hwaddr_ill; + } + } /* - * Check that the sender is actually a usable address on `ill', and if - * so, track that as the src_ipif. If not, for solicitations, set the - * sender to :: so that a new one will be picked below; for adverts, - * drop the packet since we expect nce_xmit_advert() to always provide - * a valid sender. + * If we have a unspecified source(sender) address, select a + * proper source address for the solicitation here itself so + * that we can initialize the h/w address correctly. + * + * If the sender is specified then we use this address in order + * to lookup the zoneid before calling ip_output_v6(). 
This is to + * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly + * by IP (we cannot guarantee that the global zone has an interface + * route to the destination). + * + * Note that the NA never comes here with the unspecified source + * address. */ - if (!IN6_IS_ADDR_UNSPECIFIED(sender)) { - if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL || - !src_ipif->ipif_addr_ready) { - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if (type == ND_NEIGHBOR_ADVERT) { - ip1dbg(("nce_xmit: No source ipif for src %s\n", - inet_ntop(AF_INET6, sender, buf, - sizeof (buf)))); - return (B_TRUE); - } - sender = &ipv6_all_zeros; - } - } /* - * If we still have an unspecified source (sender) address and this - * isn't a probe, select a source address from `ill'. + * Probes will have unspec src at this point. */ - if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { - ASSERT(type != ND_NEIGHBOR_ADVERT); + if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { + zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); /* - * Pick a source address for this solicitation, but restrict - * the selection to addresses assigned to the output - * interface. We do this because the destination will create - * a neighbor cache entry for the source address of this - * packet, so the source address needs to be a valid neighbor. + * It's possible for ipif_lookup_addr_zoneid_v6() to return + * ALL_ZONES if it cannot find a matching ipif for the address + * we are trying to use. In this case we err on the side of + * trying to send the packet by defaulting to the GLOBAL_ZONEID. 
*/ - src_ipif = ipif_select_source_v6(ill, target, B_TRUE, - IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); - if (src_ipif == NULL) { - ip1dbg(("nce_xmit: No source ipif for dst %s\n", - inet_ntop(AF_INET6, target, buf, sizeof (buf)))); - return (B_TRUE); - } - sender = &src_ipif->ipif_v6src_addr; + if (zoneid == ALL_ZONES) + zoneid = GLOBAL_ZONEID; } - /* - * We're either sending a probe or we have a source address. - */ - ASSERT((flag & NDP_PROBE) || src_ipif != NULL); - - maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8); - len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + - maxplen; + plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; + len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; mp = allocb(len, BPRI_LO); if (mp == NULL) { - if (src_ipif != NULL) - ipif_refrele(src_ipif); + if (need_refrele) + ill_refrele(ill); return (B_TRUE); } + bzero((char *)mp->b_rptr, len); mp->b_wptr = mp->b_rptr + len; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_HOPLIMIT; - if (flag & NDP_PROBE) - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM; - ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); + ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_zoneid = zoneid; + + ip6h = (ip6_t *)mp->b_rptr; ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; - ip6h->ip6_src = *sender; + ixas.ixa_multicast_ttl = ip6h->ip6_hops; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; - opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + - sizeof (nd_neighbor_advert_t)); - - if (type == ND_NEIGHBOR_SOLICIT) { + if 
(hw_addr_len != 0) { + opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + + sizeof (nd_neighbor_advert_t)); + } else { + opt = NULL; + } + if (operation == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; - if (!(flag & NDP_PROBE)) + if (opt != NULL && !(flag & NDP_PROBE)) { + /* + * Note that we don't send out SLLA for ND probes + * per RFC 4862, even though we do send out the src + * haddr for IPv4 DAD probes, even though both IPv4 + * and IPv6 go out with the unspecified/INADDR_ANY + * src IP addr. + */ opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; + } + ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ @@ -2470,7 +2235,9 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; ASSERT(!(flag & NDP_PROBE)); - opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + if (opt != NULL) + opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; @@ -2480,231 +2247,223 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; } - hw_addr = NULL; if (!(flag & NDP_PROBE)) { - /* - * Use our source address to find the hardware address to put - * in the packet, so that the hardware address and IP address - * will match up -- even if that hardware address doesn't - * match the ill we actually transmit the packet through. - */ - if (IS_IPMP(src_ipif->ipif_ill)) { - hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif); - if (hwaddr_ill == NULL) { - ip1dbg(("nce_xmit: no bound ill!\n")); - ipif_refrele(src_ipif); - freemsg(mp); - return (B_TRUE); - } - } else { - hwaddr_ill = src_ipif->ipif_ill; - ill_refhold(hwaddr_ill); /* for symmetry */ - } - - plen = roundup(sizeof (nd_opt_hdr_t) + - hwaddr_ill->ill_nd_lla_len, 8); - - hw_addr = use_nd_lla ? 
hwaddr_ill->ill_nd_lla : - hwaddr_ill->ill_phys_addr; - if (hw_addr != NULL) { + if (hw_addr != NULL && opt != NULL) { /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)(plen / 8); - bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); + opt->nd_opt_len = (uint8_t)plen; + bcopy(hw_addr, &opt[1], hw_addr_len); } - - ill_refrele(hwaddr_ill); + } + if (opt != NULL && opt->nd_opt_type == 0) { + /* If there's no link layer address option, then strip it. */ + len -= plen * 8; + mp->b_wptr = mp->b_rptr + len; + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); } - if (hw_addr == NULL) - plen = 0; - - /* Fix up the length of the packet now that plen is known */ - len -= (maxplen - plen); - mp->b_wptr = mp->b_rptr + len; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); - - icmp6->icmp6_type = type; + icmp6->icmp6_type = (uint8_t)operation; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output.c. */ icmp6->icmp6_cksum = ip6h->ip6_plen; - /* - * Before we toss the src_ipif, look up the zoneid to pass to - * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT - * packets to be routed correctly by IP (we cannot guarantee that the - * global zone has an interface route to the destination). - */ - if (src_ipif != NULL) { - if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - ipif_refrele(src_ipif); - } - - ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + if (need_refrele) + ill_refrele(ill); return (B_FALSE); } /* - * Make a link layer address (does not include the SAP) from an nce. - * To form the link layer address, use the last four bytes of ipv6 - * address passed in and the fixed offset stored in nce. + * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 
+ * The datapath uses this as an indication that there + * is a problem (as opposed to a NCE that was just + * reclaimed due to lack of memory. + * Note that static ARP entries never become unreachable. */ -static void -nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr) -{ - uchar_t *mask, *to; - ill_t *ill = nce->nce_ill; - int len; - - if (ill->ill_net_type == IRE_IF_NORESOLVER) - return; - ASSERT(nce->nce_res_mp != NULL); - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); - ASSERT(nce->nce_flags & NCE_F_MAPPING); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); - ASSERT(addr != NULL); - bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), - addrpos, ill->ill_nd_lla_len); - len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start, - IPV6_ADDR_LEN); - mask = (uchar_t *)&nce->nce_extract_mask; - mask += (IPV6_ADDR_LEN - len); - addr += (IPV6_ADDR_LEN - len); - to = addrpos + nce->nce_ll_extract_start; - while (len-- > 0) - *to++ |= *mask++ & *addr++; -} - -mblk_t * -nce_udreq_alloc(ill_t *ill) +void +nce_make_unreachable(ncec_t *ncec) { - mblk_t *template_mp = NULL; - dl_unitdata_req_t *dlur; - int sap_length; - - ASSERT(ill->ill_isv6); - - sap_length = ill->ill_sap_length; - template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) + - ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ); - if (template_mp == NULL) - return (NULL); - - dlur = (dl_unitdata_req_t *)template_mp->b_rptr; - dlur->dl_priority.dl_min = 0; - dlur->dl_priority.dl_max = 0; - dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len; - dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); - - /* Copy in the SAP value. */ - NCE_LL_SAP_COPY(ill, template_mp); - - return (template_mp); + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_UNREACHABLE; + mutex_exit(&ncec->ncec_lock); } /* - * NDP retransmit timer. + * NCE retransmit timer. Common to IPv4 and IPv6. * This timer goes off when: - * a. It is time to retransmit NS for resolver. + * a. 
It is time to retransmit a resolution for resolver. * b. It is time to send reachability probes. */ void -ndp_timer(void *arg) +nce_timer(void *arg) { - nce_t *nce = arg; - ill_t *ill = nce->nce_ill; + ncec_t *ncec = arg; + ill_t *ill = ncec->ncec_ill, *src_ill; char addrbuf[INET6_ADDRSTRLEN]; boolean_t dropped = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ncec->ncec_ipst; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + in_addr_t sender4 = INADDR_ANY; + in6_addr_t sender6 = ipv6_all_zeros; /* - * The timer has to be cancelled by ndp_delete before doing the final + * The timer has to be cancelled by ncec_delete before doing the final * refrele. So the NCE is guaranteed to exist when the timer runs * until it clears the timeout_id. Before clearing the timeout_id - * bump up the refcnt so that we can continue to use the nce + * bump up the refcnt so that we can continue to use the ncec */ - ASSERT(nce != NULL); - - mutex_enter(&nce->nce_lock); - NCE_REFHOLD_LOCKED(nce); - nce->nce_timeout_id = 0; + ASSERT(ncec != NULL); + mutex_enter(&ncec->ncec_lock); + ncec_refhold_locked(ncec); + ncec->ncec_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + + src_ill = nce_resolve_src(ncec, &sender6); + /* if we could not find a sender address, return */ + if (src_ill == NULL) { + if (!isv6) { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); + ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, + &sender4, addrbuf, sizeof (addrbuf)))); + } else { + ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + } + nce_restart_timer(ncec, ill->ill_reachable_retrans_time); + ncec_refrele(ncec); + return; + } + if (!isv6) + IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); + mutex_enter(&ncec->ncec_lock); /* - * Check the reachability state first. + * Check the reachability state. 
*/ - switch (nce->nce_state) { + switch (ncec->ncec_state) { case ND_DELAY: - nce->nce_state = ND_PROBE; - mutex_exit(&nce->nce_lock); - (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros, - NDP_UNICAST); + ASSERT(ncec->ncec_lladdr != NULL); + ncec->ncec_state = ND_PROBE; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + if (isv6) { + mutex_exit(&ncec->ncec_lock); + (void) ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, + src_ill->ill_phys_addr, + src_ill->ill_phys_addr_length, + &sender6, &ncec->ncec_addr, + NDP_UNICAST); + } else { + (void) arp_request(ncec, sender4, src_ill); + mutex_exit(&ncec->ncec_lock); + } if (ip_debug > 3) { /* ip2dbg */ - pr_addr_dbg("ndp_timer: state for %s changed " - "to PROBE\n", AF_INET6, &nce->nce_addr); + pr_addr_dbg("nce_timer: state for %s changed " + "to PROBE\n", AF_INET6, &ncec->ncec_addr); } - NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); - NCE_REFRELE(nce); - return; + nce_restart_timer(ncec, ill->ill_reachable_retrans_time); + break; case ND_PROBE: /* must be retransmit timer */ - nce->nce_pcnt--; - ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && - nce->nce_pcnt >= -1); - if (nce->nce_pcnt > 0) { + ASSERT(ncec->ncec_pcnt >= -1); + if (ncec->ncec_pcnt > 0) { /* - * As per RFC2461, the nce gets deleted after + * As per RFC2461, the ncec gets deleted after * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. * Note that the first unicast solicitation is sent * during the DELAY state. */ - ip2dbg(("ndp_timer: pcount=%x dst %s\n", - nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, - addrbuf, sizeof (addrbuf)))); - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, - &ipv6_all_zeros, - (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE : - NDP_UNICAST); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + ip2dbg(("nce_timer: pcount=%x dst %s\n", + ncec->ncec_pcnt, + inet_ntop((isv6? 
AF_INET6 : AF_INET), + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + if (NCE_PUBLISH(ncec)) { + mutex_exit(&ncec->ncec_lock); + /* + * send out a probe; note that src_ill + * is ignored by nce_dad() for all + * DAD message types other than IPv6 + * unicast probes + */ + nce_dad(ncec, src_ill, B_TRUE); + } else { + ASSERT(src_ill != NULL); + ncec->ncec_pcnt--; + if (isv6) { + mutex_exit(&ncec->ncec_lock); + (void) ndp_xmit(src_ill, + ND_NEIGHBOR_SOLICIT, + src_ill->ill_phys_addr, + src_ill->ill_phys_addr_length, + &sender6, &ncec->ncec_addr, + NDP_UNICAST); + } else { + /* + * since the nce is REACHABLE, + * the ARP request will be sent out + * as a link-layer unicast. + */ + (void) arp_request(ncec, sender4, + src_ill); + mutex_exit(&ncec->ncec_lock); + } + nce_restart_timer(ncec, + ill->ill_reachable_retrans_time); } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); - } else if (nce->nce_pcnt < 0) { - /* No hope, delete the nce */ - nce->nce_state = ND_UNREACHABLE; - mutex_exit(&nce->nce_lock); + } else if (ncec->ncec_pcnt < 0) { + /* No hope, delete the ncec */ + /* Tell datapath it went bad */ + ncec->ncec_state = ND_UNREACHABLE; + mutex_exit(&ncec->ncec_lock); if (ip_debug > 2) { /* ip1dbg */ - pr_addr_dbg("ndp_timer: Delete IRE for" - " dst %s\n", AF_INET6, &nce->nce_addr); + pr_addr_dbg("nce_timer: Delete NCE for" + " dst %s\n", (isv6? AF_INET6: AF_INET), + &ncec->ncec_addr); } - ndp_delete(nce); - } else if (!(nce->nce_flags & NCE_F_PERMANENT)) { - /* Wait RetransTimer, before deleting the entry */ - ip2dbg(("ndp_timer: pcount=%x dst %s\n", - nce->nce_pcnt, inet_ntop(AF_INET6, - &nce->nce_addr, addrbuf, sizeof (addrbuf)))); - mutex_exit(&nce->nce_lock); + /* if static ARP can't delete. */ + if ((ncec->ncec_flags & NCE_F_STATIC) == 0) + ncec_delete(ncec); + + } else if (!NCE_PUBLISH(ncec)) { + /* + * Probe count is 0 for a dynamic entry (one that we + * ourselves are not publishing). 
We should never get + * here if NONUD was requested, hence the ASSERT below. + */ + ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); + ip2dbg(("nce_timer: pcount=%x dst %s\n", + ncec->ncec_pcnt, inet_ntop(AF_INET6, + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + ncec->ncec_pcnt--; + mutex_exit(&ncec->ncec_lock); /* Wait one interval before killing */ - NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); + nce_restart_timer(ncec, + ill->ill_reachable_retrans_time); } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { ipif_t *ipif; + ipaddr_t ncec_addr; /* * We're done probing, and we can now declare this * address to be usable. Let IP know that it's ok to * use. */ - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); - ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr, - nce->nce_ill); + ncec->ncec_state = ND_REACHABLE; + ncec->ncec_flags &= ~NCE_F_UNVERIFIED; + mutex_exit(&ncec->ncec_lock); + if (isv6) { + ipif = ipif_lookup_addr_exact_v6( + &ncec->ncec_addr, ill, ipst); + } else { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, + ncec_addr); + ipif = ipif_lookup_addr_exact(ncec_addr, ill, + ipst); + } if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; @@ -2725,17 +2484,28 @@ ndp_timer(void *arg) ipif->ipif_addr_ready = 1; ipif_refrele(ipif); } + if (!isv6 && arp_no_defense) + break; /* Begin defending our new address */ - nce->nce_unsolicit_count = 0; - dropped = nce_xmit_advert(nce, B_FALSE, - &ipv6_all_hosts_mcast, 0); - if (dropped) { - nce->nce_unsolicit_count = 1; - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_unsolicit_interval); - } else if (ipst->ips_ip_ndp_defense_interval != 0) { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_defense_interval); + if (ncec->ncec_unsolicit_count > 0) { + ncec->ncec_unsolicit_count--; + if (isv6) { + dropped = ndp_announce(ncec); + } else { + dropped = arp_announce(ncec); + } + + if (dropped) + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = + ddi_get_lbolt(); + } + if 
(ncec->ncec_unsolicit_count > 0) { + nce_restart_timer(ncec, + ANNOUNCE_INTERVAL(isv6)); + } else if (DEFENSE_INTERVAL(isv6) != 0) { + nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); } } else { /* @@ -2744,76 +2514,93 @@ ndp_timer(void *arg) * doing anything, but switch to reachable state so * that the restart will work. */ - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); + ncec->ncec_state = ND_REACHABLE; + mutex_exit(&ncec->ncec_lock); } - NCE_REFRELE(nce); - return; + break; case ND_INCOMPLETE: { - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *mp, *datamp, *nextmp, **prevmpp; + mblk_t *mp, *nextmp; + mblk_t **prevmpp; /* - * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp - * for any IPMP probe packets, and toss 'em. IPMP probe - * packets will always be at the head of nce_qd_mp and always - * have an ip6i_t header, so we can stop at the first queued - * ND packet without an ip6i_t. + * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp + * for any IPMP probe packets, and toss them. IPMP probe + * packets will always be at the head of ncec_qd_mp, so that + * we can stop at the first queued ND packet that is + * not a probe packet. */ - prevmpp = &nce->nce_qd_mp; - for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) { + prevmpp = &ncec->ncec_qd_mp; + for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { nextmp = mp->b_next; - datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp; - ip6h = (ip6_t *)datamp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_RAW) - break; - ip6i = (ip6i_t *)ip6h; - if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) { + if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { inet_freemsg(mp); + ncec->ncec_nprobes--; *prevmpp = nextmp; } else { prevmpp = &mp->b_next; } } - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + + /* + * Must be resolver's retransmit timer. 
+ */ + mutex_exit(&ncec->ncec_lock); + ip_ndp_resolve(ncec); break; } case ND_REACHABLE: - if (((nce->nce_flags & NCE_F_UNSOL_ADV) && - nce->nce_unsolicit_count != 0) || - ((nce->nce_flags & NCE_F_PERMANENT) && - ipst->ips_ip_ndp_defense_interval != 0)) { - if (nce->nce_unsolicit_count > 0) - nce->nce_unsolicit_count--; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_advert(nce, B_FALSE, - &ipv6_all_hosts_mcast, 0); + if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && + ncec->ncec_unsolicit_count != 0) || + (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { + if (ncec->ncec_unsolicit_count > 0) { + ncec->ncec_unsolicit_count--; + mutex_exit(&ncec->ncec_lock); + /* + * When we get to zero announcements left, + * switch to address defense + */ + } else { + boolean_t rate_limit; + + mutex_exit(&ncec->ncec_lock); + rate_limit = ill_defend_rate_limit(ill, ncec); + if (rate_limit) { + nce_restart_timer(ncec, + DEFENSE_INTERVAL(isv6)); + break; + } + } + if (isv6) { + dropped = ndp_announce(ncec); + } else { + dropped = arp_announce(ncec); + } + mutex_enter(&ncec->ncec_lock); if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_unsolicit_count++; - mutex_exit(&nce->nce_lock); + ncec->ncec_unsolicit_count++; + } else { + ncec->ncec_last_time_defended = + ddi_get_lbolt(); } - if (nce->nce_unsolicit_count != 0) { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_unsolicit_interval); + mutex_exit(&ncec->ncec_lock); + if (ncec->ncec_unsolicit_count != 0) { + nce_restart_timer(ncec, + ANNOUNCE_INTERVAL(isv6)); } else { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_defense_interval); + nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); } } else { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } - NCE_REFRELE(nce); break; default: - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); break; } +done: + ncec_refrele(ncec); + ill_refrele(src_ill); } /* @@ -2821,31 +2608,21 @@ ndp_timer(void *arg) * Copy SAP from ill. 
*/ static void -nce_set_ll(nce_t *nce, uchar_t *ll_addr) +nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) { - ill_t *ill = nce->nce_ill; - uchar_t *woffset; + ill_t *ill = ncec->ncec_ill; ASSERT(ll_addr != NULL); - /* Always called before fast_path_probe */ - ASSERT(nce->nce_fp_mp == NULL); - if (ill->ill_sap_length != 0) { - /* - * Copy the SAP type specified in the - * request into the xmit template. - */ - NCE_LL_SAP_COPY(ill, nce->nce_res_mp); - } if (ill->ill_phys_addr_length > 0) { /* * The bcopy() below used to be called for the physical address * length rather than the link layer address length. For * ethernet and many other media, the phys_addr and lla are * identical. - * However, with xresolv interfaces being introduced, the - * phys_addr and lla are no longer the same, and the physical - * address may not have any useful meaning, so we use the lla - * for IPv6 address resolution and destination addressing. + * + * The phys_addr and lla may not be the same for devices that + * support DL_IPV6_LINK_LAYER_ADDR, though there are currently + * no known instances of these. * * For PPP or other interfaces with a zero length * physical address, don't do anything here. @@ -2854,22 +2631,18 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr) * Using the lla for them would change the way they operate. * Doing nothing in such cases preserves expected behavior. 
*/ - woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); - bcopy(ll_addr, woffset, ill->ill_nd_lla_len); + bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); } } -static boolean_t -nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) +boolean_t +nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, + uint32_t ll_addr_len) { - ill_t *ill = nce->nce_ill; - uchar_t *ll_offset; - - ASSERT(nce->nce_res_mp != NULL); + ASSERT(ncec->ncec_lladdr != NULL); if (ll_addr == NULL) return (B_FALSE); - ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); - if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0) + if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) return (B_TRUE); return (B_FALSE); } @@ -2878,15 +2651,16 @@ nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) * Updates the link layer address or the reachability state of * a cache entry. Reset probe counter if needed. */ -static void -nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) +void +nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) { - ill_t *ill = nce->nce_ill; + ill_t *ill = ncec->ncec_ill; boolean_t need_stop_timer = B_FALSE; boolean_t need_fastpath_update = B_FALSE; + nce_t *nce = NULL; + timeout_id_t tid; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - ASSERT(nce->nce_ipversion == IPV6_VERSION); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); /* * If this interface does not do NUD, there is no point * in allowing an update to the cache entry. Although @@ -2896,184 +2670,251 @@ nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) * Non-Resolvers will always be created as REACHABLE. 
*/ if (new_state != ND_UNCHANGED) { - if ((nce->nce_flags & NCE_F_NONUD) && - (nce->nce_state != ND_INCOMPLETE)) + if ((ncec->ncec_flags & NCE_F_NONUD) && + (ncec->ncec_state != ND_INCOMPLETE)) return; ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); need_stop_timer = B_TRUE; if (new_state == ND_REACHABLE) - nce->nce_last = TICK_TO_MSEC(lbolt64); + ncec->ncec_last = TICK_TO_MSEC(lbolt64); else { /* We force NUD in this case */ - nce->nce_last = 0; + ncec->ncec_last = 0; } - nce->nce_state = new_state; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; + ncec->ncec_state = new_state; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || + new_state == ND_INCOMPLETE); + } + if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; } /* - * In case of fast path we need to free the the fastpath - * M_DATA and do another probe. Otherwise we can just + * Re-trigger fastpath probe and * overwrite the DL_UNITDATA_REQ data, noting we'll lose * whatever packets that happens to be transmitting at the time. 
*/ if (new_ll_addr != NULL) { - ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) + - ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr); - bcopy(new_ll_addr, nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len); - if (nce->nce_fp_mp != NULL) { - freemsg(nce->nce_fp_mp); - nce->nce_fp_mp = NULL; - } + bcopy(new_ll_addr, ncec->ncec_lladdr, + ill->ill_phys_addr_length); need_fastpath_update = B_TRUE; } - mutex_exit(&nce->nce_lock); - if (need_stop_timer) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { + if (tid != 0) + (void) untimeout(tid); } - if (need_fastpath_update) - nce_fastpath(nce); - mutex_enter(&nce->nce_lock); + if (need_fastpath_update) { + /* + * Delete any existing existing dlur_mp and fp_mp information. + * For IPMP interfaces, all underlying ill's must be checked + * and purged. + */ + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); + /* + * add the new dlur_mp and fp_mp + */ + nce = nce_fastpath(ncec, B_TRUE, NULL); + if (nce != NULL) + nce_refrele(nce); + } + mutex_enter(&ncec->ncec_lock); } -void -nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) +static void +nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; mblk_t **mpp, *tmp; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { - if (++count > nce->nce_ill->ill_max_buf) { - tmp = nce->nce_qd_mp->b_next; - nce->nce_qd_mp->b_next = NULL; - nce->nce_qd_mp->b_prev = NULL; - freemsg(nce->nce_qd_mp); - nce->nce_qd_mp = tmp; + for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { + if (++count > ncec->ncec_ill->ill_max_buf) { + tmp = ncec->ncec_qd_mp->b_next; + ncec->ncec_qd_mp->b_next = NULL; + /* + * if we never create data addrs on the under_ill + * does this matter? 
+ */ + BUMP_MIB(ncec->ncec_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, + ncec->ncec_ill); + freemsg(ncec->ncec_qd_mp); + ncec->ncec_qd_mp = tmp; } } if (head_insert) { - mp->b_next = nce->nce_qd_mp; - nce->nce_qd_mp = mp; + ncec->ncec_nprobes++; + mp->b_next = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = mp; } else { *mpp = mp; } } -static void -nce_queue_mp(nce_t *nce, mblk_t *mp) +/* + * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be + * queued at the head or tail of the queue based on the input argument + * 'head_insert'. The caller should specify this argument as B_TRUE if this + * packet is an IPMP probe packet, in which case the following happens: + * + * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal + * (non-ipmp_probe) load-speading case where the source address of the ND + * packet is not tied to ncec_ill. If the ill bound to the source address + * cannot receive, the response to the ND packet will not be received. + * However, if ND packets for ncec_ill's probes are queued behind that ND + * packet, those probes will also fail to be sent, and thus in.mpathd will + * erroneously conclude that ncec_ill has also failed. + * + * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on + * the first attempt. This ensures that ND problems do not manifest as + * probe RTT spikes. + * + * We achieve this by inserting ipmp_probe() packets at the head of the + * nce_queue. + * + * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, + * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 
+ */ +void +nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) { - boolean_t head_insert = B_FALSE; - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *data_mp; + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + nce_queue_mp_common(ncec, mp, head_insert); +} - ASSERT(MUTEX_HELD(&nce->nce_lock)); +/* + * Called when address resolution failed due to a timeout. + * Send an ICMP unreachable in response to all queued packets. + */ +void +ndp_resolv_failed(ncec_t *ncec) +{ + mblk_t *mp, *nxt_mp; + char buf[INET6_ADDRSTRLEN]; + ill_t *ill = ncec->ncec_ill; + ip_recv_attr_t iras; - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - ip6h = (ip6_t *)data_mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because the message - * could be from the nce_qd_mp which could have b_next/b_prev - * non-NULL. - */ - ip6i = (ip6i_t *)ip6h; - ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); + bzero(&iras, sizeof (iras)); + iras.ira_flags = 0; + /* + * we are setting the ira_rill to the ipmp_ill (instead of + * the actual ill on which the packet was received), but this + * is ok because we don't actually need the real ira_rill. + * to send the icmp unreachable to the sender. + */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + + ip1dbg(("ndp_resolv_failed: dst %s\n", + inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); + while (mp != NULL) { + nxt_mp = mp->b_next; + mp->b_next = NULL; - /* - * If this packet is marked IP6I_IPMP_PROBE, then we need to: - * - * 1. Insert it at the head of the nce_qd_mp list. 
Consider - * the normal (non-probe) load-speading case where the - * source address of the ND packet is not tied to nce_ill. - * If the ill bound to the source address cannot receive, - * the response to the ND packet will not be received. - * However, if ND packets for nce_ill's probes are queued - * behind that ND packet, those probes will also fail to - * be sent, and thus in.mpathd will erroneously conclude - * that nce_ill has also failed. - * - * 2. Drop the probe packet in ndp_timer() if the ND did - * not succeed on the first attempt. This ensures that - * ND problems do not manifest as probe RTT spikes. - */ - if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) - head_insert = B_TRUE; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - address unreachable", + mp, ill); + icmp_unreachable_v6(mp, + ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); + mp = nxt_mp; } - nce_queue_mp_common(nce, mp, head_insert); + ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ } /* - * Called when address resolution failed due to a timeout. - * Send an ICMP unreachable in response to all queued packets. + * Handle the completion of NDP and ARP resolution. 
*/ void -nce_resolv_failed(nce_t *nce) +nce_resolv_ok(ncec_t *ncec) { - mblk_t *mp, *nxt_mp, *first_mp; - char buf[INET6_ADDRSTRLEN]; - ip6_t *ip6h; - zoneid_t zoneid = GLOBAL_ZONEID; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + mblk_t *mp; + uint_t pkt_len; + iaflags_t ixaflags = IXAF_NO_TRACE; + nce_t *nce; + ill_t *ill = ncec->ncec_ill; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + ip_stack_t *ipst = ill->ill_ipst; + + if (IS_IPMP(ncec->ncec_ill)) { + nce_resolv_ipmp_ok(ncec); + return; + } + /* non IPMP case */ + + mutex_enter(&ncec->ncec_lock); + ASSERT(ncec->ncec_nprobes == 0); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + mutex_exit(&ncec->ncec_lock); - ip1dbg(("nce_resolv_failed: dst %s\n", - inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf)))); - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); while (mp != NULL) { + mblk_t *nxt_mp; + + if (ill->ill_isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixaflags |= IXAF_IS_IPV4; + pkt_len = ntohs(ipha->ipha_length); + } nxt_mp = mp->b_next; mp->b_next = NULL; - mp->b_prev = NULL; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - mp = mp->b_cont; - mp->b_next = NULL; - mp->b_prev = NULL; - } - - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i; + /* + * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no + * longer available, but it's ok to drop this flag because TCP + * has its own flow-control in effect, so TCP packets + * are not likely to get here when flow-control is in effect. 
+ */ + mutex_enter(&ill->ill_lock); + nce = nce_lookup(ill, &ncec->ncec_addr); + mutex_exit(&ill->ill_lock); + + if (nce == NULL) { + if (isv6) { + BUMP_MIB(&ipst->ips_ip6_mib, + ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip_mib, + ipIfStatsOutDiscards); + } + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, NULL); + freemsg(mp); + } else { /* - * This message should have been pulled up already - * in ip_wput_v6. ip_hdr_complete_v6 assumes that - * the header is pulled up. + * We don't know the zoneid, but + * ip_xmit does not care since IXAF_NO_TRACE + * is set. (We traced the packet the first + * time through ip_xmit.) */ - ip6i = (ip6i_t *)ip6h; - ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); - mp->b_rptr += sizeof (ip6i_t); + (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, + ALL_ZONES, 0, NULL); + nce_refrele(nce); } - /* - * Ignore failure since icmp_unreachable_v6 will silently - * drop packets with an unspecified source address. - */ - (void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst); - icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp, - ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst); mp = nxt_mp; } - nce_cb_dispatch(nce); + + ncec_cb_dispatch(ncec); /* complete callbacks */ } /* - * Called by SIOCSNDP* ioctl to add/change an nce entry + * Called by SIOCSNDP* ioctl to add/change an ncec entry * and the corresponding attributes. * Disallow states other than ND_REACHABLE or ND_STALE. 
*/ @@ -3082,31 +2923,28 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) { sin6_t *sin6; in6_addr_t *addr; + ncec_t *ncec; nce_t *nce; - int err; + int err = 0; uint16_t new_flags = 0; uint16_t old_flags = 0; int inflags = lnr->lnr_flags; ip_stack_t *ipst = ill->ill_ipst; + boolean_t do_postprocess = B_FALSE; ASSERT(ill->ill_isv6); if ((lnr->lnr_state_create != ND_REACHABLE) && (lnr->lnr_state_create != ND_STALE)) return (EINVAL); - if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN) - return (EINVAL); - sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - /* We know it can not be mapping so just look in the hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ - nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce); + ASSERT(!IS_UNDER_IPMP(ill)); + nce = nce_lookup_addr(ill, addr); if (nce != NULL) - new_flags = nce->nce_flags; + new_flags = nce->nce_common->ncec_flags; switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { case NDF_ISROUTER_ON: @@ -3118,7 +2956,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) - NCE_REFRELE(nce); + nce_refrele(nce); return (EINVAL); } @@ -3132,17 +2970,15 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) - NCE_REFRELE(nce); + nce_refrele(nce); return (EINVAL); } if (nce == NULL) { - err = ndp_add_v6(ill, + err = nce_add_v6(ill, (uchar_t *)lnr->lnr_hdw_addr, + ill->ill_phys_addr_length, addr, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, new_flags, lnr->lnr_state_create, &nce); @@ -3150,269 +2986,354 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); return (err); + } else { + do_postprocess = B_TRUE; } } - old_flags = nce->nce_flags; + 
ncec = nce->nce_common; + old_flags = ncec->ncec_flags; if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { - /* - * Router turned to host, delete all ires. - * XXX Just delete the entry, but we need to add too. - */ - nce->nce_flags &= ~NCE_F_ISROUTER; + ncec_router_to_host(ncec); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - ndp_delete(nce); - NCE_REFRELE(nce); + if (do_postprocess) + err = nce_add_v6_postprocess(nce); + nce_refrele(nce); return (0); } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - mutex_enter(&nce->nce_lock); - nce->nce_flags = new_flags; - mutex_exit(&nce->nce_lock); + if (do_postprocess) + err = nce_add_v6_postprocess(nce); + /* + * err cannot be anything other than 0 because we don't support + * proxy arp of static addresses. + */ + ASSERT(err == 0); + + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags = new_flags; + mutex_exit(&ncec->ncec_lock); /* * Note that we ignore the state at this point, which * should be either STALE or REACHABLE. Instead we let * the link layer address passed in to determine the state * much like incoming packets. */ - nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); - NCE_REFRELE(nce); + nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); + nce_refrele(nce); return (0); } /* - * If the device driver supports it, we make nce_fp_mp to have - * an M_DATA prepend. Otherwise nce_fp_mp will be null. - * The caller ensures there is hold on nce for this function. - * Note that since ill_fastpath_probe() copies the mblk there is - * no need for the hold beyond this function. + * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up + * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must + * be held to ensure that they are in the same group. 
*/ -void -nce_fastpath(nce_t *nce) +static nce_t * +nce_fastpath_create(ill_t *ill, ncec_t *ncec) { - ill_t *ill = nce->nce_ill; - int res; - ASSERT(ill != NULL); - ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE); + nce_t *nce; - if (nce->nce_fp_mp != NULL) { - /* Already contains fastpath info */ - return; - } - if (nce->nce_res_mp != NULL) { - nce_fastpath_list_add(nce); - res = ill_fastpath_probe(ill, nce->nce_res_mp); - /* - * EAGAIN is an indication of a transient error - * i.e. allocation failure etc. leave the nce in the list it - * will be updated when another probe happens for another ire - * if not it will be taken out of the list when the ire is - * deleted. - */ + nce = nce_ill_lookup_then_add(ill, ncec); + + if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) + return (nce); - if (res != 0 && res != EAGAIN) - nce_fastpath_list_delete(nce); + /* + * hold the ncec_lock to synchronize with nce_update() so that, + * at the end of this function, the contents of nce_dlur_mp are + * consistent with ncec->ncec_lladdr, even though some intermediate + * packet may have been sent out with a mangled address, which would + * only be a transient condition. + */ + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); + } else { + nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, + ill->ill_sap_length); } + mutex_exit(&ncec->ncec_lock); + return (nce); } /* - * Drain the list of nce's waiting for fastpath response. + * we make nce_fp_mp to have an M_DATA prepend. + * The caller ensures there is hold on ncec for this function. + * Note that since ill_fastpath_probe() copies the mblk there is + * no need to hold the nce or ncec beyond this function. 
+ * + * If the caller has passed in a non-null ncec_nce to nce_faspath() that + * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill + * and will be returned back by this function, so that no extra nce_refrele + * is required for the caller. The calls from nce_add_common() use this + * method. All other callers (that pass in NULL ncec_nce) will have to do a + * nce_refrele of the returned nce (when it is non-null). */ -void -nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *), - void *arg) +nce_t * +nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) { + nce_t *nce; + ill_t *ill = ncec->ncec_ill; - nce_t *next_nce; - nce_t *current_nce; - nce_t *first_nce; - nce_t *prev_nce = NULL; + ASSERT(ill != NULL); + + if (IS_IPMP(ill) && trigger_fp_req) { + trigger_fp_req = B_FALSE; + ipmp_ncec_fastpath(ncec, ill); - mutex_enter(&ill->ill_lock); - first_nce = current_nce = (nce_t *)ill->ill_fastpath_list; - while (current_nce != (nce_t *)&ill->ill_fastpath_list) { - next_nce = current_nce->nce_fastpath; - /* - * Take it off the list if we're flushing, or if the callback - * routine tells us to do so. Otherwise, leave the nce in the - * fastpath list to handle any pending response from the lower - * layer. We can't drain the list when the callback routine - * comparison failed, because the response is asynchronous in - * nature, and may not arrive in the same order as the list - * insertion. - */ - if (func == NULL || func(current_nce, arg)) { - current_nce->nce_fastpath = NULL; - if (current_nce == first_nce) - ill->ill_fastpath_list = first_nce = next_nce; - else - prev_nce->nce_fastpath = next_nce; - } else { - /* previous element that is still in the list */ - prev_nce = current_nce; - } - current_nce = next_nce; } - mutex_exit(&ill->ill_lock); + /* + * If the caller already has the nce corresponding to the ill, use + * that one. Otherwise we have to lookup/add the nce. 
Calls from + * nce_add_common() fall in the former category, and have just done + * the nce lookup/add that can be reused. + */ + if (ncec_nce == NULL) + nce = nce_fastpath_create(ill, ncec); + else + nce = ncec_nce; + + if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) + return (nce); + + if (trigger_fp_req) + nce_fastpath_trigger(nce); + return (nce); } /* - * Add nce to the nce fastpath list. + * Trigger fastpath on nce. No locks may be held. */ -void -nce_fastpath_list_add(nce_t *nce) +static void +nce_fastpath_trigger(nce_t *nce) { - ill_t *ill; + int res; + ill_t *ill = nce->nce_ill; + ncec_t *ncec = nce->nce_common; - ill = nce->nce_ill; + res = ill_fastpath_probe(ill, nce->nce_dlur_mp); + /* + * EAGAIN is an indication of a transient error + * i.e. allocation failure etc. leave the ncec in the list it + * will be updated when another probe happens for another ire + * if not it will be taken out of the list when the ire is + * deleted. + */ + if (res != 0 && res != EAGAIN && res != ENOTSUP) + nce_fastpath_list_delete(ill, ncec, NULL); +} - mutex_enter(&ill->ill_lock); - mutex_enter(&nce->nce_lock); +/* + * Add ncec to the nce fastpath list on ill. + */ +static nce_t * +nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce = NULL; + ASSERT(MUTEX_HELD(&ill->ill_lock)); /* - * if nce has not been deleted and + * Atomically ensure that the ill is not CONDEMNED and is not going + * down, before adding the NCE. + */ + if (ill->ill_state_flags & ILL_CONDEMNED) + return (NULL); + mutex_enter(&ncec->ncec_lock); + /* + * if ncec has not been deleted and * is not already in the list add it. 
*/ - if (!(nce->nce_flags & NCE_F_CONDEMNED) && - (nce->nce_fastpath == NULL)) { - nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list; - ill->ill_fastpath_list = nce; + if (!NCE_ISCONDEMNED(ncec)) { + nce = nce_lookup(ill, &ncec->ncec_addr); + if (nce != NULL) + goto done; + nce = nce_add(ill, ncec); } +done: + mutex_exit(&ncec->ncec_lock); + return (nce); +} - mutex_exit(&nce->nce_lock); +nce_t * +nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce; + + mutex_enter(&ill->ill_lock); + nce = nce_ill_lookup_then_add_locked(ill, ncec); mutex_exit(&ill->ill_lock); + return (nce); } + /* - * remove nce from the nce fastpath list. + * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted + * nce is added to the 'dead' list, and the caller must nce_refrele() the + * entry after all locks have been dropped. */ void -nce_fastpath_list_delete(nce_t *nce) +nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) { - nce_t *nce_ptr; - - ill_t *ill; + nce_t *nce; - ill = nce->nce_ill; ASSERT(ill != NULL); - mutex_enter(&ill->ill_lock); - if (nce->nce_fastpath == NULL) - goto done; - - ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list); + /* first clean out any nce pointers in the under_ills */ + if (IS_IPMP(ill)) + ipmp_ncec_flush_nce(ncec); - if (ill->ill_fastpath_list == nce) { - ill->ill_fastpath_list = nce->nce_fastpath; - } else { - nce_ptr = ill->ill_fastpath_list; - while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) { - if (nce_ptr->nce_fastpath == nce) { - nce_ptr->nce_fastpath = nce->nce_fastpath; - break; - } - nce_ptr = nce_ptr->nce_fastpath; + /* now the ill itself */ + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; + nce = list_next(&ill->ill_nce, nce)) { + if (nce->nce_common == ncec) { + nce_refhold(nce); + nce_delete(nce); + break; } } - - nce->nce_fastpath = NULL; -done: mutex_exit(&ill->ill_lock); + if (nce != NULL) { + if (dead == NULL) + nce_refrele(nce); + else + 
list_insert_tail(dead, nce); + } } /* - * Update all NCE's that are not in fastpath mode and - * have an nce_fp_mp that matches mp. mp->b_cont contains - * the fastpath header. - * - * Returns TRUE if entry should be dequeued, or FALSE otherwise. + * when the fastpath response does not fit in the datab + * associated with the existing nce_fp_mp, we delete and + * add the nce to retrigger fastpath based on the information + * in the ncec_t. */ -boolean_t -ndp_fastpath_update(nce_t *nce, void *arg) +static nce_t * +nce_delete_then_add(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + nce_t *newnce = NULL; + + ip0dbg(("nce_delete_then_add nce %p ill %s\n", + (void *)nce, ill->ill_name)); + mutex_enter(&ill->ill_lock); + mutex_enter(&nce->nce_common->ncec_lock); + nce_delete(nce); + /* + * Make sure that ncec is not condemned before adding. We hold the + * ill_lock and ncec_lock to synchronize with ncec_delete() and + * ipmp_ncec_flush_nce() + */ + if (!NCE_ISCONDEMNED(nce->nce_common)) + newnce = nce_add(ill, nce->nce_common); + mutex_exit(&nce->nce_common->ncec_lock); + mutex_exit(&ill->ill_lock); + nce_refrele(nce); + return (newnce); /* could be null if nomem */ +} + +typedef struct nce_fp_match_s { + nce_t *nce_fp_match_res; + mblk_t *nce_fp_match_ack_mp; +} nce_fp_match_t; + +/* ARGSUSED */ +static int +nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) { - mblk_t *mp, *fp_mp; + nce_fp_match_t *nce_fp_marg = arg; + ncec_t *ncec = nce->nce_common; + mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; uchar_t *mp_rptr, *ud_mp_rptr; - mblk_t *ud_mp = nce->nce_res_mp; + mblk_t *ud_mp = nce->nce_dlur_mp; ptrdiff_t cmplen; - if (nce->nce_flags & NCE_F_MAPPING) - return (B_TRUE); - if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL)) - return (B_TRUE); - - ip2dbg(("ndp_fastpath_update: trying\n")); - mp = (mblk_t *)arg; + /* + * mp is the mp associated with the fastpath ack. + * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t + * under consideration. 
If the contents match, then the + * fastpath ack is used to update the nce. + */ + if (ud_mp == NULL) + return (0); /* MH_WALK_CONTINUE */ mp_rptr = mp->b_rptr; cmplen = mp->b_wptr - mp_rptr; ASSERT(cmplen >= 0); + ud_mp_rptr = ud_mp->b_rptr; /* - * The nce is locked here to prevent any other threads - * from accessing and changing nce_res_mp when the IPv6 address - * becomes resolved to an lla while we're in the middle - * of looking at and comparing the hardware address (lla). - * It is also locked to prevent multiple threads in nce_fastpath_update - * from examining nce_res_mp atthe same time. + * The ncec is locked here to prevent any other threads from accessing + * and changing nce_dlur_mp when the address becomes resolved to an + * lla while we're in the middle of looking at and comparing the + * hardware address (lla). It is also locked to prevent multiple + * threads in nce_fastpath() from examining nce_dlur_mp at the same + * time. */ - mutex_enter(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); if (ud_mp->b_wptr - ud_mp_rptr != cmplen || - bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) { - mutex_exit(&nce->nce_lock); - /* - * Don't take the ire off the fastpath list yet, - * since the response may come later. - */ - return (B_FALSE); - } - /* Matched - install mp as the fastpath mp */ - ip1dbg(("ndp_fastpath_update: match\n")); - fp_mp = dupb(mp->b_cont); - if (fp_mp != NULL) { - nce->nce_fp_mp = fp_mp; + bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { + nce_fp_marg->nce_fp_match_res = nce; + mutex_exit(&ncec->ncec_lock); + nce_refhold(nce); + return (1); /* MH_WALK_TERMINATE */ } - mutex_exit(&nce->nce_lock); - return (B_TRUE); + mutex_exit(&ncec->ncec_lock); + return (0); /* MH_WALK_CONTINUE */ } /* - * This function handles the DL_NOTE_FASTPATH_FLUSH notification from - * driver. Note that it assumes IP is exclusive... + * Update all NCE's that are not in fastpath mode and + * have an nce_fp_mp that matches mp. 
mp->b_cont contains + * the fastpath header. + * + * Returns TRUE if entry should be dequeued, or FALSE otherwise. */ -/* ARGSUSED */ void -ndp_fastpath_flush(nce_t *nce, char *arg) +nce_fastpath_update(ill_t *ill, mblk_t *mp) { - if (nce->nce_flags & NCE_F_MAPPING) - return; - /* No fastpath info? */ - if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL) + nce_fp_match_t nce_fp_marg; + nce_t *nce; + mblk_t *nce_fp_mp, *fp_mp; + + nce_fp_marg.nce_fp_match_res = NULL; + nce_fp_marg.nce_fp_match_ack_mp = mp; + + nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); + + if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) return; - if (nce->nce_ipversion == IPV4_VERSION && - nce->nce_flags & NCE_F_BCAST) { - /* - * IPv4 BROADCAST entries: - * We can't delete the nce since it is difficult to - * recreate these without going through the - * ipif down/up dance. - * - * All access to nce->nce_fp_mp in the case of these - * is protected by nce_lock. - */ - mutex_enter(&nce->nce_lock); - if (nce->nce_fp_mp != NULL) { - freeb(nce->nce_fp_mp); - nce->nce_fp_mp = NULL; - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - } else { + mutex_enter(&nce->nce_lock); + nce_fp_mp = nce->nce_fp_mp; + + if (nce_fp_mp != NULL) { + fp_mp = mp->b_cont; + if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > + nce_fp_mp->b_datap->db_lim) { mutex_exit(&nce->nce_lock); + nce = nce_delete_then_add(nce); + if (nce == NULL) { + return; + } + mutex_enter(&nce->nce_lock); + nce_fp_mp = nce->nce_fp_mp; } + } + + /* Matched - install mp as the fastpath mp */ + if (nce_fp_mp == NULL) { + fp_mp = dupb(mp->b_cont); + nce->nce_fp_mp = fp_mp; } else { - /* Just delete the NCE... */ - ndp_delete(nce); + fp_mp = mp->b_cont; + bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); + nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr + + MBLKL(fp_mp); } + mutex_exit(&nce->nce_lock); + nce_refrele(nce); } /* @@ -3451,74 +3372,103 @@ ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) } /* - * ndp_walk function. 
+ * ncec_walk function. * Free a fraction of the NCE cache entries. - * A fraction of zero means to not free any in that category. + * + * A possible optimization here would be to use ncec_last where possible, and + * delete the least-frequently used entry, which would require more complex + * computation as we walk through the ncec's (e.g., track ncec entries by + * order of ncec_last and/or maintain state) */ -void -ndp_cache_reclaim(nce_t *nce, char *arg) +static void +ncec_cache_reclaim(ncec_t *ncec, char *arg) { - nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg; - uint_t rand; + ip_stack_t *ipst = ncec->ncec_ipst; + uint_t fraction = *(uint_t *)arg; + uint_t rand; - if (nce->nce_flags & NCE_F_PERMANENT) + if ((ncec->ncec_flags & + (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { return; + } rand = (uint_t)lbolt + - NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE); - if (ncr->ncr_host != 0 && - (rand/ncr->ncr_host)*ncr->ncr_host == rand) { - ndp_delete(nce); - return; + NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); + if ((rand/fraction)*fraction == rand) { + IP_STAT(ipst, ip_nce_reclaim_deleted); + ncec_delete(ncec); } } /* - * ndp_walk function. - * Count the number of NCEs that can be deleted. - * These would be hosts but not routers. + * kmem_cache callback to free up memory. + * + * For now we just delete a fixed fraction. */ -void -ndp_cache_count(nce_t *nce, char *arg) +static void +ip_nce_reclaim_stack(ip_stack_t *ipst) { - ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg; + uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; - if (nce->nce_flags & NCE_F_PERMANENT) - return; + IP_STAT(ipst, ip_nce_reclaim_calls); - ncc->ncc_total++; - if (!(nce->nce_flags & NCE_F_ISROUTER)) - ncc->ncc_host++; + ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); + + /* + * Walk all CONNs that can have a reference on an ire, ncec or dce. + * Get them to update any stale references to drop any refholds they + * have. 
+ */ + ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); +} + +/* + * Called by the memory allocator subsystem directly, when the system + * is running low on memory. + */ +/* ARGSUSED */ +void +ip_nce_reclaim(void *args) +{ + netstack_handle_t nh; + netstack_t *ns; + + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ip_nce_reclaim_stack(ns->netstack_ip); + netstack_rele(ns); + } + netstack_next_fini(&nh); } #ifdef DEBUG void -nce_trace_ref(nce_t *nce) +ncec_trace_ref(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (nce->nce_trace_disable) + if (ncec->ncec_trace_disable) return; - if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) { - nce->nce_trace_disable = B_TRUE; - nce_trace_cleanup(nce); + if (!th_trace_ref(ncec, ncec->ncec_ipst)) { + ncec->ncec_trace_disable = B_TRUE; + ncec_trace_cleanup(ncec); } } void -nce_untrace_ref(nce_t *nce) +ncec_untrace_ref(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (!nce->nce_trace_disable) - th_trace_unref(nce); + if (!ncec->ncec_trace_disable) + th_trace_unref(ncec); } static void -nce_trace_cleanup(const nce_t *nce) +ncec_trace_cleanup(const ncec_t *ncec) { - th_trace_cleanup(nce, nce->nce_trace_disable); + th_trace_cleanup(ncec, ncec->ncec_trace_disable); } #endif @@ -3527,64 +3477,159 @@ nce_trace_cleanup(const nce_t *nce) * Send an ICMP unreachable in response to all queued packets. 
*/ void -arp_resolv_failed(nce_t *nce) +arp_resolv_failed(ncec_t *ncec) { - mblk_t *mp, *nxt_mp, *first_mp; + mblk_t *mp, *nxt_mp; char buf[INET6_ADDRSTRLEN]; - zoneid_t zoneid = GLOBAL_ZONEID; struct in_addr ipv4addr; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + ill_t *ill = ncec->ncec_ill; + ip_stack_t *ipst = ncec->ncec_ipst; + ip_recv_attr_t iras; - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr); + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + /* + * we are setting the ira_rill to the ipmp_ill (instead of + * the actual ill on which the packet was received), but this + * is ok because we don't actually need the real ira_rill. + * to send the icmp unreachable to the sender. + */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); ip3dbg(("arp_resolv_failed: dst %s\n", inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); while (mp != NULL) { nxt_mp = mp->b_next; mp->b_next = NULL; - mp->b_prev = NULL; - first_mp = mp; - /* - * Send icmp unreachable messages - * to the hosts. 
- */ - (void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst); - ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n")); - icmp_unreachable(nce->nce_ill->ill_wq, first_mp, - ICMP_HOST_UNREACHABLE, zoneid, ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - address unreachable", + mp, ill); + if (ipst->ips_ip_arp_icmp_error) { + ip3dbg(("arp_resolv_failed: " + "Calling icmp_unreachable\n")); + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); + } else { + freemsg(mp); + } + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); mp = nxt_mp; } + ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ } +/* + * if ill is an under_ill, translate it to the ipmp_ill and add the + * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and + * one on the underlying in_ill) will be created for the + * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. + */ int -ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, - nce_t **newnce, nce_t *src_nce) +nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { int err; - nce_t *nce; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce, *upper_nce = NULL; + ill_t *in_ill = ill, *under = NULL; + boolean_t need_ill_refrele = B_FALSE; + + if (flags & NCE_F_MCAST) { + /* + * hw_addr will be figured out in nce_set_multicast_v4; + * caller needs to pass in the cast_ill for ipmp + */ + ASSERT(hw_addr == NULL); + ASSERT(!IS_IPMP(ill)); + err = nce_set_multicast_v4(ill, addr, flags, newnce); + return (err); + } + + if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ENXIO); + need_ill_refrele = B_TRUE; + } + if ((flags & NCE_F_BCAST) != 0) { + /* + * IPv4 broadcast ncec: compute the hwaddr. 
+ */ + if (IS_IPMP(ill)) { + under = ipmp_ill_get_xmit_ill(ill, B_FALSE); + if (under == NULL) { + if (need_ill_refrele) + ill_refrele(ill); + return (ENETDOWN); + } + hw_addr = under->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(under); + hw_addr_len = under->ill_phys_addr_length; + } else { + hw_addr = ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill), + hw_addr_len = ill->ill_phys_addr_length; + } + } mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - /* - * NOTE: IPv4 never matches across the illgrp since the NCE's we're - * looking up have fastpath headers that are inherently per-ill. - */ - nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); + nce = nce_lookup_addr(ill, &addr6); if (nce == NULL) { - err = ndp_add_v4(ill, addr, flags, newnce, src_nce); + err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, + state, &nce); } else { - *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + if (err == 0) + err = nce_add_v4_postprocess(nce); + + if (in_ill != ill && nce != NULL) { + nce_t *under_nce; + + /* + * in_ill was the under_ill. Try to create the under_nce. + * Hold the ill_g_lock to prevent changes to group membership + * until we are done. 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(in_ill, ill)) { + under_nce = nce_fastpath_create(in_ill, + nce->nce_common); + upper_nce = nce; + if ((nce = under_nce) == NULL) + err = EINVAL; + } + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) + nce_fastpath_trigger(under_nce); + } + if (nce != NULL) { + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + } + + if (under != NULL) + ill_refrele(under); + + if (upper_nce != NULL) + nce_refrele(upper_nce); + + if (need_ill_refrele) + ill_refrele(ill); + return (err); } @@ -3592,102 +3637,860 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, * NDP Cache Entry creation routine for IPv4. * Mapped entries are handled in arp. * This routine must always be called with ndp4->ndp_g_lock held. - * Prior to return, nce_refcnt is incremented. + * Prior to return, ncec_refcnt is incremented. + * + * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses + * are always added pointing at the ipmp_ill. Thus, when the ill passed + * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t + * entries will be created, both pointing at the same ncec_t. The nce_t + * entries will have their nce_ill set to the ipmp_ill and the under_ill + * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. + * Local addresses are always created on the ill passed to nce_add_v4. 
*/ -static int -ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, - nce_t **newnce, nce_t *src_nce) +int +nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - static nce_t nce_nil; - nce_t *nce; - mblk_t *mp; - mblk_t *template = NULL; - nce_t **ncep; - ip_stack_t *ipst = ill->ill_ipst; - uint16_t state = ND_INITIAL; int err; + boolean_t is_multicast = (flags & NCE_F_MCAST); + struct in6_addr addr6; + nce_t *nce; - ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock)); + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); ASSERT(!ill->ill_isv6); - ASSERT((flags & NCE_F_MAPPING) == 0); + ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); + + IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); + err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, + &nce); + ASSERT(newnce != NULL); + *newnce = nce; + return (err); +} + +/* + * Post-processing routine to be executed after nce_add_v4(). This function + * triggers fastpath (if appropriate) and DAD on the newly added nce entry + * and must be called without any locks held. + * + * Always returns 0, but we return an int to keep this symmetric with the + * IPv6 counter-part. + */ +int +nce_add_v4_postprocess(nce_t *nce) +{ + ncec_t *ncec = nce->nce_common; + uint16_t flags = ncec->ncec_flags; + boolean_t ndp_need_dad = B_FALSE; + boolean_t dropped; + clock_t delay; + ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; + uchar_t *hw_addr = ncec->ncec_lladdr; + boolean_t trigger_fastpath = B_TRUE; - if (ill->ill_resolver_mp == NULL) - return (EINVAL); /* - * Allocate the mblk to hold the nce. + * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then + * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
+ * We call nce_fastpath from nce_update if the link layer address of + * the peer changes from nce_update */ - mp = allocb(sizeof (nce_t), BPRI_MED); - if (mp == NULL) - return (ENOMEM); + if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && + ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) + trigger_fastpath = B_FALSE; - nce = (nce_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&nce[1]; - *nce = nce_nil; - nce->nce_ill = ill; - nce->nce_ipversion = IPV4_VERSION; - nce->nce_flags = flags; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - nce->nce_rcnt = ill->ill_xmit_count; - IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr); - nce->nce_mask = ipv6_all_ones; - nce->nce_extract_mask = ipv6_all_zeros; - nce->nce_ll_extract_start = 0; - nce->nce_qd_mp = NULL; - nce->nce_mp = mp; - /* This one is for nce getting created */ - nce->nce_refcnt = 1; - mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); - ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); + if (trigger_fastpath) + nce_fastpath_trigger(nce); + + if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { + /* + * Either the caller (by passing in ND_PROBE) + * or nce_add_common() (by the internally computed state + * based on ncec_addr and ill_net_type) has determined + * that this unicast entry needs DAD. Trigger DAD. + */ + ndp_need_dad = B_TRUE; + } else if (flags & NCE_F_UNSOL_ADV) { + /* + * We account for the transmit below by assigning one + * less than the ndd variable. Subsequent decrements + * are done in nce_timer. 
+ */ + mutex_enter(&ncec->ncec_lock); + ncec->ncec_unsolicit_count = + ipst->ips_ip_arp_publish_count - 1; + mutex_exit(&ncec->ncec_lock); + dropped = arp_announce(ncec); + mutex_enter(&ncec->ncec_lock); + if (dropped) + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = ddi_get_lbolt(); + if (ncec->ncec_unsolicit_count != 0) { + nce_start_timer(ncec, + ipst->ips_ip_arp_publish_interval); + } + mutex_exit(&ncec->ncec_lock); + } - nce->nce_trace_disable = B_FALSE; + /* + * If ncec_xmit_interval is 0, user has configured us to send the first + * probe right away. Do so, and set up for the subsequent probes. + */ + if (ndp_need_dad) { + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_pcnt == 0) { + /* + * DAD probes and announce can be + * administratively disabled by setting the + * probe_count to zero. Restart the timer in + * this case to mark the ipif as ready. + */ + ncec->ncec_unsolicit_count = 0; + mutex_exit(&ncec->ncec_lock); + nce_restart_timer(ncec, 0); + } else { + mutex_exit(&ncec->ncec_lock); + delay = ((ncec->ncec_flags & NCE_F_FAST) ? + ipst->ips_arp_probe_delay : + ipst->ips_arp_fastprobe_delay); + nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); + } + } + return (0); +} - if (src_nce != NULL) { +/* + * ncec_walk routine to update all entries that have a given destination or + * gateway address and cached link layer (MAC) address. This is used when ARP + * informs us that a network-to-link-layer mapping may have changed. 
+ */ +void +nce_update_hw_changed(ncec_t *ncec, void *arg) +{ + nce_hw_map_t *hwm = arg; + ipaddr_t ncec_addr; + + if (ncec->ncec_state != ND_REACHABLE) + return; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + if (ncec_addr != hwm->hwm_addr) + return; + + mutex_enter(&ncec->ncec_lock); + if (hwm->hwm_flags != 0) + ncec->ncec_flags = hwm->hwm_flags; + nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); + mutex_exit(&ncec->ncec_lock); +} + +void +ncec_refhold(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + (ncec)->ncec_refcnt++; + ASSERT((ncec)->ncec_refcnt != 0); +#ifdef DEBUG + ncec_trace_ref(ncec); +#endif + mutex_exit(&(ncec)->ncec_lock); +} + +void +ncec_refhold_notr(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + (ncec)->ncec_refcnt++; + ASSERT((ncec)->ncec_refcnt != 0); + mutex_exit(&(ncec)->ncec_lock); +} + +static void +ncec_refhold_locked(ncec_t *ncec) +{ + ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); + (ncec)->ncec_refcnt++; +#ifdef DEBUG + ncec_trace_ref(ncec); +#endif +} + +/* ncec_inactive destroys the mutex thus no mutex_exit is needed */ +void +ncec_refrele(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); +#ifdef DEBUG + ncec_untrace_ref(ncec); +#endif + ASSERT((ncec)->ncec_refcnt != 0); + if (--(ncec)->ncec_refcnt == 0) { + ncec_inactive(ncec); + } else { + mutex_exit(&(ncec)->ncec_lock); + } +} + +void +ncec_refrele_notr(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + ASSERT((ncec)->ncec_refcnt != 0); + if (--(ncec)->ncec_refcnt == 0) { + ncec_inactive(ncec); + } else { + mutex_exit(&(ncec)->ncec_lock); + } +} + +/* + * Common to IPv4 and IPv6. 
+ */ +void +nce_restart_timer(ncec_t *ncec, uint_t ms) +{ + timeout_id_t tid; + + ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); + + /* First cancel any running timer */ + mutex_enter(&ncec->ncec_lock); + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; + if (tid != 0) { + mutex_exit(&ncec->ncec_lock); + (void) untimeout(tid); + mutex_enter(&ncec->ncec_lock); + } + + /* Restart timer */ + nce_start_timer(ncec, ms); + mutex_exit(&ncec->ncec_lock); +} + +static void +nce_start_timer(ncec_t *ncec, uint_t ms) +{ + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + /* + * Don't start the timer if the ncec has been deleted, or if the timer + * is already running + */ + if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { + ncec->ncec_timeout_id = timeout(nce_timer, ncec, + MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); + } +} + +int +nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, + uint16_t flags, nce_t **newnce) +{ + uchar_t *hw_addr; + int err = 0; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t dst6; + nce_t *nce; + + ASSERT(!ill->ill_isv6); + + IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + goto done; + } + if (ill->ill_net_type == IRE_IF_RESOLVER) { + /* + * For IRE_IF_RESOLVER a hardware mapping can be + * generated, for IRE_IF_NORESOLVER, resolution cookie + * in the ill is copied in nce_add_v4(). + */ + hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); + if (hw_addr == NULL) { + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + return (ENOMEM); + } + ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); + } else { /* - * src_nce has been provided by the caller. 
The only - * caller who provides a non-null, non-broadcast - * src_nce is from ip_newroute() which must pass in - * a ND_REACHABLE src_nce (this condition is verified - * via an ASSERT for the save_ire->ire_nce in ip_newroute()) + * IRE_IF_NORESOLVER type simply copies the resolution + * cookie passed in. So no hw_addr is needed. */ - mutex_enter(&src_nce->nce_lock); - state = src_nce->nce_state; - if ((src_nce->nce_flags & NCE_F_CONDEMNED) || - (ipst->ips_ndp4->ndp_g_hw_change > 0)) { + hw_addr = NULL; + } + ASSERT(flags & NCE_F_MCAST); + ASSERT(flags & NCE_F_NONUD); + /* nce_state will be computed by nce_add_common() */ + err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, + ND_UNCHANGED, &nce); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + if (err == 0) + err = nce_add_v4_postprocess(nce); + if (hw_addr != NULL) + kmem_free(hw_addr, ill->ill_phys_addr_length); + if (err != 0) { + ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); + return (err); + } +done: + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + return (0); +} + +/* + * This is used when scanning for "old" (least recently broadcast) NCEs. We + * don't want to have to walk the list for every single one, so we gather up + * batches at a time. + */ +#define NCE_RESCHED_LIST_LEN 8 + +typedef struct { + ill_t *ncert_ill; + uint_t ncert_num; + ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; +} nce_resched_t; + +/* + * Pick the longest waiting NCEs for defense. + */ +/* ARGSUSED */ +static int +ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) +{ + nce_resched_t *ncert = arg; + ncec_t **ncecs; + ncec_t **ncec_max; + ncec_t *ncec_temp; + ncec_t *ncec = nce->nce_common; + + ASSERT(ncec->ncec_ill == ncert->ncert_ill); + /* + * Only reachable entries that are ready for announcement are eligible. 
+ */ + if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) + return (0); + if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { + ncec_refhold(ncec); + ncert->ncert_nces[ncert->ncert_num++] = ncec; + } else { + ncecs = ncert->ncert_nces; + ncec_max = ncecs + NCE_RESCHED_LIST_LEN; + ncec_refhold(ncec); + for (; ncecs < ncec_max; ncecs++) { + ASSERT(ncec != NULL); + if ((*ncecs)->ncec_last_time_defended > + ncec->ncec_last_time_defended) { + ncec_temp = *ncecs; + *ncecs = ncec; + ncec = ncec_temp; + } + } + ncec_refrele(ncec); + } + return (0); +} + +/* + * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this + * doesn't happen very often (if at all), and thus it needn't be highly + * optimized. (Note, though, that it's actually O(N) complexity, because the + * outer loop is bounded by a constant rather than by the length of the list.) + */ +static void +nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) +{ + ncec_t *ncec; + ip_stack_t *ipst = ill->ill_ipst; + uint_t i, defend_rate; + + i = ill->ill_defend_count; + ill->ill_defend_count = 0; + if (ill->ill_isv6) + defend_rate = ipst->ips_ndp_defend_rate; + else + defend_rate = ipst->ips_arp_defend_rate; + /* If none could be sitting around, then don't reschedule */ + if (i < defend_rate) { + DTRACE_PROBE1(reschedule_none, ill_t *, ill); + return; + } + ncert->ncert_ill = ill; + while (ill->ill_defend_count < defend_rate) { + nce_walk_common(ill, ncec_reschedule, ncert); + for (i = 0; i < ncert->ncert_num; i++) { + + ncec = ncert->ncert_nces[i]; + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags |= NCE_F_DELAYED; + mutex_exit(&ncec->ncec_lock); /* - * src_nce has been deleted, or - * ip_arp_news is in the middle of - * flushing entries in the the nce. - * Fail the add, since we don't know - * if it is safe to copy the contents of - * src_nce + * we plan to schedule this ncec, so incr the + * defend_count in anticipation. 
*/ - DTRACE_PROBE2(nce__bad__src__nce, - nce_t *, src_nce, ill_t *, ill); - mutex_exit(&src_nce->nce_lock); - err = EINVAL; - goto err_ret; + if (++ill->ill_defend_count >= defend_rate) + break; } - template = copyb(src_nce->nce_res_mp); - mutex_exit(&src_nce->nce_lock); - if (template == NULL) { - err = ENOMEM; - goto err_ret; + if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) + break; + } +} + +/* + * Check if the current rate-limiting parameters permit the sending + * of another address defense announcement for both IPv4 and IPv6. + * Returns B_TRUE if rate-limiting is in effect (i.e., send is not + * permitted), and B_FALSE otherwise. The `defend_rate' parameter + * determines how many address defense announcements are permitted + * in any `defense_perio' interval. + */ +static boolean_t +ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) +{ + clock_t now = ddi_get_lbolt(); + ip_stack_t *ipst = ill->ill_ipst; + clock_t start = ill->ill_defend_start; + uint32_t elapsed, defend_period, defend_rate; + nce_resched_t ncert; + boolean_t ret; + int i; + + if (ill->ill_isv6) { + defend_period = ipst->ips_ndp_defend_period; + defend_rate = ipst->ips_ndp_defend_rate; + } else { + defend_period = ipst->ips_arp_defend_period; + defend_rate = ipst->ips_arp_defend_rate; + } + if (defend_rate == 0) + return (B_TRUE); + bzero(&ncert, sizeof (ncert)); + mutex_enter(&ill->ill_lock); + if (start > 0) { + elapsed = now - start; + if (elapsed > SEC_TO_TICK(defend_period)) { + ill->ill_defend_start = now; + /* + * nce_ill_reschedule will attempt to + * prevent starvation by reschduling the + * oldest entries, which are marked with + * the NCE_F_DELAYED flag. + */ + nce_ill_reschedule(ill, &ncert); + } + } else { + ill->ill_defend_start = now; + } + ASSERT(ill->ill_defend_count <= defend_rate); + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_flags & NCE_F_DELAYED) { + /* + * This ncec was rescheduled as one of the really old + * entries needing on-going defense. 
The + * ill_defend_count was already incremented in + * nce_ill_reschedule. Go ahead and send the announce. + */ + ncec->ncec_flags &= ~NCE_F_DELAYED; + mutex_exit(&ncec->ncec_lock); + ret = B_FALSE; + goto done; + } + mutex_exit(&ncec->ncec_lock); + if (ill->ill_defend_count < defend_rate) + ill->ill_defend_count++; + if (ill->ill_defend_count == defend_rate) { + /* + * we are no longer allowed to send unbidden defense + * messages. Wait for rescheduling. + */ + ret = B_TRUE; + } else { + ret = B_FALSE; + } +done: + mutex_exit(&ill->ill_lock); + /* + * After all the locks have been dropped we can restart nce timer, + * and refrele the delayed ncecs + */ + for (i = 0; i < ncert.ncert_num; i++) { + clock_t xmit_interval; + ncec_t *tmp; + + tmp = ncert.ncert_nces[i]; + xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, + B_FALSE); + nce_restart_timer(tmp, xmit_interval); + ncec_refrele(tmp); + } + return (ret); +} + +boolean_t +ndp_announce(ncec_t *ncec) +{ + return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, + ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, + nce_advert_flags(ncec))); +} + +ill_t * +nce_resolve_src(ncec_t *ncec, in6_addr_t *src) +{ + mblk_t *mp; + in6_addr_t src6; + ipaddr_t src4; + ill_t *ill = ncec->ncec_ill; + ill_t *src_ill = NULL; + ipif_t *ipif = NULL; + boolean_t is_myaddr = NCE_MYADDR(ncec); + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + + ASSERT(src != NULL); + ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); + src6 = *src; + if (is_myaddr) { + src6 = ncec->ncec_addr; + if (!isv6) + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); + } else { + /* + * try to find one from the outgoing packet. 
+ */ + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + if (mp != NULL) { + if (isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + src6 = ip6h->ip6_src; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + src4 = ipha->ipha_src; + IN6_IPADDR_TO_V4MAPPED(src4, &src6); + } + } + mutex_exit(&ncec->ncec_lock); + } + + /* + * For outgoing packets, if the src of outgoing packet is one + * of the assigned interface addresses use it, otherwise we + * will pick the source address below. + * For local addresses (is_myaddr) doing DAD, NDP announce + * messages are mcast. So we use the (IPMP) cast_ill or the + * (non-IPMP) ncec_ill for these message types. The only case + * of unicast DAD messages are for IPv6 ND probes, for which + * we find the ipif_bound_ill corresponding to the ncec_addr. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { + if (isv6) { + ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, + ill->ill_ipst); + } else { + ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, + ill->ill_ipst); + } + + /* + * If no relevant ipif can be found, then it's not one of our + * addresses. Reset to :: and try to find a src for the NS or + * ARP request using ipif_select_source_v[4,6] below. + * If an ipif can be found, but it's not yet done with + * DAD verification, and we are not being invoked for + * DAD (i.e., !is_myaddr), then just postpone this + * transmission until later. + */ + if (ipif == NULL) { + src6 = ipv6_all_zeros; + src4 = INADDR_ANY; + } else if (!ipif->ipif_addr_ready && !is_myaddr) { + DTRACE_PROBE2(nce__resolve__ipif__not__ready, + ncec_t *, ncec, ipif_t *, ipif); + ipif_refrele(ipif); + return (NULL); } - } else if (flags & NCE_F_BCAST) { + } + + if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { /* - * broadcast nce. + * Pick a source address for this solicitation, but + * restrict the selection to addresses assigned to the + * output interface. 
We do this because the destination will + * create a neighbor cache entry for the source address of + * this packet, so the source address had better be a valid + * neighbor. */ - template = copyb(ill->ill_bcast_mp); + if (isv6) { + ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, + B_FALSE, NULL); + } else { + ipaddr_t nce_addr; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); + ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, + B_FALSE, NULL); + } + if (ipif == NULL && IS_IPMP(ill)) { + ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE); + + if (send_ill != NULL) { + if (isv6) { + ipif = ipif_select_source_v6(send_ill, + &ncec->ncec_addr, B_TRUE, + IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, + B_FALSE, NULL); + } else { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, + src4); + ipif = ipif_select_source_v4(send_ill, + src4, ALL_ZONES, B_TRUE, NULL); + } + ill_refrele(send_ill); + } + } + + if (ipif == NULL) { + char buf[INET6_ADDRSTRLEN]; + + ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", + inet_ntop((isv6 ? AF_INET6 : AF_INET), + (char *)&ncec->ncec_addr, buf, sizeof (buf)))); + DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); + return (NULL); + } + src6 = ipif->ipif_v6lcl_addr; + } + *src = src6; + if (ipif != NULL) { + src_ill = ipif->ipif_ill; + if (IS_IPMP(src_ill)) + src_ill = ipmp_ipif_hold_bound_ill(ipif); + else + ill_refhold(src_ill); + ipif_refrele(ipif); + DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, + ill_t *, src_ill); + } + return (src_ill); +} + +void +ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, + uchar_t *hwaddr, int hwaddr_len, int flags) +{ + ill_t *ill; + ncec_t *ncec; + nce_t *nce; + uint16_t new_state; + + ill = (ipif ? 
ipif->ipif_ill : NULL); + if (ill != NULL) { + /* + * only one ncec is possible + */ + nce = nce_lookup_v4(ill, addr); + if (nce != NULL) { + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) + new_state = ND_UNCHANGED; + else + new_state = ND_STALE; + ncec->ncec_flags = flags; + nce_update(ncec, new_state, hwaddr); + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); + return; + } + } else { + /* + * ill is wildcard; clean up all ncec's and ire's + * that match on addr. + */ + nce_hw_map_t hwm; + + hwm.hwm_addr = *addr; + hwm.hwm_hwlen = hwaddr_len; + hwm.hwm_hwaddr = hwaddr; + hwm.hwm_flags = flags; + + ncec_walk_common(ipst->ips_ndp4, NULL, + (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); + } +} + +/* + * Common function to add ncec entries. + * we always add the ncec with ncec_ill == ill, and always create + * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the + * ncec is !reachable. + * + * When the caller passes in an nce_state of ND_UNCHANGED, + * nce_add_common() will determine the state of the created nce based + * on the ill_net_type and nce_flags used. Otherwise, the nce will + * be created with state set to the passed in nce_state. 
+ */ +static int +nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) +{ + static ncec_t nce_nil; + uchar_t *template = NULL; + int err; + ncec_t *ncec; + ncec_t **ncep; + ip_stack_t *ipst = ill->ill_ipst; + uint16_t state; + boolean_t fastprobe = B_FALSE; + struct ndp_g_s *ndp; + nce_t *nce = NULL; + mblk_t *dlur_mp = NULL; + + if (ill->ill_isv6) + ndp = ill->ill_ipst->ips_ndp6; + else + ndp = ill->ill_ipst->ips_ndp4; + + *retnce = NULL; + + ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); + + if (IN6_IS_ADDR_UNSPECIFIED(addr)) { + ip0dbg(("nce_add_common: no addr\n")); + return (EINVAL); + } + if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { + ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); + return (EINVAL); + } + + if (ill->ill_isv6) { + ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); + } else { + ipaddr_t v4addr; + + IN6_V4MAPPED_TO_IPADDR(addr, v4addr); + ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); + } + + /* + * The caller has ensured that there is no nce on ill, but there could + * still be an nce_common_t for the address, so that we find exisiting + * ncec_t strucutures first, and atomically add a new nce_t if + * one is found. The ndp_g_lock ensures that we don't cross threads + * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not + * compare for matches across the illgrp because this function is + * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, + * with the nce_lookup_then_add_v* passing in the ipmp_ill where + * appropriate. + */ + ncec = *ncep; + for (; ncec != NULL; ncec = ncec->ncec_next) { + if (ncec->ncec_ill == ill) { + if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { + *retnce = nce_ill_lookup_then_add(ill, ncec); + if (*retnce != NULL) + break; + } + } + } + if (*retnce != NULL) { + /* + * We should never find *retnce to be MYADDR, since the caller + * may then incorrectly restart a DAD timer that's already + * running. 
+ */ + ASSERT(!NCE_MYADDR(ncec)); + /* caller must trigger fastpath on nce */ + return (0); + } + ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); + if (ncec == NULL) + return (ENOMEM); + *ncec = nce_nil; + ncec->ncec_ill = ill; + ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); + ncec->ncec_flags = flags; + ncec->ncec_ipst = ipst; /* No netstack_hold */ + + if (!ill->ill_isv6) { + ipaddr_t addr4; + + /* + * DAD probe interval and probe count are set based on + * fast/slow probe settings. If the underlying link doesn't + * have reliably up/down notifications or if we're working + * with IPv4 169.254.0.0/16 Link Local Address space, then + * don't use the fast timers. Otherwise, use them. + */ + ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); + IN6_V4MAPPED_TO_IPADDR(addr, addr4); + if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) + fastprobe = B_TRUE; + if (fastprobe) { + ncec->ncec_xmit_interval = + ipst->ips_arp_fastprobe_interval; + ncec->ncec_pcnt = + ipst->ips_arp_fastprobe_count; + ncec->ncec_flags |= NCE_F_FAST; + } else { + ncec->ncec_xmit_interval = + ipst->ips_arp_probe_interval; + ncec->ncec_pcnt = + ipst->ips_arp_probe_count; + } + if (NCE_PUBLISH(ncec)) { + ncec->ncec_unsolicit_count = + ipst->ips_ip_arp_publish_count; + } + } else { + /* + * probe interval is constant: ILL_PROBE_INTERVAL + * probe count is constant: ND_MAX_UNICAST_SOLICIT + */ + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + if (NCE_PUBLISH(ncec)) { + ncec->ncec_unsolicit_count = + ipst->ips_ip_ndp_unsolicit_count; + } + } + ncec->ncec_rcnt = ill->ill_xmit_count; + ncec->ncec_addr = *addr; + ncec->ncec_qd_mp = NULL; + ncec->ncec_refcnt = 1; /* for ncec getting created */ + mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); + ncec->ncec_trace_disable = B_FALSE; + + /* + * ncec_lladdr holds link layer address + */ + if (hw_addr_len > 0) { + template = kmem_alloc(hw_addr_len, KM_NOSLEEP); if (template == NULL) { err = ENOMEM; goto err_ret; } + ncec->ncec_lladdr = 
template; + ncec->ncec_lladdr_length = hw_addr_len; + bzero(ncec->ncec_lladdr, hw_addr_len); + } + if ((flags & NCE_F_BCAST) != 0) { state = ND_REACHABLE; + ASSERT(hw_addr_len > 0); + } else if (ill->ill_net_type == IRE_IF_RESOLVER) { + state = ND_INITIAL; } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { /* * NORESOLVER entries are always created in the REACHABLE * state. */ + state = ND_REACHABLE; if (ill->ill_phys_addr_length == IP_ADDR_LEN && ill->ill_mactype != DL_IPV4 && ill->ill_mactype != DL_6TO4) { @@ -3698,32 +4501,91 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, * that do their own resolution from IP to link-layer * address (e.g. IP over X.25). */ - template = ill_dlur_gen((uchar_t *)addr, - ill->ill_phys_addr_length, - ill->ill_sap, ill->ill_sap_length); - } else { - template = copyb(ill->ill_resolver_mp); + bcopy((uchar_t *)addr, + ncec->ncec_lladdr, ill->ill_phys_addr_length); } - if (template == NULL) { - err = ENOMEM; - goto err_ret; + if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && + ill->ill_mactype != DL_IPV6) { + /* + * We create a nce_res_mp with the IP nexthop address + * as the destination address if the physical legnth + * is exactly 16 bytes for point-to-multipoint links + * that do their own resolution from IP to link-layer + * address. + */ + bcopy((uchar_t *)addr, + ncec->ncec_lladdr, ill->ill_phys_addr_length); } + /* + * Since NUD is not part of the base IPv4 protocol definition, + * IPv4 neighbor entries on NORESOLVER interfaces will never + * age, and are marked NCE_F_NONUD. + */ + if (!ill->ill_isv6) + ncec->ncec_flags |= NCE_F_NONUD; + } else if (ill->ill_net_type == IRE_LOOPBACK) { state = ND_REACHABLE; } - nce->nce_fp_mp = NULL; - nce->nce_res_mp = template; - nce->nce_state = state; + + if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { + /* + * We are adding an ncec with a deterministic hw_addr, + * so the state can only be one of {REACHABLE, STALE, PROBE}. 
+ * + * if we are adding a unicast ncec for the local address + * it would be REACHABLE; we would be adding a ND_STALE entry + * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own + * addresses are added in PROBE to trigger DAD. + */ + if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || + ill->ill_net_type == IRE_IF_NORESOLVER) + state = ND_REACHABLE; + else if (!NCE_PUBLISH(ncec)) + state = ND_STALE; + else + state = ND_PROBE; + if (hw_addr != NULL) + nce_set_ll(ncec, hw_addr); + } + /* caller overrides internally computed state */ + if (nce_state != ND_UNCHANGED) + state = nce_state; + + if (state == ND_PROBE) + ncec->ncec_flags |= NCE_F_UNVERIFIED; + + ncec->ncec_state = state; + if (state == ND_REACHABLE) { - nce->nce_last = TICK_TO_MSEC(lbolt64); - nce->nce_init_time = TICK_TO_MSEC(lbolt64); + ncec->ncec_last = TICK_TO_MSEC(lbolt64); + ncec->ncec_init_time = TICK_TO_MSEC(lbolt64); } else { - nce->nce_last = 0; + ncec->ncec_last = 0; if (state == ND_INITIAL) - nce->nce_init_time = TICK_TO_MSEC(lbolt64); + ncec->ncec_init_time = TICK_TO_MSEC(lbolt64); + } + list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), + offsetof(ncec_cb_t, ncec_cb_node)); + /* + * have all the memory allocations out of the way before taking locks + * and adding the nce. + */ + nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); + if (nce == NULL) { + err = ENOMEM; + goto err_ret; + } + if (ncec->ncec_lladdr != NULL || + ill->ill_net_type == IRE_IF_NORESOLVER) { + dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, + ill->ill_phys_addr_length, ill->ill_sap, + ill->ill_sap_length); + if (dlur_mp == NULL) { + err = ENOMEM; + goto err_ret; + } } - ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) || - (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE)); /* * Atomically ensure that the ill is not CONDEMNED, before * adding the NCE. 
@@ -3734,128 +4596,423 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, err = EINVAL; goto err_ret; } - if ((nce->nce_next = *ncep) != NULL) - nce->nce_next->nce_ptpn = &nce->nce_next; - *ncep = nce; - nce->nce_ptpn = ncep; - *newnce = nce; - /* This one is for nce being used by an active thread */ - NCE_REFHOLD(*newnce); + if (!NCE_MYADDR(ncec) && + (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { + mutex_exit(&ill->ill_lock); + DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); + err = EINVAL; + goto err_ret; + } + /* + * Acquire the ncec_lock even before adding the ncec to the list + * so that it cannot get deleted after the ncec is added, but + * before we add the nce. + */ + mutex_enter(&ncec->ncec_lock); + if ((ncec->ncec_next = *ncep) != NULL) + ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; + *ncep = ncec; + ncec->ncec_ptpn = ncep; - /* Bump up the number of nce's referencing this ill */ + /* Bump up the number of ncec's referencing this ill */ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt++; + (char *), "ncec", (void *), ncec); + ill->ill_ncec_cnt++; + /* + * Since we hold the ncec_lock at this time, the ncec cannot be + * condemned, and we can safely add the nce. + */ + *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); + mutex_exit(&ncec->ncec_lock); mutex_exit(&ill->ill_lock); - DTRACE_PROBE1(ndp__add__v4, nce_t *, nce); + + /* caller must trigger fastpath on *retnce */ return (0); + err_ret: - freeb(mp); - freemsg(template); + if (ncec != NULL) + kmem_cache_free(ncec_cache, ncec); + if (nce != NULL) + kmem_cache_free(nce_cache, nce); + freemsg(dlur_mp); + if (template != NULL) + kmem_free(template, ill->ill_phys_addr_length); return (err); } /* - * ndp_walk routine to delete all entries that have a given destination or - * gateway address and cached link layer (MAC) address. This is used when ARP - * informs us that a network-to-link-layer mapping may have changed. 
+ * take a ref on the nce */ void -nce_delete_hw_changed(nce_t *nce, void *arg) +nce_refhold(nce_t *nce) { - nce_hw_map_t *hwm = arg; - mblk_t *mp; - dl_unitdata_req_t *dlu; - uchar_t *macaddr; - ill_t *ill; - int saplen; - ipaddr_t nce_addr; + mutex_enter(&nce->nce_lock); + nce->nce_refcnt++; + ASSERT((nce)->nce_refcnt != 0); + mutex_exit(&nce->nce_lock); +} - if (nce->nce_state != ND_REACHABLE) - return; +/* + * release a ref on the nce; In general, this + * cannot be called with locks held because nce_inactive + * may result in nce_inactive which will take the ill_lock, + * do ipif_ill_refrele_tail etc. Thus the one exception + * where this can be called with locks held is when the caller + * is certain that the nce_refcnt is sufficient to prevent + * the invocation of nce_inactive. + */ +void +nce_refrele(nce_t *nce) +{ + ASSERT((nce)->nce_refcnt != 0); + mutex_enter(&nce->nce_lock); + if (--nce->nce_refcnt == 0) + nce_inactive(nce); /* destroys the mutex */ + else + mutex_exit(&nce->nce_lock); +} - IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); - if (nce_addr != hwm->hwm_addr) - return; +/* + * free the nce after all refs have gone away. + */ +static void +nce_inactive(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + + ASSERT(nce->nce_refcnt == 0); + + ncec_refrele_notr(nce->nce_common); + nce->nce_common = NULL; + freemsg(nce->nce_fp_mp); + freemsg(nce->nce_dlur_mp); + + mutex_enter(&ill->ill_lock); + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, + (char *), "nce", (void *), nce); + ill->ill_nce_cnt--; + nce->nce_ill = NULL; + /* + * If the number of ncec's associated with this ill have dropped + * to zero, check whether we need to restart any operation that + * is waiting for this to happen. + */ + if (ILL_DOWN_OK(ill)) { + /* ipif_ill_refrele_tail drops the ill_lock */ + ipif_ill_refrele_tail(ill); + } else { + mutex_exit(&ill->ill_lock); + } + + mutex_destroy(&nce->nce_lock); + kmem_cache_free(nce_cache, nce); +} + +/* + * Add an nce to the ill_nce list. 
+ */ +static nce_t * +nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) +{ + bzero(nce, sizeof (*nce)); + mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); + nce->nce_common = ncec; + nce->nce_addr = ncec->ncec_addr; + nce->nce_ill = ill; + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, + (char *), "nce", (void *), nce); + ill->ill_nce_cnt++; + + nce->nce_refcnt = 1; /* for the thread */ + ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ + nce->nce_dlur_mp = dlur_mp; + + /* add nce to the ill's fastpath list. */ + nce->nce_refcnt++; /* for the list */ + list_insert_head(&ill->ill_nce, nce); + return (nce); +} + +static nce_t * +nce_add(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce; + mblk_t *dlur_mp = NULL; + + ASSERT(MUTEX_HELD(&ill->ill_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + + nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); + if (nce == NULL) + return (NULL); + if (ncec->ncec_lladdr != NULL || + ill->ill_net_type == IRE_IF_NORESOLVER) { + dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, + ill->ill_phys_addr_length, ill->ill_sap, + ill->ill_sap_length); + if (dlur_mp == NULL) { + kmem_cache_free(nce_cache, nce); + return (NULL); + } + } + return (nce_add_impl(ill, ncec, nce, dlur_mp)); +} + +/* + * remove the nce from the ill_faspath list + */ +void +nce_delete(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + + ASSERT(MUTEX_HELD(&ill->ill_lock)); mutex_enter(&nce->nce_lock); - if ((mp = nce->nce_res_mp) == NULL) { + if (nce->nce_is_condemned) { + /* + * some other thread has removed this nce from the ill_nce list + */ mutex_exit(&nce->nce_lock); return; } - dlu = (dl_unitdata_req_t *)mp->b_rptr; - macaddr = (uchar_t *)(dlu + 1); - ill = nce->nce_ill; - if ((saplen = ill->ill_sap_length) > 0) - macaddr += saplen; - else - saplen = -saplen; + nce->nce_is_condemned = B_TRUE; + mutex_exit(&nce->nce_lock); + list_remove(&ill->ill_nce, nce); /* - * If the hardware address is unchanged, then leave this one alone. 
- * Note that saplen == abs(saplen) now. + * even though we are holding the ill_lock, it is ok to + * call nce_refrele here because we know that we should have + * at least 2 refs on the nce: one for the thread, and one + * for the list. The refrele below will release the one for + * the list. */ - if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen && - bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) { - mutex_exit(&nce->nce_lock); - return; + nce_refrele(nce); +} + +nce_t * +nce_lookup(ill_t *ill, const in6_addr_t *addr) +{ + nce_t *nce = NULL; + + ASSERT(ill != NULL); + ASSERT(MUTEX_HELD(&ill->ill_lock)); + + for (nce = list_head(&ill->ill_nce); nce != NULL; + nce = list_next(&ill->ill_nce, nce)) { + if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) + break; } - mutex_exit(&nce->nce_lock); - DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce); - ndp_delete(nce); + /* + * if we found the nce on the ill_nce list while holding + * the ill_lock, then it cannot be condemned yet. + */ + if (nce != NULL) { + ASSERT(!nce->nce_is_condemned); + nce_refhold(nce); + } + return (nce); } /* - * This function verifies whether a given IPv4 address is potentially known to - * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t, - * so that it can continue to look for hardware changes on that address. + * Walk the ill_nce list on ill. The callback function func() cannot perform + * any destructive actions. 
*/ -boolean_t -ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) +static void +nce_walk_common(ill_t *ill, pfi_t func, void *arg) { - nce_t *nce; - struct in_addr nceaddr; - ip_stack_t *ipst = ns->netstack_ip; + nce_t *nce = NULL, *nce_next; - if (addr == INADDR_ANY) - return (B_FALSE); + ASSERT(MUTEX_HELD(&ill->ill_lock)); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (func(ill, nce, arg) != 0) + break; + nce = nce_next; + } +} - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr); - for (; nce != NULL; nce = nce->nce_next) { - /* Note that only v4 mapped entries are in the table. */ - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); - if (addr == nceaddr.s_addr && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { - /* Single flag check; no lock needed */ - if (!(nce->nce_flags & NCE_F_CONDEMNED)) - break; +void +nce_walk(ill_t *ill, pfi_t func, void *arg) +{ + mutex_enter(&ill->ill_lock); + nce_walk_common(ill, func, arg); + mutex_exit(&ill->ill_lock); +} + +void +nce_flush(ill_t *ill, boolean_t flushall) +{ + nce_t *nce, *nce_next; + list_t dead; + + list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (!flushall && NCE_PUBLISH(nce->nce_common)) { + nce = nce_next; + continue; } + /* + * nce_delete requires that the caller should either not + * be holding locks, or should hold a ref to ensure that + * we wont hit ncec_inactive. So take a ref and clean up + * after the list is flushed. 
+ */ + nce_refhold(nce); + nce_delete(nce); + list_insert_tail(&dead, nce); + nce = nce_next; } - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - return (nce != NULL); + mutex_exit(&ill->ill_lock); + while ((nce = list_head(&dead)) != NULL) { + list_remove(&dead, nce); + nce_refrele(nce); + } + ASSERT(list_is_empty(&dead)); + list_destroy(&dead); } -/* - * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly - * with IPMP. Specifically, since neighbor discovery is always done on - * underlying interfaces (even for addresses owned by an IPMP interface), we - * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface - * associated with `ill' (if it exists). - */ -static ipif_t * -ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill) +/* Return an interval that is anywhere in the [1 .. intv] range */ +static clock_t +nce_fuzz_interval(clock_t intv, boolean_t initial_time) +{ + clock_t rnd, frac; + + (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); + /* Note that clock_t is signed; must chop off bits */ + rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; + if (initial_time) { + if (intv <= 0) + intv = 1; + else + intv = (rnd % intv) + 1; + } else { + /* Compute 'frac' as 20% of the configured interval */ + if ((frac = intv / 5) <= 1) + frac = 2; + /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ + if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) + intv = 1; + } + return (intv); +} + +void +nce_resolv_ipmp_ok(ncec_t *ncec) { - ipif_t *ipif; + mblk_t *mp; + uint_t pkt_len; + iaflags_t ixaflags = IXAF_NO_TRACE; + nce_t *under_nce; + ill_t *ill = ncec->ncec_ill; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + ipif_t *src_ipif = NULL; ip_stack_t *ipst = ill->ill_ipst; + ill_t *send_ill; + uint_t nprobes; - ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); - if (ipif == NULL && IS_UNDER_IPMP(ill)) { - if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { - ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); - ill_refrele(ill); + ASSERT(IS_IPMP(ill)); + + mutex_enter(&ncec->ncec_lock); + nprobes = ncec->ncec_nprobes; + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); + + while (mp != NULL) { + mblk_t *nxt_mp; + + nxt_mp = mp->b_next; + mp->b_next = NULL; + if (isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, + ill, ALL_ZONES, ipst); + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixaflags |= IXAF_IS_IPV4; + pkt_len = ntohs(ipha->ipha_length); + src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, + ill, ALL_ZONES, ipst); + } + + /* + * find a new nce based on an under_ill. The first IPMP probe + * packet gets queued, so we could still find a src_ipif that + * matches an IPMP test address. + */ + if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { + /* + * if src_ipif is null, this could be either a + * forwarded packet or a probe whose src got deleted. + * We identify the former case by looking for the + * ncec_nprobes: the first ncec_nprobes packets are + * probes; + */ + if (src_ipif == NULL && nprobes > 0) + goto drop_pkt; + + /* + * For forwarded packets, we use the ipmp rotor + * to find send_ill. 
+ */ + send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, + B_TRUE); + } else { + send_ill = src_ipif->ipif_ill; + ill_refhold(send_ill); + } + + DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, + (ncec_t *), ncec, (ipif_t *), + src_ipif, (ill_t *), send_ill); + + if (send_ill == NULL) { + if (src_ipif != NULL) + ipif_refrele(src_ipif); + goto drop_pkt; } + /* create an under_nce on send_ill */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) + under_nce = nce_fastpath_create(send_ill, ncec); + else + under_nce = NULL; + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(ncec)) + nce_fastpath_trigger(under_nce); + + ill_refrele(send_ill); + if (src_ipif != NULL) + ipif_refrele(src_ipif); + + if (under_nce != NULL) { + (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, + ALL_ZONES, 0, NULL); + nce_refrele(under_nce); + if (nprobes > 0) + nprobes--; + mp = nxt_mp; + continue; + } +drop_pkt: + if (isv6) { + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + } + ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); + freemsg(mp); + if (nprobes > 0) + nprobes--; + mp = nxt_mp; } - return (ipif); + ncec_cb_dispatch(ncec); /* complete callbacks */ } diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index 8b97462d13..33e791adac 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -38,6 +38,7 @@ #include <sys/cmn_err.h> #include <netinet/in.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mib2.h> #include <inet/ip.h> @@ -89,6 +90,20 @@ static phy_if_t ipv6_routeto(net_handle_t, struct sockaddr *, struct sockaddr *); static int ipv6_isvalidchecksum(net_handle_t, mblk_t *); +static int net_no_getmtu(net_handle_t, phy_if_t, lif_if_t); +static int net_no_getpmtuenabled(net_handle_t); +static lif_if_t 
net_no_lifgetnext(net_handle_t, phy_if_t, lif_if_t); +static int net_no_inject(net_handle_t, inject_t, net_inject_t *); +static phy_if_t net_no_routeto(net_handle_t, struct sockaddr *, + struct sockaddr *); +static int net_no_ispartialchecksum(net_handle_t, mblk_t *); +static int net_no_getlifaddr(net_handle_t, phy_if_t, lif_if_t, + size_t, net_ifaddr_t [], void *); +static int net_no_getlifzone(net_handle_t, phy_if_t, lif_if_t, + zoneid_t *); +static int net_no_getlifflags(net_handle_t, phy_if_t, lif_if_t, + uint64_t *); + /* Netinfo private functions */ static int ip_getifname_impl(phy_if_t, char *, const size_t, boolean_t, ip_stack_t *); @@ -111,7 +126,6 @@ static void ip_ni_queue_in_func(void *); static void ip_ni_queue_out_func(void *); static void ip_ni_queue_func_impl(injection_t *, boolean_t); - static net_protocol_t ipv4info = { NETINFO_VERSION, NHF_INET, @@ -149,6 +163,24 @@ static net_protocol_t ipv6info = { ipv6_isvalidchecksum }; +static net_protocol_t arp_netinfo = { + NETINFO_VERSION, + NHF_ARP, + ip_getifname, + net_no_getmtu, + net_no_getpmtuenabled, + net_no_getlifaddr, + net_no_getlifzone, + net_no_getlifflags, + ip_phygetnext, + ip_phylookup, + net_no_lifgetnext, + net_no_inject, + net_no_routeto, + net_no_ispartialchecksum, + ip_isvalidchecksum +}; + /* * The taskq eventq_queue_in is used to process the upside inject messages. * The taskq eventq_queue_out is used to process the downside inject messages. 
@@ -230,6 +262,9 @@ ip_net_init(ip_stack_t *ipst, netstack_t *ns) ipst->ips_ipv6_net_data = net_protocol_register(id, &ipv6info); ASSERT(ipst->ips_ipv6_net_data != NULL); + + ipst->ips_arp_net_data = net_protocol_register(id, &arp_netinfo); + ASSERT(ipst->ips_ipv6_net_data != NULL); } @@ -248,6 +283,11 @@ ip_net_destroy(ip_stack_t *ipst) if (net_protocol_unregister(ipst->ips_ipv6_net_data) == 0) ipst->ips_ipv6_net_data = NULL; } + + if (ipst->ips_arp_net_data != NULL) { + if (net_protocol_unregister(ipst->ips_arp_net_data) == 0) + ipst->ips_arp_net_data = NULL; + } } /* @@ -612,8 +652,7 @@ ip_getifname_impl(phy_if_t phy_ifdata, ASSERT(buffer != NULL); - ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, - NULL, NULL, ipst); + ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, ipst); if (ill == NULL) return (1); @@ -667,17 +706,17 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, if (ipif == NULL) return (0); - mtu = ipif->ipif_mtu; + mtu = ipif->ipif_ill->ill_mtu; ipif_refrele(ipif); if (mtu == 0) { ill_t *ill; if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, - NULL, NULL, NULL, NULL, ipst)) == NULL) { + ipst)) == NULL) { return (0); } - mtu = ill->ill_max_frag; + mtu = ill->ill_mtu; ill_refrele(ill); } @@ -760,8 +799,7 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst) phy_if_t phy; ill_t *ill; - ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL, - NULL, NULL, NULL, ipst); + ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, ipst); if (ill == NULL) return (0); @@ -813,8 +851,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(phy_ifdata, isv6, NULL, NULL, - NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(phy_ifdata, isv6, ipst); if (ill == NULL) return (0); @@ -898,14 +935,10 @@ static int ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, ip_stack_t *ipst) { - struct 
sockaddr_in6 *sin6; ddi_taskq_t *tq = NULL; void (* func)(void *); injection_t *inject; - ip6_t *ip6h; - ire_t *ire; mblk_t *mp; - zoneid_t zoneid; ASSERT(packet != NULL); ASSERT(packet->ni_packet != NULL); @@ -941,130 +974,44 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, tq = eventq_queue_out; break; - case NI_DIRECT_OUT: - /* - * Note: - * For IPv4, the code path below will be greatly simplified - * with the delivery of surya - it will become a single - * function call to X. A follow on project is aimed to - * provide similar functionality for IPv6. - */ - mp = packet->ni_packet; - zoneid = - netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid); - - if (!isv6) { - struct sockaddr *sock; - - sock = (struct sockaddr *)&packet->ni_addr; - /* - * ipfil_sendpkt was provided by surya to ease the - * problems associated with sending out a packet. - * Currently this function only supports IPv4. - */ - switch (ipfil_sendpkt(sock, mp, packet->ni_physical, - zoneid)) { - case 0 : - case EINPROGRESS: - return (0); - case ECOMM : - case ENONET : - return (1); - default : - return (1); - } - /* NOTREACHED */ - - } - - ip6h = (ip6_t *)mp->b_rptr; - sin6 = (struct sockaddr_in6 *)&packet->ni_addr; - ASSERT(sin6->sin6_family == AF_INET6); - - ire = ire_route_lookup_v6(&sin6->sin6_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + case NI_DIRECT_OUT: { + struct sockaddr *sock; - if (ire == NULL) { - ip2dbg(("ip_inject: ire_cache_lookup failed\n")); - freemsg(mp); - return (1); - } - - if (ire->ire_stq == NULL) { - /* Send to loopback destination. 
*/ - if (ire->ire_rfq == NULL) { - ip2dbg(("ip_inject: bad nexthop\n")); - ire_refrele(ire); - freemsg(mp); - return (1); - } - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, - ire->ire_ipif->ipif_ill, ipha_t *, NULL, ip6_t *, - ip6h, int, 1); - ip_wput_local_v6(ire->ire_rfq, - ire->ire_ipif->ipif_ill, ip6h, mp, ire, 0, zoneid); - ire_refrele(ire); - return (0); - } - - mp->b_queue = ire->ire_stq; - - if (ire->ire_nce == NULL || - ire->ire_nce->nce_fp_mp == NULL && - ire->ire_nce->nce_res_mp == NULL) { - ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr, - &ip6h->ip6_src, NULL, zoneid, ipst); + mp = packet->ni_packet; - ire_refrele(ire); + sock = (struct sockaddr *)&packet->ni_addr; + /* + * ipfil_sendpkt was provided by surya to ease the + * problems associated with sending out a packet. + */ + switch (ipfil_sendpkt(sock, mp, packet->ni_physical, + netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid))) { + case 0 : + case EINPROGRESS: return (0); - } else { - /* prepend L2 header for IPv6 packets. */ - mblk_t *llmp; - - /* - * Lock IREs, see 6420438 - */ - mutex_enter(&ire->ire_lock); - llmp = ire->ire_nce->nce_fp_mp ? 
- ire->ire_nce->nce_fp_mp : - ire->ire_nce->nce_res_mp; - - if ((mp = dupb(llmp)) == NULL && - (mp = copyb(llmp)) == NULL) { - ip2dbg(("ip_inject: llhdr failed\n")); - mutex_exit(&ire->ire_lock); - ire_refrele(ire); - freemsg(mp); - return (1); - } - mutex_exit(&ire->ire_lock); - linkb(mp, packet->ni_packet); + case ECOMM : + case ENONET : + return (1); + default : + return (1); } - - mp->b_queue = ire->ire_stq; - - break; + /* NOTREACHED */ + } default: freemsg(packet->ni_packet); return (1); } - if (tq) { - inject->inj_ptr = ipst; - if (ddi_taskq_dispatch(tq, func, (void *)inject, - DDI_SLEEP) == DDI_FAILURE) { - ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n")); - freemsg(packet->ni_packet); - return (1); - } - } else { - putnext(ire->ire_stq, mp); - ire_refrele(ire); - } + ASSERT(tq != NULL); + inject->inj_ptr = ipst; + if (ddi_taskq_dispatch(tq, func, (void *)inject, + DDI_SLEEP) == DDI_FAILURE) { + ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n")); + freemsg(packet->ni_packet); + return (1); + } return (0); } @@ -1121,64 +1068,57 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop, struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)address; struct sockaddr_in *next = (struct sockaddr_in *)nexthop; struct sockaddr_in *sin = (struct sockaddr_in *)address; - ire_t *sire = NULL; ire_t *ire; - ill_t *ill; + ire_t *nexthop_ire; phy_if_t phy_if; zoneid_t zoneid; zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid); if (address->sa_family == AF_INET6) { - ire = ire_route_lookup_v6(&sin6->sin6_addr, NULL, - 0, 0, NULL, &sire, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + ire = ire_route_recursive_v6(&sin6->sin6_addr, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL, + NULL, NULL); } else { - ire = ire_route_lookup(sin->sin_addr.s_addr, 0, - 0, 0, NULL, &sire, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + ire = 
ire_route_recursive_v4(sin->sin_addr.s_addr, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL, + NULL, NULL); } - - if (ire == NULL) - return (0); - + ASSERT(ire != NULL); /* * For some destinations, we have routes that are dead ends, so * return to indicate that no physical interface can be used to * reach the destination. */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) != 0) { - if (sire != NULL) - ire_refrele(sire); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); - return (0); + return (NULL); } - ill = ire_to_ill(ire); - if (ill == NULL) { - if (sire != NULL) - ire_refrele(sire); + nexthop_ire = ire_nexthop(ire); + if (nexthop_ire == NULL) { + ire_refrele(ire); + return (0); + } + if (nexthop_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(nexthop_ire); ire_refrele(ire); return (0); } + ASSERT(nexthop_ire->ire_ill != NULL); + if (nexthop != NULL) { if (address->sa_family == AF_INET6) { - next->sin_addr.s_addr = sire ? sire->ire_gateway_addr : - sin->sin_addr.s_addr; + next6->sin6_addr = nexthop_ire->ire_addr_v6; } else { - next6->sin6_addr = sire ? 
sire->ire_gateway_addr_v6 : - sin6->sin6_addr; + next->sin_addr.s_addr = nexthop_ire->ire_addr; } } - ASSERT(ill != NULL); - phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex; - if (sire != NULL) - ire_refrele(sire); + phy_if = (phy_if_t)nexthop_ire->ire_ill->ill_phyint->phyint_ifindex; ire_refrele(ire); + ire_refrele(nexthop_ire); return (phy_if); } @@ -1477,8 +1417,7 @@ ip_getlifflags_impl(sa_family_t family, phy_if_t phy_ifdata, lif_if_t ifdata, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(phy_ifdata, - (family == AF_INET6), NULL, NULL, NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(phy_ifdata, (family == AF_INET6), ipst); if (ill == NULL) return (-1); phyi = ill->ill_phyint; @@ -1538,59 +1477,43 @@ static void ip_ni_queue_func_impl(injection_t *inject, boolean_t out) { net_inject_t *packet; - conn_t *conn; ill_t *ill; ip_stack_t *ipst = (ip_stack_t *)inject->inj_ptr; + ip_xmit_attr_t ixas; ASSERT(inject != NULL); packet = &inject->inj_data; ASSERT(packet->ni_packet != NULL); - ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - kmem_free(inject, sizeof (*inject)); - return; - } - if (out == 0) { + ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, + inject->inj_isv6, ipst); + + if (ill == NULL) { + kmem_free(inject, sizeof (*inject)); + return; + } + if (inject->inj_isv6) { - ip_rput_v6(ill->ill_rq, packet->ni_packet); + ip_input_v6(ill, NULL, packet->ni_packet, NULL); } else { ip_input(ill, NULL, packet->ni_packet, NULL); } - kmem_free(inject, sizeof (*inject)); ill_refrele(ill); - return; - } - - /* - * Even though ipcl_conn_create requests that it be passed - * a different value for "TCP", in this case there may not - * be a TCP connection backing the packet and more than - * likely, non-TCP packets will go here too. 
- */ - conn = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ipst->ips_netstack); - if (conn != NULL) { + } else { + bzero(&ixas, sizeof (ixas)); + ixas.ixa_ifindex = packet->ni_physical; + ixas.ixa_ipst = ipst; if (inject->inj_isv6) { - conn->conn_af_isv6 = B_TRUE; - conn->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; - conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - ip_output_v6(conn, packet->ni_packet, ill->ill_wq, - IP_WPUT); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; } else { - conn->conn_af_isv6 = B_FALSE; - conn->conn_pkt_isv6 = B_FALSE; - conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - ip_output(conn, packet->ni_packet, ill->ill_wq, - IP_WPUT); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; } - - CONN_DEC_REF(conn); + (void) ip_output_simple(packet->ni_packet, &ixas); + ixa_cleanup(&ixas); } kmem_free(inject, sizeof (*inject)); - ill_refrele(ill); } /* @@ -1623,3 +1546,152 @@ done: kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen); kmem_free(arg, sizeof (hook_nic_event_int_t)); } + +/* + * Initialize ARP hook family and events + */ +void +arp_hook_init(ip_stack_t *ipst) +{ + HOOK_FAMILY_INIT(&ipst->ips_arproot, Hn_ARP); + if (net_family_register(ipst->ips_arp_net_data, &ipst->ips_arproot) + != 0) { + cmn_err(CE_NOTE, "arp_hook_init" + "net_family_register failed for arp"); + } + + HOOK_EVENT_INIT(&ipst->ips_arp_physical_in_event, NH_PHYSICAL_IN); + ipst->ips_arp_physical_in = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event); + if (ipst->ips_arp_physical_in == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/physical_in"); + } + + HOOK_EVENT_INIT(&ipst->ips_arp_physical_out_event, NH_PHYSICAL_OUT); + ipst->ips_arp_physical_out = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event); + if (ipst->ips_arp_physical_out == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/physical_out"); + } + + 
HOOK_EVENT_INIT(&ipst->ips_arp_nic_events, NH_NIC_EVENTS); + ipst->ips_arpnicevents = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events); + if (ipst->ips_arpnicevents == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/nic_events"); + } +} + +void +arp_hook_destroy(ip_stack_t *ipst) +{ + if (ipst->ips_arpnicevents != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events) == 0) + ipst->ips_arpnicevents = NULL; + } + + if (ipst->ips_arp_physical_out != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event) == 0) + ipst->ips_arp_physical_out = NULL; + } + + if (ipst->ips_arp_physical_in != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event) == 0) + ipst->ips_arp_physical_in = NULL; + } + + (void) net_family_unregister(ipst->ips_arp_net_data, + &ipst->ips_arproot); +} + +void +arp_hook_shutdown(ip_stack_t *ipst) +{ + if (ipst->ips_arp_physical_in != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event); + } + if (ipst->ips_arp_physical_out != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event); + } + if (ipst->ips_arpnicevents != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events); + } +} + +/* netinfo routines for the unsupported cases */ + +/* ARGSUSED */ +int +net_no_getmtu(net_handle_t handle, phy_if_t phy_ifdata, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getpmtuenabled(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static lif_if_t +net_no_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +net_no_routeto(net_handle_t neti, struct 
sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +net_no_ispartialchecksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifzone(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + zoneid_t *zoneid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifflags(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c deleted file mode 100644 index e86e59f67d..0000000000 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/types.h> -#include <sys/stream.h> -#define _SUN_TPI_VERSION 2 -#include <sys/tihdr.h> -#include <sys/socket.h> -#include <sys/xti_inet.h> - -#include <inet/common.h> -#include <netinet/ip6.h> -#include <inet/ip.h> - -#include <netinet/in.h> -#include <netinet/ip_mroute.h> -#include <inet/optcom.h> - - -extern int ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -extern int ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -extern int ip_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *dummy, cred_t *cr, mblk_t *first_mp); - -/* - * Table of all known options handled on a IP protocol stack. - * - * Note: Not all of these options are available through all protocol stacks - * For example, multicast options are not accessible in TCP over IP. - * The filtering for that happens in option table at transport level. - * Also, this table excludes any options processed exclusively at the - * transport protocol level. 
- */ -opdes_t ip_opt_arr[] = { - -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), - 0 }, - - -{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), - IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), - IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, - -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct in_addr), 0 /* INADDR_ANY */ }, - -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN), - sizeof (uchar_t), -1 /* not initialized */}, - -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN), - sizeof (uchar_t), -1 /* not initialized */ }, - -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq), -1 /* not initialized */ }, - -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq), -1 /* not initialized */ }, - -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq_source), -1 /* not initialized */ }, - -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq_source), -1 /* not initialized */ }, - -{ IP_ADD_SOURCE_MEMBERSHIP, 
IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, - -{ IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, - -{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 - }, - -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, - sizeof (int), 0 }, - -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, - sizeof (int), 0 }, - -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT), - sizeof (ipsec_req_t), -1 /* not initialized */ }, - -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, - sizeof (in_addr_t), -1 /* not initialized */ }, - -{ MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_NODEFAULT), sizeof (int), -1 /* not initialized */ }, - -{ MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_NODEFAULT), 0, -1 /* not initialized */ }, - -{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct vifctl), -1 /* not initialized */ }, - -{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (vifi_t), -1 /* not initialized */ }, - -{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct mfcctl), -1 /* not initialized */ }, - -{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct mfcctl), -1 /* not initialized */ }, - -{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, - -{ MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG, (OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, - -{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - 
(OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, - -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -{ IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, - -{ IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */}, - -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ipv6_mreq), -1 /* not initialized */ }, - -{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), - sizeof (struct ipv6_mreq), -1 /* not initialized */ }, - -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, - -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, - sizeof (int), 0 }, - -{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (struct in6_pktinfo), -1 /* not initialized */ }, -{ IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (sin6_t), -1 /* 
not initialized */ }, -{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct ip6_mtuinfo), -1 }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), -1 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -/* Enable receipt of ancillary data */ -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT), - sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, - -{ MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, 
- (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -}; - - -#define IP_OPT_ARR_CNT A_CNT(ip_opt_arr) - - -/* - * Initialize option database object for IP - * - * This object represents database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. - */ - -optdb_obj_t ip_opt_obj = { - ip_opt_default, /* IP default value function pointer */ - ip_opt_get, /* IP get function pointer */ - ip_opt_set, /* IP set function pointer */ - B_FALSE, /* IP is NOT a tpi provider */ - IP_OPT_ARR_CNT, /* IP option database count of entries */ - ip_opt_arr, /* IP option database */ - 0, /* 0 - not needed if not top tpi provider */ - (optlevel_t *)0 /* null - not needed if not top tpi provider */ -}; diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c new file mode 100644 index 0000000000..a4940fd3e8 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -0,0 +1,2554 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include 
<netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); +static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); +static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); +static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); +static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); + +/* + * There are two types of output functions for IP used for different + * purposes: + * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there + * is no context in the form of a conn_t. However, there is a + * ip_xmit_attr_t that the callers use to influence interface selection + * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. + * + * - conn_ip_output() is used when sending packets with a conn_t and + * ip_set_destination has been called to cache information. In that case + * various socket options are recorded in the ip_xmit_attr_t and should + * be taken into account. + */ + +/* + * The caller *must* have called conn_connect() or ip_attr_connect() + * before calling conn_ip_output(). The caller needs to redo that each time + * the destination IP address or port changes, as well as each time there is + * a change to any socket option that would modify how packets are routed out + * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). + * + * The ULP caller has to serialize the use of a single ip_xmit_attr_t. + * We assert for that here. 
+ */ +int +conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + iaflags_t ixaflags = ixa->ixa_flags; + ire_t *ire; + nce_t *nce; + dce_t *dce; + ill_t *ill; + ip_stack_t *ipst = ixa->ixa_ipst; + int error; + + /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ + + ASSERT(ixa->ixa_ire != NULL); + /* Note there is no ixa_nce when reject and blackhole routes */ + ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ + +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == NULL); + ixa->ixa_curthread = curthread; +#endif + + /* + * Even on labeled systems we can have a NULL ixa_tsl e.g., + * for IGMP/MLD traffic. + */ + + ire = ixa->ixa_ire; + + /* + * If the ULP says the (old) IRE resulted in reachability we + * record this before determine whether to use a new IRE. + * No locking for performance reasons. + */ + if (ixaflags & IXAF_REACH_CONF) + ire->ire_badcnt = 0; + + /* + * Has routing changed since we cached the results of the lookup? + * + * This check captures all of: + * - the cached ire being deleted (by means of the special + * IRE_GENERATION_CONDEMNED) + * - A potentially better ire being added (ire_generation being + * increased) + * - A deletion of the nexthop ire that was used when we did the + * lookup. + * - An addition of a potentially better nexthop ire. + * The last two are handled by walking and increasing the generation + * number on all dependant IREs in ire_flush_cache(). + * + * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE + * since we ensure that each time we set ixa_ire to such an IRE we + * make sure the ixa_ire_generation does not match (by using + * IRE_GENERATION_VERIFY). 
+ */ + if (ire->ire_generation != ixa->ixa_ire_generation) { + error = ip_verify_ire(mp, ixa); + if (error != 0) { + ip_drop_output("ipIfStatsOutDiscards - verify ire", + mp, NULL); + goto drop; + } + ire = ixa->ixa_ire; + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + ire->ire_ob_pkt_count++; + /* ixa_dce might be condemned; use default one */ + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, + &ipst->ips_dce_default->dce_ident)); + } + /* + * If the ncec changed then ip_verify_ire already set + * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + * so we can recheck the interface mtu. + */ + + /* + * Note that ire->ire_generation could already have changed. + * We catch that next time we send a packet. + */ + } + + /* + * No need to lock access to ixa_nce since the ip_xmit_attr usage + * is single threaded. + */ + ASSERT(ixa->ixa_nce != NULL); + nce = ixa->ixa_nce; + if (nce->nce_is_condemned) { + error = ip_verify_nce(mp, ixa); + /* + * In case ZEROCOPY capability become not available, we + * copy the message and free the original one. We might + * be copying more data than needed but it doesn't hurt + * since such change rarely happens. 
+ */ + switch (error) { + case 0: + break; + case ENOTSUP: { /* ZEROCOPY */ + mblk_t *nmp; + + if ((nmp = copymsg(mp)) != NULL) { + freemsg(mp); + mp = nmp; + + break; + } + /* FALLTHROUGH */ + } + default: + ip_drop_output("ipIfStatsOutDiscards - verify nce", + mp, NULL); + goto drop; + } + ire = ixa->ixa_ire; + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + ire->ire_ob_pkt_count++; + /* ixa_dce might be condemned; use default one */ + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, + ixa, &ipst->ips_dce_default->dce_ident)); + } + ASSERT(ixa->ixa_nce != NULL); + nce = ixa->ixa_nce; + + /* + * Note that some other event could already have made + * the new nce condemned. We catch that next time we + * try to send a packet. + */ + } + /* + * If there is no per-destination dce_t then we have a reference to + * the default dce_t (which merely contains the dce_ipid). + * The generation check captures both the introduction of a + * per-destination dce_t (e.g., due to ICMP packet too big) and + * any change to the per-destination dce (including it becoming + * condemned by use of the special DCE_GENERATION_CONDEMNED). + */ + dce = ixa->ixa_dce; + + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if ((dce->dce_flags & DCEF_PMTU) && + (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval)) { + /* + * Older than 20 minutes. Drop the path MTU information. + * Since the path MTU changes as a result of this, twiddle + * ixa_dce_generation to make us go through the dce + * verification code in conn_ip_output. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + } + + if (dce->dce_generation != ixa->ixa_dce_generation) { + error = ip_verify_dce(mp, ixa); + if (error != 0) { + ip_drop_output("ipIfStatsOutDiscards - verify dce", + mp, NULL); + goto drop; + } + dce = ixa->ixa_dce; + + /* + * Note that some other event could already have made the + * new dce's generation number change. + * We catch that next time we try to send a packet. + */ + } + + ill = nce->nce_ill; + + /* + * An initial ixa_fragsize was set in ip_set_destination + * and we update it if any routing changes above. + * A change to ill_mtu with ifconfig will increase all dce_generation + * so that we will detect that with the generation check. + */ + + /* + * Caller needs to make sure IXAF_VERIFY_SRC is not set if + * conn_unspec_src. + */ + if ((ixaflags & IXAF_VERIFY_SOURCE) && + ixa->ixa_src_generation != ipst->ips_src_generation) { + /* Check if the IP source is still assigned to the host. */ + uint_t gen; + + if (!ip_verify_src(mp, ixa, &gen)) { + /* Don't send a packet with a source that isn't ours */ + error = EADDRNOTAVAIL; + ip_drop_output("ipIfStatsOutDiscards - invalid src", + mp, NULL); + goto drop; + } + /* The source is still valid - update the generation number */ + ixa->ixa_src_generation = gen; + } + + /* + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. 
+ */ + ire->ire_ob_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v* - if RTF_MULTIRT + * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v* - for IRE_MULTICAST + * ire_send_broadcast_v4 - for IRE_BROADCAST + * ire_send_wire_v* - for the rest. + */ +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); + +drop: + if (ixaflags & IXAF_IS_IPV4) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + } + freemsg(mp); +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + return (error); +} + +/* + * Handle both IPv4 and IPv6. Sets the generation number + * to allow the caller to know when to call us again. + * Returns true if the source address in the packet is a valid source. + * We handle callers which try to send with a zero address (since we only + * get here if UNSPEC_SRC is not set). + */ +boolean_t +ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. 
+ */ + if (generationp != NULL) + *generationp = ipst->ips_src_generation; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + if (ipha->ipha_src == INADDR_ANY) + return (B_FALSE); + + return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, + ipst, B_FALSE) != IPVL_BAD); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint_t scopeid; + + if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) + return (B_FALSE); + + if (ixa->ixa_flags & IXAF_SCOPEID_SET) + scopeid = ixa->ixa_scopeid; + else + scopeid = 0; + + return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, + ipst, B_FALSE, scopeid) != IPVL_BAD); + } +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. + */ +int +ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + uint_t gen; + ire_t *ire; + nce_t *nce; + int error; + boolean_t multirt = B_FALSE; + + /* + * Redo ip_select_route. + * Need to grab generation number as part of the lookup to + * avoid race. + */ + error = 0; + ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + ire_refrele(ire); + return (error); + } + + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = gen; + if (multirt) { + if (ixa->ixa_flags & IXAF_IS_IPV4) + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + else + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + + /* + * Don't look for an nce for reject or blackhole. + * They have ire_generation set to IRE_GENERATION_VERIFY which + * makes conn_ip_output avoid references to ixa_nce. 
+ */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + return (0); + } + + /* The NCE could now be different */ + nce = ire_to_nce_pkt(ire, mp); + if (nce == NULL) { + /* + * Allocation failure. Make sure we redo ire/nce selection + * next time we send. + */ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + return (ENOBUFS); + } + if (nce == ixa->ixa_nce) { + /* No change */ + nce_refrele(nce); + return (0); + } + + /* + * Since the path MTU might change as a result of this + * route change, we twiddle ixa_dce_generation to + * make conn_ip_output go through the ip_verify_dce code. + */ + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; + return (0); +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. + */ +static int +ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ire_t *ire = ixa->ixa_ire; + nce_t *nce; + int error = 0; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + + if (ire->ire_ipversion == IPV4_VERSION) + ipha = (ipha_t *)mp->b_rptr; + else + ip6h = (ip6_t *)mp->b_rptr; + + nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); + if (nce == NULL) { + /* Try to find a better ire */ + return (ip_verify_ire(mp, ixa)); + } + + /* + * The hardware offloading capabilities, for example LSO, of the + * interface might have changed, so do sanity verification here. + */ + if (ixa->ixa_flags & IXAF_VERIFY_LSO) { + if (!ip_verify_lso(nce->nce_ill, ixa)) { + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_LSO, 0); + error = ENOTSUP; + } + } + + /* + * Verify ZEROCOPY capability of underlying ill. Notify the ULP with + * any ZEROCOPY changes. 
In case ZEROCOPY capability is not available + * any more, return error so that conn_ip_output() can take care of + * the ZEROCOPY message properly. It's safe to continue send the + * message when ZEROCOPY newly become available. + */ + if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { + if (!ip_verify_zcopy(nce->nce_ill, ixa)) { + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_ZCOPY, 0); + if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) + error = ENOTSUP; + } + } + + /* + * Since the path MTU might change as a result of this + * change, we twiddle ixa_dce_generation to + * make conn_ip_output go through the ip_verify_dce code. + */ + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; + return (error); +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. + */ +static int +ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + dce_t *dce; + uint_t gen; + uint_t pmtu; + + dce = dce_lookup_pkt(mp, ixa, &gen); + ASSERT(dce != NULL); + + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = gen; + + /* Extract the (path) mtu from the dce, ncec_ill etc */ + pmtu = ip_get_pmtu(ixa); + + /* + * Tell ULP about PMTU changes - increase or decrease - by returning + * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update + * both ixa_pmtu and ixa_fragsize appropriately. + * + * If ULP doesn't set that flag then we need to update ixa_fragsize + * since routing could have changed the ill after after ixa_fragsize + * was set previously in the conn_ip_output path or in + * ip_set_destination. + * + * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. + * + * In the case of a path MTU increase we send the packet after the + * notify to the ULP. 
+ */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { + if (ixa->ixa_pmtu != pmtu) { + uint_t oldmtu = ixa->ixa_pmtu; + + DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, + uint32_t, ixa->ixa_pmtu); + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_PMTU, pmtu); + if (pmtu < oldmtu) + return (EMSGSIZE); + } + } else { + ixa->ixa_fragsize = pmtu; + } + return (0); +} + +/* + * Verify LSO usability. Keep the return value simple to indicate whether + * the LSO capability has changed. Handle both IPv4 and IPv6. + */ +static boolean_t +ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) +{ + ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; + ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; + + if (ixa->ixa_flags & IXAF_LSO_CAPAB) { + /* + * Not unsable any more. + */ + if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || + (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || + (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + !ILL_LSO_TCP_IPV4_USABLE(ill) : + !ILL_LSO_TCP_IPV6_USABLE(ill))) { + ixa->ixa_flags &= ~IXAF_LSO_CAPAB; + + return (B_FALSE); + } + + /* + * Capability has changed, refresh the copy in ixa. + */ + if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { + *lsoc = *new_lsoc; + + return (B_FALSE); + } + } else { /* Was not usable */ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + ILL_LSO_TCP_IPV4_USABLE(ill) : + ILL_LSO_TCP_IPV6_USABLE(ill))) { + *lsoc = *new_lsoc; + ixa->ixa_flags |= IXAF_LSO_CAPAB; + + return (B_FALSE); + } + } + + return (B_TRUE); +} + +/* + * Verify ZEROCOPY usability. Keep the return value simple to indicate whether + * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. + */ +static boolean_t +ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) +{ + if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { + /* + * Not unsable any more. 
+ */ + if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || + (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || + (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || + !ILL_ZCOPY_USABLE(ill)) { + ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; + + return (B_FALSE); + } + } else { /* Was not usable */ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + ILL_ZCOPY_USABLE(ill)) { + ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; + + return (B_FALSE); + } + } + + return (B_TRUE); +} + + +/* + * When there is no conn_t context, this will send a packet. + * The caller must *not* have called conn_connect() or ip_attr_connect() + * before calling ip_output_simple(). + * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. + * Honors IXAF_SET_SOURCE. + * + * We acquire the ire and after calling ire_sendfn we release + * the hold on the ire. Ditto for the nce and dce. + * + * This assumes that the caller has set the following in ip_xmit_attr_t: + * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. + * If ixa_ifindex is non-zero it means send out that ill. (If it is + * an upper IPMP ill we load balance across the group; if a lower we send + * on that lower ill without load balancing.) + * IXAF_IS_IPV4 must be set correctly. + * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. + * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. + * If neither of those two are set we do an IPsec policy lookup. + * + * We handle setting things like + * ixa_pktlen + * ixa_ip_hdr_length + * ixa->ixa_protocol + * + * The caller may set ixa_xmit_hint, which is used for ECMP selection and + * transmit ring selecting in GLD. + * + * The caller must do an ixa_cleanup() to release any IPsec references + * after we return. 
+ */ +int +ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ts_label_t *effective_tsl = NULL; + int err; + + ASSERT(ixa->ixa_ipst != NULL); + + if (is_system_labeled()) { + ip_stack_t *ipst = ixa->ixa_ipst; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, + &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, + &effective_tsl); + } else { + err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, + &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, + &effective_tsl); + } + if (err != 0) { + ip2dbg(("tsol_check: label check failed (%d)\n", err)); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tsol_check_label", mp, NULL); + freemsg(mp); + return (err); + } + if (effective_tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } + } + + if (ixa->ixa_flags & IXAF_IS_IPV4) + return (ip_output_simple_v4(mp, ixa)); + else + return (ip_output_simple_v6(mp, ixa)); +} + +int +ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ipha_t *ipha; + ipaddr_t firsthop; /* In IP header */ + ipaddr_t dst; /* End of source route, or ipha_dst if none */ + ire_t *ire; + ipaddr_t setsrc; /* RTF_SETSRC */ + int error; + ill_t *ill = NULL; + dce_t *dce = NULL; + nce_t *nce; + iaflags_t ixaflags = ixa->ixa_flags; + ip_stack_t *ipst = ixa->ixa_ipst; + boolean_t repeat = B_FALSE; + boolean_t multirt = B_FALSE; + + ipha = (ipha_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + + /* + * Even on labeled systems we can have a NULL ixa_tsl e.g., + * for IGMP/MLD traffic. 
+ */ + + /* Caller already set flags */ + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); + + ASSERT(ixa->ixa_nce == NULL); + + ixa->ixa_pktlen = ntohs(ipha->ipha_length); + ASSERT(ixa->ixa_pktlen == msgdsize(mp)); + ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); + ixa->ixa_protocol = ipha->ipha_protocol; + + /* + * Assumes that source routed packets have already been massaged by + * the ULP (ip_massage_options) and as a result ipha_dst is the next + * hop in the source route. The final destination is used for IPsec + * policy and DCE lookup. + */ + firsthop = ipha->ipha_dst; + dst = ip_get_dst(ipha); + +repeat_ire: + error = 0; + setsrc = INADDR_ANY; + ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); + freemsg(mp); + goto done; + } + + if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { + /* ire_ill might be NULL hence need to skip some code */ + if (ixaflags & IXAF_SET_SOURCE) + ipha->ipha_src = htonl(INADDR_LOOPBACK); + ixa->ixa_fragsize = IP_MAXPACKET; + ill = NULL; + nce = NULL; + ire->ire_ob_pkt_count++; + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + /* No dce yet; use default one */ + error = (ire->ire_sendfn)(ire, mp, ipha, ixa, + &ipst->ips_dce_default->dce_ident); + goto done; + } + + /* Note that ipha_dst is only used for IRE_MULTICAST */ + nce = ire_to_nce(ire, ipha->ipha_dst, NULL); + if (nce == NULL) { + /* Allocation failure? 
*/ + ip_drop_output("ire_to_nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); + nce_refrele(nce); + if (nce1 == NULL) { + if (!repeat) { + /* Try finding a better IRE */ + repeat = B_TRUE; + ire_refrele(ire); + goto repeat_ire; + } + /* Tried twice - drop packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + nce = nce1; + } + + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v4 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + ASSERT(ixa->ixa_nce == NULL); + ixa->ixa_nce = nce; + + /* + * Check for a dce_t with a path mtu. + */ + dce = dce_lookup_v4(dst, ipst, NULL); + ASSERT(dce != NULL); + + if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else if (dce->dce_flags & DCEF_PMTU) { + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { + /* + * Older than 20 minutes. Drop the path MTU information. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else { + uint_t fragsize; + + fragsize = ip_get_base_mtu(nce->nce_ill, ire); + if (fragsize > dce->dce_pmtu) + fragsize = dce->dce_pmtu; + ixa->ixa_fragsize = fragsize; + } + } else { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } + + /* + * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp + * interface for source address selection. + */ + ill = ire_nexthop_ill(ire); + + if (ixaflags & IXAF_SET_SOURCE) { + ipaddr_t src; + + /* + * We use the final destination to get + * correct selection for source routed packets + */ + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src = htonl(INADDR_LOOPBACK); + error = 0; + } else { + error = ip_select_source_v4(ill, setsrc, dst, + ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, + &src, NULL, NULL); + } + if (error != 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no source", + mp, ill); + freemsg(mp); + goto done; + } + ipha->ipha_src = src; + } else if (ixaflags & IXAF_VERIFY_SOURCE) { + /* Check if the IP source is assigned to the host. */ + if (!ip_verify_src(mp, ixa, NULL)) { + /* Don't send a packet with a source that isn't ours */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - invalid source", + mp, ill); + freemsg(mp); + error = EADDRNOTAVAIL; + goto done; + } + } + + + /* + * Check against global IPsec policy to set the AH/ESP attributes. + * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 
+ */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); + if (mp == NULL) { + /* MIB and ip_drop_packet already done */ + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } + + if (ill != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + } + + /* + * We update the statistics on the most specific IRE i.e., the first + * one we found. + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. + */ + ire->ire_ob_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v4 - if RTF_MULTIRT + * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v4 - for IRE_MULTICAST + * ire_send_broadcast_v4 - for IRE_BROADCAST + * ire_send_wire_v4 - for the rest. + */ + error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); +done: + ire_refrele(ire); + if (dce != NULL) + dce_refrele(dce); + if (ill != NULL) + ill_refrele(ill); + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = NULL; + return (error); +} + +/* + * ire_sendfn() functions. + * These functions use the following xmit_attr: + * - ixa_fragsize - read to determine whether or not to fragment + * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec + * - ixa_ipsec_* are used inside IPsec + * - IXAF_SET_SOURCE - replace IP source in broadcast case. + * - IXAF_LOOPBACK_COPY - for multicast and broadcast + */ + + +/* + * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK + * + * The checks for restrict_interzone_loopback are done in ire_route_recursive. 
+ */ +/* ARGSUSED4 */ +int +ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + uint_t pktlen = ixa->ixa_pktlen; + + /* + * No fragmentation, no nce, no application of IPsec, + * and no ipha_ident assignment. + * + * Note different order between IP provider and FW_HOOKS than in + * send_wire case. + */ + + /* + * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the + * send probe, but not the receive probe. + */ + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 1); + + if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { + int error; + + DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_out_event, + ipst->ips_ipv4firewall_loopback_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ipha = (ipha_t *)mp->b_rptr; + pktlen = ntohs(ipha->ipha_length); + } + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); + } + + /* Handle lo0 stats */ + ipst->ips_loopback_packets++; + + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + iras.ira_pktlen = pktlen; + + if (!IS_SIMPLE_IPH(ipha)) { + ip_output_local_options(ipha, ipst); + iras.ira_flags |= IRAF_IPV4_OPTIONS; + } + + if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { + int error; + + DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_in_event, + ipst->ips_ipv4firewall_loopback_in, + ill, NULL, ipha, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); + if (mp == NULL) { + ira_cleanup(&iras, B_FALSE); + return (error); + } + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ipha = (ipha_t *)mp->b_rptr; + pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 1); + + ire->ire_ib_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); + + /* Destined to ire_zoneid - use that for fanout */ + iras.ira_zoneid = ire->ire_zoneid; + + if (is_system_labeled()) { + iras.ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. 
We don't expect this to ever fail for + * loopback packets, so we silently drop the packet should it + * fail. + */ + if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("tsol_get_pkt_label", mp, ill); + freemsg(mp); + return (0); + } + ASSERT(iras.ira_tsl != NULL); + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ipha = (ipha_t *)mp->b_rptr; + } + + ip_fanout_v4(mp, ipha, &iras); + + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (0); +} + +/* + * ire_sendfn for IRE_BROADCAST + * If the broadcast address is present on multiple ills and ixa_ifindex + * isn't set, then we generate + * a separate datagram (potentially with different source address) for + * those ills. In any case, only one copy is looped back to ip_input_v4. + */ +int +ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + irb_t *irb = ire->ire_bucket; + ire_t *ire1; + mblk_t *mp1; + ipha_t *ipha1; + iaflags_t ixaflags = ixa->ixa_flags; + nce_t *nce1, *nce_orig; + + /* + * Unless ire_send_multirt_v4 already set a ttl, force the + * ttl to a smallish value. + */ + if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { + /* + * To avoid broadcast storms, we usually set the TTL to 1 for + * broadcasts. This can + * be overridden stack-wide through the ip_broadcast_ttl + * ndd tunable, or on a per-connection basis through the + * IP_BROADCAST_TTL socket option. + * + * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 + * will force ttl to one after we've set this. + */ + if (ixaflags & IXAF_BROADCAST_TTL_SET) + ipha->ipha_ttl = ixa->ixa_broadcast_ttl; + else + ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; + } + /* + * Make sure we get a loopback copy (after IPsec and frag) + * Skip hardware checksum so that loopback copy is checksumed. 
+ */ + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + + /* Do we need to potentially generate multiple copies? */ + if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); + + /* + * Loop over all IRE_BROADCAST in the bucket (might only be one). + * Note that everything in the bucket has the same destination address. + */ + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* We do the main IRE after the end of the loop */ + if (ire1 == ire) + continue; + + /* + * Only IREs for the same IP address should be in the same + * bucket. + * But could have IRE_HOSTs in the case of CGTP. + * If we find any multirt routes we bail out of the loop + * and just do the single packet at the end; ip_postfrag_multirt + * will duplicate the packet. + */ + ASSERT(ire1->ire_addr == ire->ire_addr); + if (!(ire1->ire_type & IRE_BROADCAST)) + continue; + + if (IRE_IS_CONDEMNED(ire1)) + continue; + + if (ixa->ixa_zoneid != ALL_ZONES && + ire->ire_zoneid != ire1->ire_zoneid) + continue; + + ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); + + if (ire1->ire_flags & RTF_MULTIRT) + break; + + /* + * For IPMP we only send for the ipmp_ill. arp_nce_init() will + * ensure that this goes out on the cast_ill. + */ + if (IS_UNDER_IPMP(ire1->ire_ill)) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", + mp, ire1->ire_ill); + continue; + } + + ipha1 = (ipha_t *)mp1->b_rptr; + if (ixa->ixa_flags & IXAF_SET_SOURCE) { + /* + * Need to pick a different source address for each + * interface. If we have a global IPsec policy and + * no per-socket policy then we punt to + * ip_output_simple_v4 using a separate ip_xmit_attr_t. 
+ */ + if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { + ip_output_simple_broadcast(ixa, mp1); + continue; + } + /* Pick a new source address for each interface */ + if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, + ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, + &ipha1->ipha_src, NULL, NULL) != 0) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - select " + "broadcast source", mp1, ire1->ire_ill); + freemsg(mp1); + continue; + } + /* + * Check against global IPsec policy to set the AH/ESP + * attributes. IPsec will set IXAF_IPSEC_* and + * ixa_ipsec_* as appropriate. + */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp1 = ip_output_attach_policy(mp1, ipha, NULL, + NULL, ixa); + if (mp1 == NULL) { + /* + * MIB and ip_drop_packet already + * done + */ + continue; + } + } + } + /* Make sure we have an NCE on this ill */ + nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, + ire1->ire_type); + if (nce1 == NULL) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - broadcast nce", + mp1, ire1->ire_ill); + freemsg(mp1); + continue; + } + nce_orig = ixa->ixa_nce; + ixa->ixa_nce = nce1; + + ire_refhold(ire1); + /* + * Ignore any errors here. We just collect the errno for + * the main ire below + */ + (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); + ire_refrele(ire1); + + ixa->ixa_nce = nce_orig; + nce_refrele(nce1); + + ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; + } + irb_refrele(irb); + /* Finally, the main one */ + + /* + * For IPMP we only send broadcasts on the ipmp_ill. + */ + if (IS_UNDER_IPMP(ire->ire_ill)) { + freemsg(mp); + return (0); + } + + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * Send a packet using a different source address and different + * IPsec policy. 
+ */ +static void +ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) +{ + ip_xmit_attr_t ixas; + + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ixa->ixa_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ixa->ixa_ipst; + ixas.ixa_cred = ixa->ixa_cred; + ixas.ixa_cpid = ixa->ixa_cpid; + ixas.ixa_tsl = ixa->ixa_tsl; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); +} + + +static void +multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* Limit the TTL on multirt packets */ + if (ire->ire_type & IRE_MULTICAST) { + if (ipha->ipha_ttl > 1) { + ip2dbg(("ire_send_multirt_v4: forcing multicast " + "multirt TTL to 1 (was %d), dst 0x%08x\n", + ipha->ipha_ttl, ntohl(ire->ire_addr))); + ipha->ipha_ttl = 1; + } + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } else if ((ipst->ips_ip_multirt_ttl > 0) && + (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { + ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; + /* + * Need to ensure we don't increase the ttl should we go through + * ire_send_broadcast or multicast. + */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } +} + +/* + * ire_sendfn for IRE_MULTICAST + */ +int +ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + iaflags_t ixaflags = ixa->ixa_flags; + + /* + * The IRE_MULTICAST is the same whether or not multirt is in use. + * Hence we need special-case code. + */ + if (ixaflags & IXAF_MULTIRT_MULTICAST) + multirt_check_v4(ire, ipha, ixa); + + /* + * Check if anything in ip_input_v4 wants a copy of the transmitted + * packet (after IPsec and fragmentation) + * + * 1. 
Multicast routers always need a copy unless SO_DONTROUTE is set + * RSVP and the rsvp daemon is an example of a + * protocol and user level process that + * handles it's own routing. Hence, it uses the + * SO_DONTROUTE option to accomplish this. + * 2. If the sender has set IP_MULTICAST_LOOP, then we just + * check whether there are any receivers for the group on the ill + * (ignoring the zoneid). + * 3. If IP_MULTICAST_LOOP is not set, then we check if there are + * any members in other shared-IP zones. + * If such members exist, then we indicate that the sending zone + * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP + * behavior. + * + * When we loopback we skip hardware checksum to make sure loopback + * copy is checksumed. + * + * Note that ire_ill is the upper in the case of IPMP. + */ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); + if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && + !(ixaflags & IXAF_DONTROUTE)) { + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ixaflags & IXAF_MULTICAST_LOOP) { + /* + * If this zone or any other zone has members then loopback + * a copy. + */ + if (ill_hasmembers_v4(ill, ipha->ipha_dst)) + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ipst->ips_netstack->netstack_numzones > 1) { + /* + * This zone should not have a copy. But there are some other + * zones which might have members. 
+ */ + if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, + ixa->ixa_zoneid)) { + ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; + ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } + } + + /* + * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, + * force the ttl to the IP_MULTICAST_TTL value + */ + if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { + ipha->ipha_ttl = ixa->ixa_multicast_ttl; + } + + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_MULTIRT + */ +int +ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + + multirt_check_v4(ire, ipha, ixa); + + if (ire->ire_type & IRE_MULTICAST) + return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); + else if (ire->ire_type & IRE_BROADCAST) + return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); + else + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE + */ +int +ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill; + ip_recv_attr_t iras; + boolean_t dummy; + + /* We assign an IP ident for nice errors */ + ipha->ipha_ident = atomic_add_32_nv(identp, 1); + + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); + + if (ire->ire_type & IRE_NOROUTE) { + /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ + ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, + RTA_DST, ipst); + } + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); + freemsg(mp); + /* No error even for local senders - silent blackhole */ + return (0); + } + ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); + + /* + * We need an ill_t for the 
ip_recv_attr_t even though this packet + * was never received and icmp_unreachable doesn't currently use + * ira_ill. + */ + ill = ill_lookup_on_name("lo0", B_FALSE, + !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); + if (ill == NULL) { + freemsg(mp); + return (EHOSTUNREACH); + } + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + if (ip_source_routed(ipha, ipst)) { + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); + } else { + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); + } + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + ill_refrele(ill); + return (EHOSTUNREACH); +} + +/* + * Calculate a checksum ignoring any hardware capabilities + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. + */ +static boolean_t +ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + ipaddr_t dst = ipha->ipha_dst; + ipaddr_t src = ipha->ipha_src; + + /* Just in case it contained garbage */ + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + + /* + * Calculate ULP checksum + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. 
+ */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else { + goto ip_hdr_cksum; + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); + + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + /* + * For UDP/IPv4 a zero means that the packets wasn't checksummed. + * Change to 0xffff + */ + if (protocol == IPPROTO_UDP && cksum == 0) + *cksump = ~cksum; + else + *cksump = cksum; + + IP_STAT(ipst, ip_out_sw_cksum); + IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); + +ip_hdr_cksum: + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + return (B_TRUE); +} + +/* + * Calculate the ULP checksum - try to use hardware. + * In the case of MULTIRT, broadcast or multicast the + * IXAF_NO_HW_CKSUM is set in which case we use software. + * + * If the hardware supports IP header checksum offload; then clear the + * contents of IP header checksum field as expected by NIC. + * Do this only if we offloaded either full or partial sum. + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. 
+ */ +static boolean_t +ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, + ip_xmit_attr_t *ixa, ill_t *ill) +{ + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint16_t hck_flags; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + + if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); + } + + /* + * Calculate ULP checksum. Note that we don't use cksump and cksum + * if the ill has FULL support. + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else { + ip_hdr_cksum: + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * Underlying interface supports hardware checksum offload for + * the payload; leave the payload checksum for the hardware to + * calculate. N.B: We only need to set up checksum info on the + * first mblk. 
+ */ + hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; + + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + if (hck_flags & HCKSUM_INET_FULL_V4) { + /* + * Hardware calculates pseudo-header, header and the + * payload checksums, so clear the checksum field in + * the protocol header. + */ + *cksump = 0; + DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; + + ipha->ipha_hdr_checksum = 0; + if (hck_flags & HCKSUM_IPHDRCKSUM) { + DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; + } else { + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return (B_TRUE); + } + if ((hck_flags) & HCKSUM_INET_PARTIAL) { + ipaddr_t dst = ipha->ipha_dst; + ipaddr_t src = ipha->ipha_src; + /* + * Partial checksum offload has been enabled. Fill + * the checksum field in the protocol header with the + * pseudo-header checksum value. + * + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += *(cksump); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Offsets are relative to beginning of IP header. + */ + DB_CKSUMSTART(mp) = ip_hdr_length; + DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; + DB_CKSUMEND(mp) = pktlen; + DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; + + ipha->ipha_hdr_checksum = 0; + if (hck_flags & HCKSUM_IPHDRCKSUM) { + DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; + } else { + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return (B_TRUE); + } + /* Hardware capabilities include neither full nor partial IPv4 */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); +} + +/* + * ire_sendfn for offlink and onlink destinations. + * Also called from the multicast, broadcast, multirt send functions. + * + * Assumes that the caller has a hold on the ire. 
+ * + * This function doesn't care if the IRE just became condemned since that + * can happen at any time. + */ +/* ARGSUSED */ +int +ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ipha_t *ipha = (ipha_t *)iph_arg; + iaflags_t ixaflags = ixa->ixa_flags; + ill_t *ill; + + ASSERT(ixa->ixa_nce != NULL); + ill = ixa->ixa_nce->nce_ill; + + if (ixaflags & IXAF_DONTROUTE) + ipha->ipha_ttl = 1; + + /* + * Assign an ident value for this packet. There could be other + * threads targeting the same destination, so we have to arrange + * for a atomic increment. Note that we use a 32-bit atomic add + * because it has better performance than its 16-bit sibling. + * + * Normally ixa_extra_ident is 0, but in the case of LSO it will + * be the number of TCP segments that the driver/hardware will + * extraly construct. + * + * If running in cluster mode and if the source address + * belongs to a replicated service then vector through + * cl_inet_ipident vector to allocate ip identifier + * NOTE: This is a contract private interface with the + * clustering group. + */ + if (cl_inet_ipident != NULL) { + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; + + ASSERT(cl_inet_isclusterwide != NULL); + if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, + AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { + /* + * Note: not correct with LSO since we can't allocate + * ixa_extra_ident+1 consecutive values. 
+ */ + ipha->ipha_ident = (*cl_inet_ipident)(stack_id, + IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, + (uint8_t *)(uintptr_t)dst, NULL); + } else { + ipha->ipha_ident = atomic_add_32_nv(identp, + ixa->ixa_extra_ident + 1); + } + } else { + ipha->ipha_ident = atomic_add_32_nv(identp, + ixa->ixa_extra_ident + 1); + } +#ifndef _BIG_ENDIAN + ipha->ipha_ident = htons(ipha->ipha_ident); +#endif + + /* + * This might set b_band, thus the IPsec and fragmentation + * code in IP ensures that b_band is updated in the first mblk. + */ + if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return (0); /* Might just be delayed */ + } + } + + /* + * Verify any IPv4 options. + * + * The presense of IP options also forces the network stack to + * calculate the checksum in software. This is because: + * + * Wrap around: certain partial-checksum NICs (eri, ce) limit + * the size of "start offset" width to 6-bit. This effectively + * sets the largest value of the offset to 64-bytes, starting + * from the MAC header. When the cumulative MAC and IP headers + * exceed such limit, the offset will wrap around. This causes + * the checksum to be calculated at the wrong place. + * + * IPv4 source routing: none of the full-checksum capable NICs + * is capable of correctly handling the IPv4 source-routing + * option for purposes of calculating the pseudo-header; the + * actual destination is different from the destination in the + * header which is that of the next-hop. (This case may not be + * true for NICs which can parse IPv6 extension headers, but + * we choose to simplify the implementation by not offloading + * checksum when they are present.) 
+ */ + if (!IS_SIMPLE_IPH(ipha)) { + ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; + /* An IS_UNDER_IPMP ill is ok here */ + if (ip_output_options(mp, ipha, ixa, ill)) { + /* Packet has been consumed and ICMP error sent */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (EINVAL); + } + } + + /* + * To handle IPsec/iptun's labeling needs we need to tag packets + * while we still have ixa_tsl + */ + if (is_system_labeled() && ixa->ixa_tsl != NULL && + (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || + ill->ill_mactype == DL_IPV6)) { + cred_t *newcr; + + newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, + KM_NOSLEEP); + if (newcr == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - newcr", + mp, ill); + freemsg(mp); + return (ENOBUFS); + } + mblk_setcred(mp, newcr, NOPID); + crfree(newcr); /* mblk_setcred did its own crhold */ + } + + if (ixa->ixa_pktlen > ixa->ixa_fragsize || + (ixaflags & IXAF_IPSEC_SECURE)) { + uint32_t pktlen; + + pktlen = ixa->ixa_pktlen; + if (ixaflags & IXAF_IPSEC_SECURE) + pktlen += ipsec_out_extra_length(ixa); + + if (pktlen > IP_MAXPACKET) + return (EMSGSIZE); + + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* + * Compute ULP checksum and IP header checksum + * using software + */ + if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } else { + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + + /* + * If this packet would generate a icmp_frag_needed + * message, we need to handle it before we do the IPsec + * processing. Otherwise, we need to strip the IPsec + * headers before we send up the message to the ULPs + * which becomes messy and difficult. + * + * We check using IXAF_DONTFRAG. 
The DF bit in the header + * is not inspected - it will be copied to any generated + * fragments. + */ + if ((pktlen > ixa->ixa_fragsize) && + (ixaflags & IXAF_DONTFRAG)) { + /* Generate ICMP and return error */ + ip_recv_attr_t iras; + + DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); + icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (EMSGSIZE); + } + DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + if (ixaflags & IXAF_IPSEC_SECURE) { + /* + * Pass in sufficient information so that + * IPsec can determine whether to fragment, and + * which function to call after fragmentation. 
+ */ + return (ipsec_out_process(mp, ixa)); + } + return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, + ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, + ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, + ixa->ixa_postfragfn, &ixa->ixa_cookie)); + } + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* Compute ULP checksum and IP header checksum */ + /* An IS_UNDER_IPMP ill is ok here */ + if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } else { + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, + ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); +} + +/* + * Send mp into ip_input + * Common for IPv4 and IPv6 + */ +void +ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, zoneid_t nolzid) +{ + rtc_t rtc; + ill_t *ill = nce->nce_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + ncec_t *ncec; + + ncec = nce->nce_common; + iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | + IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; + if (ncec->ncec_flags & NCE_F_BCAST) + iras.ira_flags |= IRAF_L2DST_BROADCAST; + else if (ncec->ncec_flags & NCE_F_MCAST) + iras.ira_flags |= IRAF_L2DST_MULTICAST; + + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; + iras.ira_pktlen = pkt_len; + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + if (ixaflags & IXAF_IS_IPV4) + iras.ira_flags |= IRAF_IS_IPV4; + + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_mhip = NULL; + + iras.ira_flags 
|= ixaflags & IAF_MASK; + iras.ira_no_loop_zoneid = nolzid; + + /* Broadcast and multicast doesn't care about the squeue */ + iras.ira_sqp = NULL; + + rtc.rtc_ire = NULL; + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + rtc.rtc_ipaddr = INADDR_ANY; + + (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); + if (rtc.rtc_ire != NULL) { + ASSERT(rtc.rtc_ipaddr != INADDR_ANY); + ire_refrele(rtc.rtc_ire); + } + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + rtc.rtc_ip6addr = ipv6_all_zeros; + + (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); + if (rtc.rtc_ire != NULL) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); + ire_refrele(rtc.rtc_ire); + } + } + /* Any references to clean up? No hold on ira */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); +} + +/* + * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which + * looks at the IXAF_LOOPBACK_COPY flag. + * Common for IPv4 and IPv6. + * + * If the loopback copy fails (due to no memory) but we send the packet out + * on the wire we return no failure. Only in the case we supress the wire + * sending do we take the loopback failure into account. + * + * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. + * Those operations are performed on this packet in ip_xmit() and it would + * be odd to do it twice for the same packet. + */ +int +ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + ill_t *ill = nce->nce_ill; + int error = 0; + + /* + * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver + * had looped it back + */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. 
*/ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * If TTL = 0 then only do the loopback to this host i.e. we are + * done. We are also done if this was the + * loopback interface since it is sufficient + * to loopback one copy of a multicast packet. + */ + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + if (ipha->ipha_ttl == 0) { + ip_drop_output("multicast ipha_ttl not sent to wire", + mp, ill); + freemsg(mp); + return (error); + } + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + if (ip6h->ip6_hops == 0) { + ip_drop_output("multicast ipha_ttl not sent to wire", + mp, ill); + freemsg(mp); + return (error); + } + } + if (nce->nce_ill->ill_wq == NULL) { + /* Loopback interface */ + ip_drop_output("multicast on lo0 not sent to wire", mp, ill); + freemsg(mp); + return (error); + } + + return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, + ixacookie)); +} + +/* + * Post fragmentation function for RTF_MULTIRT routes. + * Since IRE_BROADCASTs can have RTF_MULTIRT, this function + * checks IXAF_LOOPBACK_COPY. + * + * If no packet is sent due to failures then we return an errno, but if at + * least one succeeded we return zero. 
+ */ +int +ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + irb_t *irb; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ire_t *ire; + ire_t *ire1; + mblk_t *mp1; + nce_t *nce1; + ill_t *ill = nce->nce_ill; + ill_t *ill1; + ip_stack_t *ipst = ill->ill_ipst; + int error = 0; + int num_sent = 0; + int err; + uint_t ire_type; + ipaddr_t nexthop; + + ASSERT(ixaflags & IXAF_IS_IPV4); + + /* Check for IXAF_LOOPBACK_COPY */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send + * a copy to each one. + * Use the nce (nexthop) and ipha_dst to find the ire. + * + * MULTIRT is not designed to work with shared-IP zones thus we don't + * need to pass a zoneid or a label to the IRE lookup. 
+ */ + if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { + /* Broadcast and multicast case */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); + + /* Unicast case */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); + } + + if (ire == NULL || + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_flags & RTF_MULTIRT)) { + /* Drop */ + ip_drop_output("ip_postfrag_multirt didn't find route", + mp, nce->nce_ill); + if (ire != NULL) + ire_refrele(ire); + return (ENETUNREACH); + } + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* + * For broadcast we can have a mixture of IRE_BROADCAST and + * IRE_HOST due to the manually added IRE_HOSTs that are used + * to trigger the creation of the special CGTP broadcast routes. + * Thus we have to skip if ire_type doesn't match the original. + */ + if (IRE_IS_CONDEMNED(ire1) || + !(ire1->ire_flags & RTF_MULTIRT) || + ire1->ire_type != ire->ire_type) + continue; + + /* Do the ire argument one after the loop */ + if (ire1 == ire) + continue; + + ill1 = ire_nexthop_ill(ire1); + if (ill1 == NULL) { + /* + * This ire might not have been picked by + * ire_route_recursive, in which case ire_dep might + * not have been setup yet. + * We kick ire_route_recursive to try to resolve + * starting at ire1. 
+ */ + ire_t *ire2; + + ire2 = ire_route_recursive_impl_v4(ire1, + ire1->ire_addr, ire1->ire_type, ire1->ire_ill, + ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + B_TRUE, 0, ipst, NULL, NULL, NULL); + if (ire2 != NULL) + ire_refrele(ire2); + ill1 = ire_nexthop_ill(ire1); + } + + if (ill1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no ill", + mp, ill); + error = ENETUNREACH; + continue; + } + + /* Pick the addr and type to use for arp_nce_init */ + if (nce->nce_common->ncec_flags & NCE_F_BCAST) { + ire_type = IRE_BROADCAST; + nexthop = ire1->ire_gateway_addr; + } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { + ire_type = IRE_MULTICAST; + nexthop = ipha->ipha_dst; + } else { + ire_type = ire1->ire_type; /* Doesn't matter */ + nexthop = ire1->ire_gateway_addr; + } + + /* If IPMP meta or under, then we just drop */ + if (ill1->ill_grp != NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - IPMP", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + + nce1 = arp_nce_init(ill1, nexthop, ire_type); + if (nce1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill1); + nce_refrele(nce1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + /* Preserve HW checksum for this copy */ + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + + ire1->ire_ob_pkt_count++; + err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, + 0, ixacookie); + if (err == 0) + num_sent++; + else + error = err; + nce_refrele(nce1); + 
ill_refrele(ill1); + } + irb_refrele(irb); + ire_refrele(ire); + /* Finally, the main one */ + err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, + ixacookie); + if (err == 0) + num_sent++; + else + error = err; + if (num_sent > 0) + return (0); + else + return (error); +} + +/* + * Verify local connectivity. This check is called by ULP fusion code. + * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if + * the interface is brought down and back up. So we simply fail the local + * process. The caller, TCP Fusion, should unfuse the connection. + */ +boolean_t +ip_output_verify_local(ip_xmit_attr_t *ixa) +{ + ire_t *ire = ixa->ixa_ire; + + if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) + return (B_FALSE); + + return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); +} + +/* + * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. + * + * The caller must call ip_output_verify_local() first. This function handles + * IPobs, FW_HOOKS, and/or IPsec cases sequentially. + */ +mblk_t * +ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, + boolean_t hooks_in, conn_t *peer_connp) +{ + ill_t *ill = ixa->ixa_ire->ire_ill; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + ip_stack_t *ipst = ixa->ixa_ipst; + iaflags_t ixaflags = ixa->ixa_flags; + ip_recv_attr_t iras; + int error; + + ASSERT(mp != NULL); + + if (ixaflags & IXAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ixa->ixa_ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, + ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, + NULL, int, 1); + + /* FW_HOOKS: LOOPBACK_OUT */ + if (hooks_out) { + DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_out_event, + ipst->ips_ipv4firewall_loopback_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + /* FW_HOOKS: LOOPBACK_IN */ + if (hooks_in) { + DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_in_event, + ipst->ips_ipv4firewall_loopback_in, + ill, NULL, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, + NULL, int, 1); + + /* Inbound IPsec polocies */ + if (peer_connp != NULL) { + /* Map ixa to ira including IPsec policies. */ + ipsec_out_to_in(ixa, ill, &iras); + mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, + NULL, &iras); + } + } else { + ip6h = (ip6_t *)mp->b_rptr; + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ixa->ixa_ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, + ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, + ip6h, int, 1); + + /* FW_HOOKS: LOOPBACK_OUT */ + if (hooks_out) { + DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); + FW_HOOKS6(ipst->ips_ip6_loopback_out_event, + ipst->ips_ipv6firewall_loopback_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + /* FW_HOOKS: LOOPBACK_IN */ + if (hooks_in) { + DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); + FW_HOOKS6(ipst->ips_ip6_loopback_in_event, + ipst->ips_ipv6firewall_loopback_in, + ill, NULL, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, + ip6h, int, 1); + + /* Inbound IPsec polocies */ + if (peer_connp != NULL) { + /* Map ixa to ira including IPsec policies. 
*/ + ipsec_out_to_in(ixa, ill, &iras); + mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, + ip6h, &iras); + } + } + + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", NULL, ill); + } + + return (mp); +} diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 70c8bd2ea1..228c7581a3 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -81,24 +81,33 @@ static size_t rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp); static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, - ipaddr_t author, const ipif_t *ipif, mblk_t *mp, uint_t, const tsol_gc_t *); + ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *); static int rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp, sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error); static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif); static int rts_getmetrics(ire_t *ire, rt_metrics_t *metrics); -static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, - sa_family_t af); +static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, + const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af); static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); -static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); +static ire_t *ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, + ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire, + ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp); +static ire_t *ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t 
*gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp); /* * Send `mp' to all eligible routing queues. A queue is ineligible if: * * 1. SO_USELOOPBACK is off and it is not the originating queue. - * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. - * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'. + * 3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'. * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. */ void @@ -110,7 +119,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, /* * Since we don't have an ill_t here, RTSQ_DEFAULT must already be - * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point. */ ASSERT(!(flags & RTSQ_DEFAULT)); @@ -119,7 +128,6 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, for (; connp != NULL; connp = next_connp) { next_connp = connp->conn_next; - /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to @@ -139,28 +147,27 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, if (!(flags & RTSQ_NORMAL)) continue; } - /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing * socket, we check if there is room upstream for a copy of the * message. 
*/ - if ((o_connp == connp) && connp->conn_loopback == 0) { + if ((o_connp == connp) && connp->conn_useloopback == 0) { connp = connp->conn_next; continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); /* Pass to rts_input */ - if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))|| - (!IPCL_IS_NONSTR(connp) && - canputnext(CONNP_TO_RQ(connp)))) { + if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld : + canputnext(connp->conn_rq)) { mp1 = dupmsg(mp); if (mp1 == NULL) mp1 = copymsg(mp); + /* Note that we pass a NULL ira to rts_input */ if (mp1 != NULL) - (connp->conn_recv)(connp, mp1, NULL); + (connp->conn_recv)(connp, mp1, NULL, NULL); } mutex_enter(&ipst->ips_rts_clients->connf_lock); @@ -176,7 +183,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, * Takes an ire and sends an ack to all the routing sockets. This * routine is used * - when a route is created/deleted through the ioctl interface. - * - when ire_expire deletes a stale redirect + * - when a stale redirect is deleted */ void ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) @@ -192,6 +199,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) ASSERT(ire->ire_ipversion == IPV4_VERSION || ire->ire_ipversion == IPV6_VERSION); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + if (ire->ire_flags & RTF_SETSRC) rtm_addrs |= RTA_SRC; @@ -202,8 +211,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask, - ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp, - 0, NULL); + ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL, + mp, NULL); break; case IPV6_VERSION: af = AF_INET6; @@ -215,8 +224,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) mutex_exit(&ire->ire_lock); rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6, &ire->ire_mask_v6, &gw_addr_v6, - &ire->ire_src_addr_v6, &ipv6_all_zeros, 
&ipv6_all_zeros, - NULL, mp, 0, NULL); + &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros, + &ipv6_all_zeros, NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; @@ -230,13 +239,6 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } -/* ARGSUSED */ -static void -ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy) -{ - (void) ip_rts_request(q, mp, msg_getcred(mp, NULL)); -} - /* * This is a call from the RTS module * indicating that this is a Routing Socket @@ -248,7 +250,7 @@ ip_rts_register(conn_t *connp) { ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - connp->conn_loopback = 1; + connp->conn_useloopback = 1; ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); } @@ -269,18 +271,9 @@ ip_rts_unregister(conn_t *connp) * * In general, this function does not consume the message supplied but rather * sends the message upstream with an appropriate UNIX errno. - * - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is ip_rts_request_retry. While the request is enqueud in the - * ipsq the ioctl could be aborted and the conn close. To ensure that we don't - * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is - * released at the completion of the rts ioctl at the end of this function - * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and - * conn close occurs in conn_ioctl_cleanup. 
*/ int -ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) +ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr) { rt_msghdr_t *rtm = NULL; in6_addr_t dst_addr_v6; @@ -289,9 +282,12 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) in6_addr_t net_mask_v6; in6_addr_t author_v6; in6_addr_t if_addr_v6; - mblk_t *mp1, *ioc_mp = mp; + mblk_t *mp1; ire_t *ire = NULL; - ire_t *sire = NULL; + ire_t *ifire = NULL; + ipaddr_t v4setsrc; + in6_addr_t v6setsrc = ipv6_all_zeros; + tsol_ire_gw_secattr_t *gwattr = NULL; int error = 0; int match_flags = MATCH_IRE_DSTONLY; int match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW; @@ -302,9 +298,6 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ipaddr_t src_addr; ipaddr_t net_mask; ushort_t index; - ipif_t *ipif = NULL; - ipif_t *tmp_ipif = NULL; - IOCP iocp = (IOCP)mp->b_rptr; boolean_t gcgrp_xtraref = B_FALSE; tsol_gcgrp_addr_t ga; tsol_rtsecattr_t rtsecattr; @@ -314,42 +307,11 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ts_label_t *tsl = NULL; zoneid_t zoneid; ip_stack_t *ipst; - - ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp))); + ill_t *ill = NULL; zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; - ASSERT(mp->b_cont != NULL); - /* ioc_mp holds mp */ - mp = mp->b_cont; - - /* - * The Routing Socket data starts on - * next block. If there is no next block - * this is an indication from routing module - * that it is a routing socket stream queue. - * We need to support that for compatibility with SDP since - * it has a contract private interface to use IP_IOC_RTS_REQUEST. - */ - if (mp->b_cont == NULL) { - /* - * This is a message from SDP - * indicating that this is a Routing Socket - * Stream. Insert this conn_t in routing - * socket client list. 
- */ - connp->conn_loopback = 1; - ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); - goto done; - } - mp1 = dupmsg(mp->b_cont); - if (mp1 == NULL) { - error = ENOBUFS; - goto done; - } - mp = mp1; - if (mp->b_cont != NULL && !pullupmsg(mp, -1)) { freemsg(mp); error = EINVAL; @@ -446,20 +408,13 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) */ ASSERT(af == AF_INET || af == AF_INET6); + /* Handle RTA_IFP */ if (index != 0) { - ill_t *ill; + ipif_t *ipif; lookup: - /* - * IPC must be refheld somewhere in ip_wput_nondata or - * ip_wput_ioctl etc... and cleaned up if ioctl is killed. - * If ILL_CHANGING the request is queued in the ipsq. - */ - ill = ill_lookup_on_ifindex(index, af == AF_INET6, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error, - ipst); + ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst); if (ill == NULL) { - if (error != EINPROGRESS) - error = EINVAL; + error = EINVAL; goto done; } @@ -474,13 +429,13 @@ lookup: switch (rtm->rtm_type) { case RTM_CHANGE: case RTM_DELETE: - ill_refrele(ill); error = EINVAL; goto done; case RTM_ADD: index = ipmp_ill_get_ipmp_ifindex(ill); ill_refrele(ill); if (index == 0) { + ill = NULL; /* already refrele'd */ error = EINVAL; goto done; } @@ -488,9 +443,18 @@ lookup: } } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); match_flags |= MATCH_IRE_ILL; + /* + * This provides the same zoneid as in Solaris 10 + * that -ifp picks the zoneid from the first ipif on the ill. + * But it might not be useful since the first ipif will always + * have the same zoneid as the ill. 
+ */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + zoneid = ipif->ipif_zoneid; + ipif_refrele(ipif); + } } /* @@ -545,6 +509,8 @@ lookup: switch (af) { case AF_INET: if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -556,20 +522,11 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow IFF_UP ones */ - tmp_ipif = ipif_lookup_addr(src_addr, NULL, - ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } } else { @@ -584,14 +541,15 @@ lookup: } error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr, - rtm->rtm_flags, ipif, &ire, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, - rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + rtm->rtm_flags, ill, &ire, B_FALSE, + rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; case AF_INET6: if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -603,28 +561,17 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow UP ones. 
*/ - tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6, - NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v6(&src_addr_v6, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, &src_addr_v6, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); + ill, &ire, rtsap, ipst, zoneid); break; } /* @@ -637,10 +584,9 @@ lookup: } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, NULL, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + ill, &ire, rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; } if (error != 0) @@ -666,13 +612,13 @@ lookup: switch (af) { case AF_INET: error = ip_rt_delete(dst_addr, net_mask, gw_addr, - found_addrs, rtm->rtm_flags, ipif, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + found_addrs, rtm->rtm_flags, ill, B_FALSE, + ipst, zoneid); break; case AF_INET6: error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6, - &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + &gw_addr_v6, found_addrs, rtm->rtm_flags, ill, + ipst, zoneid); break; } break; @@ -680,8 +626,7 @@ lookup: case RTM_CHANGE: /* * In the case of RTM_GET, the forwarding table should be - * searched recursively with default being matched if the - * specific route doesn't exist. Also, if a gateway was + * searched recursively. Also, if a gateway was * specified then the gateway address must also be matched. 
* * In the case of RTM_CHANGE, the gateway address (if supplied) @@ -706,9 +651,7 @@ lookup: } if (rtm->rtm_type == RTM_GET) { - match_flags |= - (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR); + match_flags |= MATCH_IRE_SECATTR; match_flags_local |= MATCH_IRE_SECATTR; if ((found_addrs & RTA_GATEWAY) != 0) match_flags |= MATCH_IRE_GW; @@ -749,57 +692,34 @@ lookup: * IRE_LOCAL entry. * * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL - * entry, then look in the forwarding table. + * entry, then look for any other type of IRE. */ switch (af) { case AF_INET: if (net_mask == IP_HOST_MASK) { - ire = ire_ctable_lookup(dst_addr, gw_addr, + ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr, IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid, - tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. - */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, &dst_addr, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + tsl, match_flags_local, 0, ipst, NULL); } if (ire == NULL) { - ire = ire_ftable_lookup(dst_addr, net_mask, - gw_addr, 0, ipif, &sire, zoneid, 0, - tsl, match_flags, ipst); + ire = ire_lookup_v4(dst_addr, net_mask, + gw_addr, ill, zoneid, tsl, match_flags, + ipst, &ifire, &v4setsrc, &gwattr); + IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc); } break; case AF_INET6: if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) { - ire = ire_ctable_lookup_v6(&dst_addr_v6, + ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL, &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL, - zoneid, tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. 
- */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, (void *)&dst_addr_v6, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + zoneid, tsl, match_flags_local, 0, ipst, + NULL); } if (ire == NULL) { - ire = ire_ftable_lookup_v6(&dst_addr_v6, - &net_mask_v6, &gw_addr_v6, 0, ipif, &sire, - zoneid, 0, tsl, match_flags, ipst); + ire = ire_lookup_v6(&dst_addr_v6, + &net_mask_v6, &gw_addr_v6, ill, zoneid, + tsl, match_flags, ipst, &ifire, &v6setsrc, + &gwattr); } break; } @@ -810,10 +730,21 @@ lookup: error = ESRCH; goto done; } + /* + * Want to return failure if we get an IRE_NOROUTE from + * ire_route_recursive + */ + if (ire->ire_type & IRE_NOROUTE) { + ire_refrele(ire); + ire = NULL; + error = ESRCH; + goto done; + } + /* we know the IRE before we come here */ switch (rtm->rtm_type) { case RTM_GET: - mp1 = rts_rtmget(mp, ire, sire, af); + mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af); if (mp1 == NULL) { error = ENOBUFS; goto done; @@ -843,7 +774,6 @@ lookup: */ switch (af) { case AF_INET: - ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); if ((found_addrs & RTA_GATEWAY) != 0 && (ire->ire_gateway_addr != gw_addr)) { ire->ire_gateway_addr = gw_addr; @@ -863,9 +793,10 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && - (ire->ire_src_addr != src_addr)) { - + (ire->ire_setsrc_addr != src_addr)) { if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -880,50 +811,47 @@ lookup: goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr( - src_addr, NULL, ALL_ZONES, - WR(q), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr = + src_addr; } else { ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr = + INADDR_ANY; } - ire->ire_src_addr = src_addr; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } + ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE); break; case AF_INET6: - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); mutex_enter(&ire->ire_lock); if ((found_addrs & RTA_GATEWAY) != 0 && !IN6_ARE_ADDR_EQUAL( &ire->ire_gateway_addr_v6, &gw_addr_v6)) { ire->ire_gateway_addr_v6 = gw_addr_v6; } + mutex_exit(&ire->ire_lock); if (rtsap != NULL) { ga.ga_af = AF_INET6; + mutex_enter(&ire->ire_lock); ga.ga_addr = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); gcgrp = gcgrp_lookup(&ga, B_TRUE); if (gcgrp == NULL) { @@ -935,10 +863,11 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && !IN6_ARE_ADDR_EQUAL( - &ire->ire_src_addr_v6, &src_addr_v6)) { - + &ire->ire_setsrc_addr_v6, &src_addr_v6)) { if (!IN6_IS_ADDR_UNSPECIFIED( &src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -949,54 +878,44 @@ lookup: */ if (IN6_IS_ADDR_LOOPBACK( &src_addr_v6)) { - mutex_exit( - &ire->ire_lock); error = EINVAL; goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr_v6( - &src_addr_v6, NULL, - ALL_ZONES, - CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - mutex_exit( - &ire->ire_lock); - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - mutex_exit( - &ire->ire_lock); - error = EINVAL; + type = ip_type_v6(&src_addr_v6, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } + mutex_enter(&ire->ire_lock); ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + src_addr_v6; + mutex_exit(&ire->ire_lock); } else { + mutex_enter(&ire->ire_lock); ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + ipv6_all_zeros; + mutex_exit(&ire->ire_lock); } - ire->ire_src_addr_v6 = src_addr_v6; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } - mutex_exit(&ire->ire_lock); + ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE); break; } if (rtsap != NULL) { - in_addr_t ga_addr4; - ASSERT(gcgrp != NULL); /* @@ -1010,7 +929,7 @@ lookup: gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref); if (gc == NULL || (error = tsol_ire_init_gwattr(ire, - ire->ire_ipversion, gc, NULL)) != 0) { + ire->ire_ipversion, gc)) != 0) { if (gc != NULL) { GC_REFRELE(gc); } else { @@ -1019,21 +938,6 @@ lookup: } goto done; } - - /* - * Now delete any existing gateway IRE caches - * as well as all caches using the gateway, - * and allow them to be created on demand - * through ip_newroute{_v6}. 
- */ - IN6_V4MAPPED_TO_IPADDR(&ga.ga_addr, ga_addr4); - if (af == AF_INET) { - ire_clookup_delete_cache_gw( - ga_addr4, ALL_ZONES, ipst); - } else { - ire_clookup_delete_cache_gw_v6( - &ga.ga_addr, ALL_ZONES, ipst); - } } rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx); break; @@ -1046,21 +950,14 @@ lookup: done: if (ire != NULL) ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - if (ipif != NULL) - ipif_refrele(ipif); - if (tmp_ipif != NULL) - ipif_refrele(tmp_ipif); + if (ifire != NULL) + ire_refrele(ifire); + if (ill != NULL) + ill_refrele(ill); if (gcgrp_xtraref) GCGRP_REFRELE(gcgrp); - if (error == EINPROGRESS) { - if (rtm != NULL) - freemsg(mp); - return (error); - } if (rtm != NULL) { ASSERT(mp->b_wptr <= mp->b_datap->db_lim); if (error != 0) { @@ -1074,12 +971,190 @@ done: } rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } + return (error); +} + +/* + * Helper function that can do recursive lookups including when + * MATCH_IRE_GW and/or MATCH_IRE_MASK is set. + */ +static ire_t * +ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, + int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp, + tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v4setsrcp = INADDR_ANY; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr, + ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(ire->ire_setsrc_addr 
!= INADDR_ANY); + *v4setsrcp = ire->ire_setsrc_addr; + v4setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = NULL; + } + + /* Look for an interface ire recursively based on the gateway */ + dst_addr = ire->ire_gateway_addr; + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } else { + ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + +static ire_t * +ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v6setsrcp = ipv6_all_zeros; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + in6_addr_t dst; + + ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6, + gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0, + ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *v6setsrcp = ire->ire_setsrc_addr_v6; + v6setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = 
NULL; + } + + mutex_enter(&ire->ire_lock); + dst = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl, + match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, NULL); + } else { + ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + + +/* + * Handle IP_IOC_RTS_REQUEST ioctls + */ +int +ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +{ + conn_t *connp = Q_TO_CONN(q); + IOCP iocp = (IOCP)mp->b_rptr; + mblk_t *mp1, *ioc_mp = mp; + int error = 0; + ip_stack_t *ipst; + ipst = connp->conn_netstack->netstack_ip; + + ASSERT(mp->b_cont != NULL); + /* ioc_mp holds mp */ + mp = mp->b_cont; + + /* + * The Routing Socket data starts on + * next block. If there is no next block + * this is an indication from routing module + * that it is a routing socket stream queue. + * We need to support that for compatibility with SDP since + * it has a contract private interface to use IP_IOC_RTS_REQUEST. + * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this. + */ + if (mp->b_cont == NULL) { + /* + * This is a message from SDP + * indicating that this is a Routing Socket + * Stream. Insert this conn_t in routing + * socket client list. + */ + connp->conn_useloopback = 1; + ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); + goto done; + } + mp1 = dupmsg(mp->b_cont); + if (mp1 == NULL) { + error = ENOBUFS; + goto done; + } + mp = mp1; + + error = ip_rts_request_common(mp, connp, ioc_cr); +done: iocp->ioc_error = error; ioc_mp->b_datap->db_type = M_IOCACK; if (iocp->ioc_error != 0) iocp->ioc_count = 0; - (connp->conn_recv)(connp, ioc_mp, NULL); + /* Note that we pass a NULL ira to rts_input */ + (connp->conn_recv)(connp, ioc_mp, NULL, NULL); /* conn was refheld in ip_wput_ioctl. 
*/ CONN_OPER_PENDING_DONE(connp); @@ -1087,12 +1162,6 @@ done: return (error); } -int -ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) -{ - return (ip_rts_request_common(q, mp, Q_TO_CONN(q), ioc_cr)); -} - /* * Build a reply to the RTM_GET request contained in the given message block * using the retrieved IRE of the destination address, the parent IRE (if it @@ -1102,26 +1171,34 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) * otherwise NULL is returned. */ static mblk_t * -rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) +rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc, + tsol_ire_gw_secattr_t *attrp, sa_family_t af) { rt_msghdr_t *rtm; rt_msghdr_t *new_rtm; mblk_t *new_mp; int rtm_addrs; int rtm_flags; - in6_addr_t gw_addr_v6; - tsol_ire_gw_secattr_t *attrp = NULL; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - int sacnt = 0; + ill_t *ill; + ipif_t *ipif = NULL; + ipaddr_t brdaddr; /* IFF_POINTOPOINT destination */ + ipaddr_t ifaddr; + in6_addr_t brdaddr6; /* IFF_POINTOPOINT destination */ + in6_addr_t ifaddr6; + ipaddr_t v4setsrc; - ASSERT(ire->ire_ipif != NULL); rtm = (rt_msghdr_t *)mp->b_rptr; - if (sire != NULL && sire->ire_gw_secattr != NULL) - attrp = sire->ire_gw_secattr; - else if (ire->ire_gw_secattr != NULL) - attrp = ire->ire_gw_secattr; + /* + * Find the ill used to send packets. This will be NULL in case + * of a reject or blackhole. 
+ */ + if (ifire != NULL) + ill = ire_nexthop_ill(ifire); + else + ill = ire_nexthop_ill(ire); if (attrp != NULL) { mutex_enter(&attrp->igsa_lock); @@ -1129,29 +1206,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); } - ASSERT(sacnt == 0 || gc != NULL); /* * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK. @@ -1162,16 +1219,36 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) * point-to-point. */ rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK); - if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) { rtm_addrs |= (RTA_IFP | RTA_IFA); - if (ire->ire_ipif->ipif_flags & IPIF_POINTOPOINT) - rtm_addrs |= RTA_BRD; + /* + * We associate an IRE with an ILL, hence we don't exactly + * know what might make sense for RTA_IFA and RTA_BRD. We + * pick the first ipif on the ill. + */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + if (ipif->ipif_isv6) + ifaddr6 = ipif->ipif_v6lcl_addr; + else + ifaddr = ipif->ipif_lcl_addr; + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + rtm_addrs |= RTA_BRD; + if (ipif->ipif_isv6) + brdaddr6 = ipif->ipif_v6pp_dst_addr; + else + brdaddr = ipif->ipif_pp_dst_addr; + } + ipif_refrele(ipif); + } } - new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, sacnt); + new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 
1 : 0); if (new_mp == NULL) { if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); + if (ill != NULL) + ill_refrele(ill); return (NULL); } @@ -1187,49 +1264,24 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) ASSERT(af == AF_INET || af == AF_INET6); switch (af) { case AF_INET: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, - ire->ire_mask, ire->ire_src_addr, ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, 0, ire->ire_ipif, - new_mp, sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, sire->ire_addr, - sire->ire_mask, sire->ire_gateway_addr, - (sire->ire_flags & RTF_SETSRC) ? - sire->ire_src_addr : ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, - 0, ire->ire_ipif, new_mp, sacnt, gc); - } + IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc); + if (v4setsrc != INADDR_ANY) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, + ire->ire_mask, ire->ire_gateway_addr, v4setsrc, + brdaddr, 0, ifaddr, ill, new_mp, gc); break; case AF_INET6: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, - &ire->ire_mask_v6, &ire->ire_src_addr_v6, - &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, - &ipv6_all_zeros, ire->ire_ipif, new_mp, - sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - mutex_enter(&sire->ire_lock); - gw_addr_v6 = sire->ire_gateway_addr_v6; - mutex_exit(&sire->ire_lock); - rts_fill_msg_v6(RTM_GET, rtm_addrs, &sire->ire_addr_v6, - &sire->ire_mask_v6, &gw_addr_v6, - (sire->ire_flags & RTF_SETSRC) ? 
- &sire->ire_src_addr_v6 : &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ire->ire_ipif, new_mp, sacnt, gc); - } + if (!IN6_IS_ADDR_UNSPECIFIED(setsrc)) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, + &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, + setsrc, &brdaddr6, &ipv6_all_zeros, + &ifaddr6, ill, new_mp, gc); break; } @@ -1259,11 +1311,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) new_rtm->rtm_use = rtm->rtm_use; new_rtm->rtm_addrs = rtm_addrs; new_rtm->rtm_flags = rtm_flags; - if (sire == NULL) - new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); - else - new_rtm->rtm_inits = rts_getmetrics(sire, &new_rtm->rtm_rmx); - + new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); + if (ill != NULL) + ill_refrele(ill); return (new_mp); } @@ -1273,10 +1323,11 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif) { - if_data->ifi_type = ipif->ipif_type; /* ethernet, tokenring, etc */ + if_data->ifi_type = ipif->ipif_ill->ill_type; + /* ethernet, tokenring, etc */ if_data->ifi_addrlen = 0; /* media address length */ if_data->ifi_hdrlen = 0; /* media header length */ - if_data->ifi_mtu = ipif->ipif_mtu; /* maximum transmission unit */ + if_data->ifi_mtu = ipif->ipif_ill->ill_mtu; /* mtu */ if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */ if_data->ifi_baudrate = 0; /* linespeed */ @@ -1302,18 +1353,19 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) { clock_t rtt; clock_t rtt_sd; - ipif_t *ipif; + ill_t *ill; ifrt_t *ifrt; mblk_t *mp; in6_addr_t gw_addr_v6; + /* Need to add back some metrics to the IRE? */ /* - * Bypass obtaining the lock and searching ipif_saved_ire_mp in the + * Bypass obtaining the lock and searching ill_saved_ire_mp in the * common case of no metrics. 
*/ if (which == 0) return; - ire->ire_uinfo.iulp_set = B_TRUE; + ire->ire_metrics.iulp_set = B_TRUE; /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's @@ -1330,42 +1382,41 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) */ mutex_enter(&ire->ire_lock); if (which & RTV_MTU) - ire->ire_max_frag = metrics->rmx_mtu; + ire->ire_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ire->ire_uinfo.iulp_rtt = rtt; + ire->ire_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) - ire->ire_uinfo.iulp_ssthresh = metrics->rmx_ssthresh; + ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh; if (which & RTV_RTTVAR) - ire->ire_uinfo.iulp_rtt_sd = rtt_sd; + ire->ire_metrics.iulp_rtt_sd = rtt_sd; if (which & RTV_SPIPE) - ire->ire_uinfo.iulp_spipe = metrics->rmx_sendpipe; + ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ire->ire_uinfo.iulp_rpipe = metrics->rmx_recvpipe; + ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe; mutex_exit(&ire->ire_lock); /* - * Search through the ifrt_t chain hanging off the IPIF in order to + * Search through the ifrt_t chain hanging off the ILL in order to * reflect the metric change there. 
*/ - ipif = ire->ire_ipif; - if (ipif == NULL) + ill = ire->ire_ill; + if (ill == NULL) return; - ASSERT((ipif->ipif_isv6 && ire->ire_ipversion == IPV6_VERSION) || - ((!ipif->ipif_isv6 && ire->ire_ipversion == IPV4_VERSION))); - if (ipif->ipif_isv6) { + ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) || + ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION))); + if (ill->ill_isv6) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { + mutex_enter(&ill->ill_saved_ire_lock); + for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { /* - * On a given ipif, the triple of address, gateway and mask is - * unique for each saved IRE (in the case of ordinary interface - * routes, the gateway address is all-zeroes). + * On a given ill, the tuple of address, gateway, mask, + * ire_type and zoneid unique for each saved IRE. 
*/ ifrt = (ifrt_t *)mp->b_rptr; - if (ipif->ipif_isv6) { + if (ill->ill_isv6) { if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, &ire->ire_addr_v6) || !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, @@ -1379,23 +1430,36 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) ifrt->ifrt_mask != ire->ire_mask) continue; } + if (ifrt->ifrt_zoneid != ire->ire_zoneid || + ifrt->ifrt_type != ire->ire_type) + continue; + if (which & RTV_MTU) - ifrt->ifrt_max_frag = metrics->rmx_mtu; + ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ifrt->ifrt_iulp_info.iulp_rtt = rtt; + ifrt->ifrt_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) { - ifrt->ifrt_iulp_info.iulp_ssthresh = + ifrt->ifrt_metrics.iulp_ssthresh = metrics->rmx_ssthresh; } if (which & RTV_RTTVAR) - ifrt->ifrt_iulp_info.iulp_rtt_sd = metrics->rmx_rttvar; + ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar; if (which & RTV_SPIPE) - ifrt->ifrt_iulp_info.iulp_spipe = metrics->rmx_sendpipe; + ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ifrt->ifrt_iulp_info.iulp_rpipe = metrics->rmx_recvpipe; + ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe; break; } - mutex_exit(&ipif->ipif_saved_ire_lock); + mutex_exit(&ill->ill_saved_ire_lock); + + /* + * Update any IRE_IF_CLONE hanging created from this IRE_IF so they + * get any new iulp_mtu. + * We do that by deleting them; ire_create_if_clone will pick + * up the new metrics. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); } /* @@ -1407,27 +1471,69 @@ rts_getmetrics(ire_t *ire, rt_metrics_t *metrics) int metrics_set = 0; bzero(metrics, sizeof (rt_metrics_t)); + /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as * microseconds. 
*/ - metrics->rmx_rtt = ire->ire_uinfo.iulp_rtt * 1000; + metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000; metrics_set |= RTV_RTT; - metrics->rmx_mtu = ire->ire_max_frag; + metrics->rmx_mtu = ire->ire_metrics.iulp_mtu; metrics_set |= RTV_MTU; - metrics->rmx_ssthresh = ire->ire_uinfo.iulp_ssthresh; + metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh; metrics_set |= RTV_SSTHRESH; - metrics->rmx_rttvar = ire->ire_uinfo.iulp_rtt_sd * 1000; + metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000; metrics_set |= RTV_RTTVAR; - metrics->rmx_sendpipe = ire->ire_uinfo.iulp_spipe; + metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe; metrics_set |= RTV_SPIPE; - metrics->rmx_recvpipe = ire->ire_uinfo.iulp_rpipe; + metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe; metrics_set |= RTV_RPIPE; return (metrics_set); } /* + * Given two sets of metrics (src and dst), use the dst values if they are + * set. If a dst value is not set but the src value is set, then we use + * the src value. + * dst is updated with the new values. + * This is used to merge information from a dce_t and ire_metrics, where the + * dce values takes precedence. 
+ */ +void +rts_merge_metrics(iulp_t *dst, const iulp_t *src) +{ + if (!src->iulp_set) + return; + + if (dst->iulp_ssthresh == 0) + dst->iulp_ssthresh = src->iulp_ssthresh; + if (dst->iulp_rtt == 0) + dst->iulp_rtt = src->iulp_rtt; + if (dst->iulp_rtt_sd == 0) + dst->iulp_rtt_sd = src->iulp_rtt_sd; + if (dst->iulp_spipe == 0) + dst->iulp_spipe = src->iulp_spipe; + if (dst->iulp_rpipe == 0) + dst->iulp_rpipe = src->iulp_rpipe; + if (dst->iulp_rtomax == 0) + dst->iulp_rtomax = src->iulp_rtomax; + if (dst->iulp_sack == 0) + dst->iulp_sack = src->iulp_sack; + if (dst->iulp_tstamp_ok == 0) + dst->iulp_tstamp_ok = src->iulp_tstamp_ok; + if (dst->iulp_wscale_ok == 0) + dst->iulp_wscale_ok = src->iulp_wscale_ok; + if (dst->iulp_ecn_ok == 0) + dst->iulp_ecn_ok = src->iulp_ecn_ok; + if (dst->iulp_pmtud_ok == 0) + dst->iulp_pmtud_ok = src->iulp_pmtud_ok; + if (dst->iulp_mtu == 0) + dst->iulp_mtu = src->iulp_mtu; +} + + +/* * Takes a pointer to a routing message and extracts necessary info by looking * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers * passed (all of which must be valid). @@ -1552,7 +1658,8 @@ rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author, - const ipif_t *ipif, mblk_t *mp, uint_t sacnt, const tsol_gc_t *gc) + ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *gc) { rt_msghdr_t *rtm; sin_t *sin; @@ -1561,7 +1668,6 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, int i; ASSERT(mp != NULL); - ASSERT(sacnt == 0 || gc != NULL); /* * First find the type of the message * and its length. @@ -1571,7 +1677,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, * Now find the size of the data * that follows the message header. 
*/ - data_size = rts_data_msg_size(rtm_addrs, AF_INET, sacnt); + data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0); rtm = (rt_msghdr_t *)mp->b_rptr; mp->b_wptr = &mp->b_rptr[header_size]; @@ -1596,9 +1702,13 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, cp += sizeof (sin_t); break; case RTA_IFP: - cp += ill_dls_info((struct sockaddr_dl *)cp, ipif); + cp += ill_dls_info((struct sockaddr_dl *)cp, ill); break; case RTA_IFA: + sin->sin_addr.s_addr = ifaddr; + sin->sin_family = AF_INET; + cp += sizeof (sin_t); + break; case RTA_SRC: sin->sin_addr.s_addr = src_addr; sin->sin_family = AF_INET; @@ -1625,24 +1735,20 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, rtm_ext_t *rtm_ext; struct rtsa_s *rp_dst; tsol_rtsecattr_t *rsap; - int i; ASSERT(gc->gc_grp != NULL); ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock)); - ASSERT(sacnt > 0); rtm_ext = (rtm_ext_t *)cp; rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR; - rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt); + rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1); rsap = (tsol_rtsecattr_t *)(rtm_ext + 1); - rsap->rtsa_cnt = sacnt; + rsap->rtsa_cnt = 1; rp_dst = rsap->rtsa_attr; - for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) { - ASSERT(gc->gc_db != NULL); - bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); - } + ASSERT(gc->gc_db != NULL); + bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); cp = (uchar_t *)rp_dst; } @@ -1659,6 +1765,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, /* * Allocates and initializes a routing socket message. + * Note that sacnt is either zero or one. 
*/ mblk_t * rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt) @@ -1755,7 +1862,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0, - author, NULL, mp, 0, NULL); + author, 0, NULL, mp, NULL); rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_flags = flags; rtm->rtm_errno = error; @@ -1784,12 +1891,12 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only when the physical interface - * is changing state. + * This message should be generated only + * when the physical device is changing + * state. */ if (ipif->ipif_id != 0) return; - if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1797,14 +1904,15 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) return; rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, &ipv6_all_zeros, ipif, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, + ipif->ipif_ill, mp, NULL); } else { af = AF_INET; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); if (mp == NULL) return; - rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, ipif, mp, - 0, NULL); + rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0, + ipif->ipif_ill, mp, NULL); } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; @@ -1843,6 +1951,12 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) sa_family_t af; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + /* + * Let conn_ixa caching know that source address selection + * changed + */ + ip_update_source_selection(ipst); + if (ipif->ipif_isv6) af = AF_INET6; else @@ -1875,15 +1989,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) 
case AF_INET: rts_fill_msg(ncmd, rtm_addrs, 0, ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr, - ipif->ipif_pp_dst_addr, 0, ipif, mp, - 0, NULL); + ipif->ipif_pp_dst_addr, 0, + ipif->ipif_lcl_addr, ipif->ipif_ill, + mp, NULL); break; case AF_INET6: rts_fill_msg_v6(ncmd, rtm_addrs, &ipv6_all_zeros, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ipif, mp, 0, NULL); + &ipif->ipif_v6lcl_addr, ipif->ipif_ill, + mp, NULL); break; } ifam = (ifa_msghdr_t *)mp->b_rptr; @@ -1904,14 +2020,15 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) case AF_INET: rts_fill_msg(cmd, rtm_addrs, ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0, - 0, 0, 0, NULL, mp, 0, NULL); + 0, 0, 0, 0, NULL, mp, NULL); break; case AF_INET6: rts_fill_msg_v6(cmd, rtm_addrs, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, NULL, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, + NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; diff --git a/usr/src/uts/common/inet/ip/ip_sadb.c b/usr/src/uts/common/inet/ip/ip_sadb.c index 35b822902a..e099d04427 100644 --- a/usr/src/uts/common/inet/ip/ip_sadb.c +++ b/usr/src/uts/common/inet/ip/ip_sadb.c @@ -36,7 +36,6 @@ #include <inet/ip6.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> #include <inet/ipdrop.h> @@ -57,35 +56,21 @@ ipsec_match_outbound_ids(ipsec_latch_t *ipl, ipsa_t *sa) ipsid_equal(ipl->ipl_remote_cid, sa->ipsa_dst_cid); } -/* cr1 is packet cred; cr2 is SA credential */ +/* l1 is packet label; l2 is SA label */ boolean_t -ipsec_label_match(cred_t *cr1, cred_t *cr2) +ipsec_label_match(ts_label_t *l1, ts_label_t *l2) { - ts_label_t *l1, *l2; - if (!is_system_labeled()) return (B_TRUE); /* - * Check for NULL creds. Unlabeled SA always matches; + * Check for NULL label. 
Unlabeled SA (l2) always matches; * unlabeled user with labeled SA always fails */ - if (cr2 == NULL) + if (l2 == NULL) return (B_TRUE); - if (cr1 == NULL) - return (B_FALSE); - - /* If we reach here, we have two passed-in creds. */ - ASSERT(cr2 != NULL && cr1 != NULL); - - /* Check for NULL labels. Two is good, one is bad, zero is good. */ - l1 = crgetlabel(cr1); - l2 = crgetlabel(cr2); if (l1 == NULL) - return (l2 == NULL); - - if (l2 == NULL) return (B_FALSE); /* Simple IPsec MLS policy: labels must be equal */ @@ -109,32 +94,32 @@ ipsec_label_match(cred_t *cr1, cred_t *cr2) * The SA ptr I return will have its reference count incremented by one. */ ipsa_t * -ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, - uint32_t *dst, sa_family_t af, uint8_t protocol, cred_t *cr) +ipsec_getassocbyconn(isaf_t *bucket, ip_xmit_attr_t *ixa, uint32_t *src, + uint32_t *dst, sa_family_t af, uint8_t protocol, ts_label_t *tsl) { ipsa_t *retval, *candidate; ipsec_action_t *candact; boolean_t need_unique; - boolean_t tunnel_mode = io->ipsec_out_tunnel; + boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL); uint64_t unique_id; uint32_t old_flags, excludeflags; - ipsec_policy_t *pp = io->ipsec_out_policy; - ipsec_action_t *actlist = io->ipsec_out_act; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; + ipsec_action_t *actlist = ixa->ixa_ipsec_action; ipsec_action_t *act; - ipsec_latch_t *ipl = io->ipsec_out_latch; + ipsec_latch_t *ipl = ixa->ixa_ipsec_latch; ipsa_ref_t *ipr = NULL; - sa_family_t inaf = io->ipsec_out_inaf; - uint32_t *insrc = io->ipsec_out_insrc; - uint32_t *indst = io->ipsec_out_indst; - uint8_t insrcpfx = io->ipsec_out_insrcpfx; - uint8_t indstpfx = io->ipsec_out_indstpfx; + sa_family_t inaf = ixa->ixa_ipsec_inaf; + uint32_t *insrc = ixa->ixa_ipsec_insrc; + uint32_t *indst = ixa->ixa_ipsec_indst; + uint8_t insrcpfx = ixa->ixa_ipsec_insrcpfx; + uint8_t indstpfx = ixa->ixa_ipsec_indstpfx; ASSERT(MUTEX_HELD(&bucket->isaf_lock)); /* - * Caller 
must set ipsec_out_t structure such that we know + * Caller must set ip_xmit_attr_t structure such that we know * whether this is tunnel mode or transport mode based on - * io->ipsec_out_tunnel. If this flag is set, we assume that + * IXAF_IPSEC_TUNNEL. If this flag is set, we assume that * there are valid inner src and destination addresses to compare. */ @@ -145,7 +130,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, if (ipl != NULL) { ASSERT((protocol == IPPROTO_AH) || (protocol == IPPROTO_ESP)); - ipr = &ipl->ipl_ref[protocol - IPPROTO_ESP]; + ipr = &ixa->ixa_ipsec_ref[protocol - IPPROTO_ESP]; retval = ipr->ipsr_sa; @@ -169,7 +154,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, ASSERT(actlist != NULL); need_unique = actlist->ipa_want_unique; - unique_id = SA_FORM_UNIQUE_ID(io); + unique_id = SA_FORM_UNIQUE_ID(ixa); /* * Precompute mask for SA flags comparison: If we need a @@ -332,7 +317,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, /* * Do labels match? 
*/ - if (!ipsec_label_match(cr, retval->ipsa_cred)) + if (!ipsec_label_match(tsl, retval->ipsa_tsl)) goto next_ipsa; /* @@ -451,10 +436,9 @@ next_ipsa: ipsec_latch_ids(ipl, retval->ipsa_src_cid, retval->ipsa_dst_cid); } - if (!ipl->ipl_out_action_latched) { + if (ixa->ixa_ipsec_action == NULL) { IPACT_REFHOLD(act); - ipl->ipl_out_action = act; - ipl->ipl_out_action_latched = B_TRUE; + ixa->ixa_ipsec_action = act; } } @@ -471,7 +455,7 @@ next_ipsa: retval->ipsa_flags |= IPSA_F_UNIQUE; retval->ipsa_unique_id = unique_id; retval->ipsa_unique_mask = SA_UNIQUE_MASK( - io->ipsec_out_src_port, io->ipsec_out_dst_port, + ixa->ixa_ipsec_src_port, ixa->ixa_ipsec_dst_port, protocol, 0); } @@ -581,45 +565,41 @@ ipsec_getassocbyspi(isaf_t *bucket, uint32_t spi, uint32_t *src, uint32_t *dst, } boolean_t -ipsec_outbound_sa(mblk_t *mp, uint_t proto) +ipsec_outbound_sa(mblk_t *data_mp, ip_xmit_attr_t *ixa, uint_t proto) { - mblk_t *data_mp; - ipsec_out_t *io; ipaddr_t dst; uint32_t *dst_ptr, *src_ptr; isaf_t *bucket; ipsa_t *assoc; - ip6_pkt_t ipp; + ip_pkt_t ipp; in6_addr_t dst6; ipsa_t **sa; sadbp_t *sadbp; sadb_t *sp; sa_family_t af; - cred_t *cr; - netstack_t *ns; + ip_stack_t *ipst = ixa->ixa_ipst; + netstack_t *ns = ipst->ips_netstack; - data_mp = mp->b_cont; - io = (ipsec_out_t *)mp->b_rptr; - ns = io->ipsec_out_ns; + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); if (proto == IPPROTO_ESP) { ipsecesp_stack_t *espstack; espstack = ns->netstack_ipsecesp; - sa = &io->ipsec_out_esp_sa; + sa = &ixa->ixa_ipsec_esp_sa; sadbp = &espstack->esp_sadb; } else { ipsecah_stack_t *ahstack; ASSERT(proto == IPPROTO_AH); ahstack = ns->netstack_ipsecah; - sa = &io->ipsec_out_ah_sa; + sa = &ixa->ixa_ipsec_ah_sa; sadbp = &ahstack->ah_sadb; } ASSERT(*sa == NULL); - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha_t *ipha = (ipha_t *)data_mp->b_rptr; ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); @@ -651,11 +631,9 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto) dst_ptr = (uint32_t 
*)&dst6; } - cr = msg_getcred(data_mp, NULL); - mutex_enter(&bucket->isaf_lock); - assoc = ipsec_getassocbyconn(bucket, io, src_ptr, dst_ptr, af, - proto, cr); + assoc = ipsec_getassocbyconn(bucket, ixa, src_ptr, dst_ptr, af, + proto, ixa->ixa_tsl); mutex_exit(&bucket->isaf_lock); if (assoc == NULL) @@ -674,17 +652,16 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto) /* * Inbound IPsec SA selection. + * Can return a pulled up mblk. + * When it returns non-NULL ahp is updated */ - -ah_t * -ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) +mblk_t * +ipsec_inbound_ah_sa(mblk_t *mp, ip_recv_attr_t *ira, ah_t **ahp) { - mblk_t *ipsec_in; ipha_t *ipha; ipsa_t *assoc; ah_t *ah; isaf_t *hptr; - ipsec_in_t *ii; boolean_t isv6; ip6_t *ip6h; int ah_offset; @@ -692,20 +669,13 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) int pullup_len; sadb_t *sp; sa_family_t af; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; IP_AH_BUMP_STAT(ipss, in_requests); - ASSERT(mp->b_datap->db_type == M_CTL); - - ipsec_in = mp; - ii = (ipsec_in_t *)ipsec_in->b_rptr; - mp = mp->b_cont; - - ASSERT(mp->b_datap->db_type == M_DATA); - - isv6 = !ii->ipsec_in_v4; + isv6 = !(ira->ira_flags & IRAF_IS_IPV4); if (isv6) { ip6h = (ip6_t *)mp->b_rptr; ah_offset = ipsec_ah_get_hdr_size_v6(mp, B_TRUE); @@ -729,7 +699,7 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) SL_WARN | SL_ERROR, "ipsec_inbound_ah_sa: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ipss->ipsec_dropper); return (NULL); @@ -763,11 +733,11 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) assoc->ipsa_state == IPSA_STATE_ACTIVE_ELSEWHERE) { IP_AH_BUMP_STAT(ipss, lookup_failure); IP_AH_BUMP_STAT(ipss, in_discards); - ipsecah_in_assocfailure(ipsec_in, 0, + ipsecah_in_assocfailure(mp, 0, SL_ERROR | SL_CONSOLE | 
SL_WARN, "ipsec_inbound_ah_sa: No association found for " "spi 0x%x, dst addr %s\n", - ah->ah_spi, dst_ptr, af, ahstack); + ah->ah_spi, dst_ptr, af, ira); if (assoc != NULL) { IPSA_REFRELE(assoc); } @@ -775,33 +745,44 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) } if (assoc->ipsa_state == IPSA_STATE_LARVAL && - sadb_set_lpkt(assoc, ipsec_in, ns)) { + sadb_set_lpkt(assoc, mp, ira)) { /* Not fully baked; swap the packet under a rock until then */ IPSA_REFRELE(assoc); return (NULL); } + /* Are the IPsec fields initialized at all? */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ira->ira_ipsec_action = NULL; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; + } + /* * Save a reference to the association so that it can * be retrieved after execution. We free any AH SA reference * already there (innermost SA "wins". The reference to * the SA will also be used later when doing the policy checks. */ - - if (ii->ipsec_in_ah_sa != NULL) { - IPSA_REFRELE(ii->ipsec_in_ah_sa); + if (ira->ira_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_ah_sa); } - ii->ipsec_in_ah_sa = assoc; + ira->ira_flags |= IRAF_IPSEC_SECURE; + ira->ira_ipsec_ah_sa = assoc; - return (ah); + *ahp = ah; + return (mp); } -esph_t * -ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) +/* + * Can return a pulled up mblk. 
+ * When it returns non-NULL esphp is updated + */ +mblk_t * +ipsec_inbound_esp_sa(mblk_t *data_mp, ip_recv_attr_t *ira, esph_t **esphp) { - mblk_t *data_mp, *placeholder; + mblk_t *placeholder; uint32_t *src_ptr, *dst_ptr; - ipsec_in_t *ii; ipha_t *ipha; ip6_t *ip6h; esph_t *esph; @@ -811,19 +792,13 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) sa_family_t af; boolean_t isv6; sadb_t *sp; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; IP_ESP_BUMP_STAT(ipss, in_requests); - ASSERT(ipsec_in_mp->b_datap->db_type == M_CTL); - - /* We have IPSEC_IN already! */ - ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - data_mp = ipsec_in_mp->b_cont; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - isv6 = !ii->ipsec_in_v4; + isv6 = !(ira->ira_flags & IRAF_IS_IPV4); if (isv6) { ip6h = (ip6_t *)data_mp->b_rptr; } else { @@ -841,17 +816,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) * actual packet length. */ if (data_mp->b_datap->db_ref > 1 || - (data_mp->b_wptr - data_mp->b_rptr) < - (isv6 ? (ntohs(ip6h->ip6_plen) + sizeof (ip6_t)) - : ntohs(ipha->ipha_length))) { + (data_mp->b_wptr - data_mp->b_rptr) < ira->ira_pktlen) { placeholder = msgpullup(data_mp, -1); if (placeholder == NULL) { IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &ipss->ipsec_dropper); return (NULL); @@ -859,7 +828,6 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) /* Reset packet with new pulled up mblk. */ freemsg(data_mp); data_mp = placeholder; - ipsec_in_mp->b_cont = data_mp; } } @@ -904,11 +872,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) /* This is a loggable error! AUDIT ME! 
*/ IP_ESP_BUMP_STAT(ipss, lookup_failure); IP_ESP_BUMP_STAT(ipss, in_discards); - ipsecesp_in_assocfailure(ipsec_in_mp, 0, + ipsecesp_in_assocfailure(data_mp, 0, SL_ERROR | SL_CONSOLE | SL_WARN, "ipsec_inbound_esp_sa: No association found for " "spi 0x%x, dst addr %s\n", - esph->esph_spi, dst_ptr, af, espstack); + esph->esph_spi, dst_ptr, af, ira); if (ipsa != NULL) { IPSA_REFRELE(ipsa); } @@ -916,22 +884,31 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) } if (ipsa->ipsa_state == IPSA_STATE_LARVAL && - sadb_set_lpkt(ipsa, ipsec_in_mp, ns)) { + sadb_set_lpkt(ipsa, data_mp, ira)) { /* Not fully baked; swap the packet under a rock until then */ IPSA_REFRELE(ipsa); return (NULL); } + /* Are the IPsec fields initialized at all? */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ira->ira_ipsec_action = NULL; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; + } + /* * Save a reference to the association so that it can * be retrieved after execution. We free any AH SA reference * already there (innermost SA "wins". The reference to * the SA will also be used later when doing the policy checks. 
*/ - if (ii->ipsec_in_esp_sa != NULL) { - IPSA_REFRELE(ii->ipsec_in_esp_sa); + if (ira->ira_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_esp_sa); } - ii->ipsec_in_esp_sa = ipsa; + ira->ira_flags |= IRAF_IPSEC_SECURE; + ira->ira_ipsec_esp_sa = ipsa; - return (esph); + *esphp = esph; + return (data_mp); } diff --git a/usr/src/uts/common/inet/ip/ip_srcid.c b/usr/src/uts/common/inet/ip/ip_srcid.c index 949508a796..f6507d6413 100644 --- a/usr/src/uts/common/inet/ip/ip_srcid.c +++ b/usr/src/uts/common/inet/ip/ip_srcid.c @@ -101,11 +101,7 @@ #include <netinet/ip_mroute.h> #include <inet/ipclassifier.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> #include <sys/kmem.h> -#include <inet/ipsec_impl.h> static uint_t srcid_nextid(ip_stack_t *); static srcid_map_t **srcid_lookup_addr(const in6_addr_t *addr, @@ -239,7 +235,7 @@ ip_srcid_find_id(uint_t id, in6_addr_t *addr, zoneid_t zoneid, rw_enter(&ipst->ips_srcid_lock, RW_READER); smpp = srcid_lookup_id(id, ipst); smp = *smpp; - if (smp == NULL || smp->sm_zoneid != zoneid) { + if (smp == NULL || (smp->sm_zoneid != zoneid && zoneid != ALL_ZONES)) { /* Not preset */ ip1dbg(("ip_srcid_find_id: unknown %u or in wrong zone\n", id)); *addr = ipv6_all_zeros; @@ -290,7 +286,7 @@ srcid_lookup_addr(const in6_addr_t *addr, zoneid_t zoneid, ip_stack_t *ipst) smpp = &ipst->ips_srcid_head; while (*smpp != NULL) { if (IN6_ARE_ADDR_EQUAL(&(*smpp)->sm_addr, addr) && - zoneid == (*smpp)->sm_zoneid) + (zoneid == (*smpp)->sm_zoneid || zoneid == ALL_ZONES)) return (smpp); smpp = &(*smpp)->sm_next; } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 45683ec967..31fa14b4af 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -52,16 +52,12 @@ * asynchronous and the reference protects the connection from being destroyed * before its processing is finished). 
* - * send and receive functions are currently used for TCP only. The send function - * determines the IP entry point for the packet once it leaves TCP to be sent to - * the destination address. The receive function is used by IP when the packet - * should be passed for TCP processing. When a new connection is created these - * are set to ip_output() and tcp_input() respectively. During the lifetime of - * the connection the send and receive functions may change depending on the - * changes in the connection state. For example, Once the connection is bound to - * an addresse, the receive function for this connection is set to - * tcp_conn_request(). This allows incoming SYNs to go directly into the - * listener SYN processing function without going to tcp_input() first. + * conn_recv is used to pass up packets to the ULP. + * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for + * a listener, and changes to tcp_input_listener as the listener has picked a + * good squeue. For other cases it is set to tcp_input_data. + * + * conn_recvicmp is used to pass up ICMP errors to the ULP. * * Classifier uses several hash tables: * @@ -91,8 +87,8 @@ * Connection Lookup: * ------------------ * - * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) - * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) + * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) + * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) * * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if * it can't find any associated connection. If the connection is found, its @@ -107,9 +103,12 @@ * hdr_len: The size of IP header. It is used to find TCP or UDP header in * the packet. * - * zoneid: The zone in which the returned connection must be; the zoneid - * corresponding to the ire_zoneid on the IRE located for the - * packet's destination address. 
+ * ira->ira_zoneid: The zone in which the returned connection must be; the + * zoneid corresponding to the ire_zoneid on the IRE located for + * the packet's destination address. + * + * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and + * IRAF_TX_SHARED_ADDR flags * * For TCP connections, the lookup order is as follows: * 5-tuple {src, dst, protocol, local port, remote port} @@ -156,7 +155,7 @@ * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the * receiver's label must dominate the sender's default label. * - * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack); + * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack); * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, * ip_stack); * @@ -184,34 +183,26 @@ * Table Updates * ------------- * - * int ipcl_conn_insert(connp, protocol, src, dst, ports) - * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex) + * int ipcl_conn_insert(connp); + * int ipcl_conn_insert_v4(connp); + * int ipcl_conn_insert_v6(connp); * * Insert 'connp' in the ipcl_conn_fanout. * Arguements : * connp conn_t to be inserted - * protocol connection protocol - * src source address - * dst destination address - * ports local and remote port - * ifindex interface index for IPv6 connections * * Return value : * 0 if connp was inserted * EADDRINUSE if the connection with the same tuple * already exists. * - * int ipcl_bind_insert(connp, protocol, src, lport); - * int ipcl_bind_insert_v6(connp, protocol, src, lport); + * int ipcl_bind_insert(connp); + * int ipcl_bind_insert_v4(connp); + * int ipcl_bind_insert_v6(connp); * * Insert 'connp' in ipcl_bind_fanout. 
* Arguements : * connp conn_t to be inserted - * protocol connection protocol - * src source address connection wants - * to bind to - * lport local port connection wants to - * bind to * * * void ipcl_hash_remove(connp); @@ -261,6 +252,8 @@ #include <netinet/icmp6.h> #include <inet/ip.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> #include <inet/ip_impl.h> @@ -280,19 +273,6 @@ #include <sys/tsol/tnet.h> #include <sys/sockio.h> -#ifdef DEBUG -#define IPCL_DEBUG -#else -#undef IPCL_DEBUG -#endif - -#ifdef IPCL_DEBUG -int ipcl_debug_level = 0; -#define IPCL_DEBUG_LVL(level, args) \ - if (ipcl_debug_level & level) { printf args; } -#else -#define IPCL_DEBUG_LVL(level, args) {; } -#endif /* Old value for compatibility. Setable in /etc/system */ uint_t tcp_conn_hash_size = 0; @@ -336,10 +316,8 @@ typedef union itc_s { struct kmem_cache *tcp_conn_cache; struct kmem_cache *ip_conn_cache; -struct kmem_cache *ip_helper_stream_cache; extern struct kmem_cache *sctp_conn_cache; extern struct kmem_cache *tcp_sack_info_cache; -extern struct kmem_cache *tcp_iphc_cache; struct kmem_cache *udp_conn_cache; struct kmem_cache *rawip_conn_cache; struct kmem_cache *rts_conn_cache; @@ -362,34 +340,6 @@ static void rawip_conn_destructor(void *, void *); static int rts_conn_constructor(void *, void *, int); static void rts_conn_destructor(void *, void *); -static int ip_helper_stream_constructor(void *, void *, int); -static void ip_helper_stream_destructor(void *, void *); - -boolean_t ip_use_helper_cache = B_TRUE; - -/* - * Hook functions to enable cluster networking - * On non-clustered systems these vectors must always be NULL. 
- */ -extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, void *); -extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, void *); - -#ifdef IPCL_DEBUG -#define INET_NTOA_BUFSIZE 18 - -static char * -inet_ntoa_r(uint32_t in, char *b) -{ - unsigned char *p; - - p = (unsigned char *)∈ - (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); - return (b); -} -#endif - /* * Global (for all stack instances) init routine */ @@ -420,15 +370,6 @@ ipcl_g_init(void) sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, rts_conn_constructor, rts_conn_destructor, NULL, NULL, NULL, 0); - - if (ip_use_helper_cache) { - ip_helper_stream_cache = kmem_cache_create - ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), - CACHE_ALIGN_SIZE, ip_helper_stream_constructor, - ip_helper_stream_destructor, NULL, NULL, NULL, 0); - } else { - ip_helper_stream_cache = NULL; - } } /* @@ -493,10 +434,10 @@ ipcl_init(ip_stack_t *ipst) MUTEX_DEFAULT, NULL); } - ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * + ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * sizeof (connf_t), KM_SLEEP); for (i = 0; i < IPPROTO_MAX; i++) { - mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, + mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, MUTEX_DEFAULT, NULL); } @@ -576,11 +517,12 @@ ipcl_destroy(ip_stack_t *ipst) ipst->ips_ipcl_bind_fanout = NULL; for (i = 0; i < IPPROTO_MAX; i++) { - ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); - mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); + ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); + mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); } - kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); - ipst->ips_ipcl_proto_fanout = NULL; + kmem_free(ipst->ips_ipcl_proto_fanout_v4, + IPPROTO_MAX * sizeof (connf_t)); + ipst->ips_ipcl_proto_fanout_v4 = NULL; for (i = 0; i < IPPROTO_MAX; 
i++) { ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); @@ -636,7 +578,6 @@ conn_t * ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) { conn_t *connp; - sctp_stack_t *sctps; struct kmem_cache *conn_cache; switch (type) { @@ -644,10 +585,10 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) return (NULL); sctp_conn_init(connp); - sctps = ns->netstack_sctp; - SCTP_G_Q_REFHOLD(sctps); netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; + ipcl_globalhash_insert(connp); return (connp); case IPCL_TCPCONN: @@ -681,6 +622,7 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) connp->conn_ref = 1; netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; ipcl_globalhash_insert(connp); return (connp); } @@ -693,61 +635,61 @@ ipcl_conn_destroy(conn_t *connp) ASSERT(!MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ref == 0); - ASSERT(connp->conn_ire_cache == NULL); DTRACE_PROBE1(conn__destroy, conn_t *, connp); - if (connp->conn_effective_cred != NULL) { - crfree(connp->conn_effective_cred); - connp->conn_effective_cred = NULL; - } - if (connp->conn_cred != NULL) { crfree(connp->conn_cred); connp->conn_cred = NULL; } + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + } + ip_pkt_free(&connp->conn_xmit_ipp); + ipcl_globalhash_remove(connp); - /* FIXME: add separate tcp_conn_free()? 
*/ + if (connp->conn_latch != NULL) { + IPLATCH_REFRELE(connp->conn_latch); + connp->conn_latch = NULL; + } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_policy != NULL) { + IPPH_REFRELE(connp->conn_policy, ns); + connp->conn_policy = NULL; + } + + if (connp->conn_ipsec_opt_mp != NULL) { + freemsg(connp->conn_ipsec_opt_mp); + connp->conn_ipsec_opt_mp = NULL; + } + if (connp->conn_flags & IPCL_TCPCONN) { - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps; - - ASSERT(tcp != NULL); - tcps = tcp->tcp_tcps; - if (tcps != NULL) { - if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, ns); - connp->conn_latch = NULL; - } - if (connp->conn_policy != NULL) { - IPPH_REFRELE(connp->conn_policy, ns); - connp->conn_policy = NULL; - } - tcp->tcp_tcps = NULL; - TCPS_REFRELE(tcps); - } + tcp_t *tcp = connp->conn_tcp; tcp_free(tcp); mp = tcp->tcp_timercache; - tcp->tcp_cred = NULL; + + tcp->tcp_tcps = NULL; if (tcp->tcp_sack_info != NULL) { bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); } - if (tcp->tcp_iphc != NULL) { - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc_len = 0; - } - ASSERT(tcp->tcp_iphc_len == 0); /* * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate @@ -759,17 +701,15 @@ ipcl_conn_destroy(conn_t *connp) mutex_destroy(&tcp->tcp_rsrv_mp_lock); } - ASSERT(connp->conn_latch == NULL); - ASSERT(connp->conn_policy == NULL); - + ipcl_conn_cleanup(connp); + connp->conn_flags = IPCL_TCPCONN; if (ns != NULL) { ASSERT(tcp->tcp_tcps == NULL); connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; 
netstack_rele(ns); } - ipcl_conn_cleanup(connp); - connp->conn_flags = IPCL_TCPCONN; bzero(tcp, sizeof (tcp_t)); tcp->tcp_timercache = mp; @@ -777,18 +717,6 @@ ipcl_conn_destroy(conn_t *connp) kmem_cache_free(tcp_conn_cache, connp); return; } - if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); - connp->conn_latch = NULL; - } - if (connp->conn_policy != NULL) { - IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); - connp->conn_policy = NULL; - } - if (connp->conn_ipsec_opt_mp != NULL) { - freemsg(connp->conn_ipsec_opt_mp); - connp->conn_ipsec_opt_mp = NULL; - } if (connp->conn_flags & IPCL_SCTPCONN) { ASSERT(ns != NULL); @@ -796,21 +724,21 @@ ipcl_conn_destroy(conn_t *connp) return; } + ipcl_conn_cleanup(connp); if (ns != NULL) { connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; netstack_rele(ns); } - ipcl_conn_cleanup(connp); - /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */ if (connp->conn_flags & IPCL_UDPCONN) { connp->conn_flags = IPCL_UDPCONN; kmem_cache_free(udp_conn_cache, connp); } else if (connp->conn_flags & IPCL_RAWIPCONN) { - connp->conn_flags = IPCL_RAWIPCONN; - connp->conn_ulp = IPPROTO_ICMP; + connp->conn_proto = IPPROTO_ICMP; + connp->conn_ixa->ixa_protocol = connp->conn_proto; kmem_cache_free(rawip_conn_cache, connp); } else if (connp->conn_flags & IPCL_RTSCONN) { connp->conn_flags = IPCL_RTSCONN; @@ -826,7 +754,6 @@ ipcl_conn_destroy(conn_t *connp) /* * Running in cluster mode - deregister listener information */ - static void ipcl_conn_unlisten(conn_t *connp) { @@ -837,12 +764,12 @@ ipcl_conn_unlisten(conn_t *connp) sa_family_t addr_family; uint8_t *laddrp; - if (connp->conn_pkt_isv6) { + if (connp->conn_ipversion == IPV6_VERSION) { addr_family = AF_INET6; - laddrp = (uint8_t *)&connp->conn_bound_source_v6; + laddrp = (uint8_t *)&connp->conn_bound_addr_v6; } else { addr_family = AF_INET; - laddrp = (uint8_t *)&connp->conn_bound_source; + laddrp = (uint8_t 
*)&connp->conn_bound_addr_v4; } (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); @@ -859,8 +786,6 @@ ipcl_conn_unlisten(conn_t *connp) connf_t *connfp = (connp)->conn_fanout; \ ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ if (connfp != NULL) { \ - IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ - (void *)(connp))); \ mutex_enter(&connfp->connf_lock); \ if ((connp)->conn_next != NULL) \ (connp)->conn_next->conn_prev = \ @@ -884,7 +809,11 @@ ipcl_conn_unlisten(conn_t *connp) void ipcl_hash_remove(conn_t *connp) { + uint8_t protocol = connp->conn_proto; + IPCL_HASH_REMOVE(connp); + if (protocol == IPPROTO_RSVP) + ill_set_inputfn_all(connp->conn_netstack->netstack_ip); } /* @@ -937,8 +866,6 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) } #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ - IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ - "connp %p", (void *)(connfp), (void *)(connp))); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ @@ -947,13 +874,11 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ conn_t *pconnp = NULL, *nconnp; \ - IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ - "connp %p", (void *)connfp, (void *)(connp))); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ nconnp = (connfp)->connf_head; \ while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { \ + !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ pconnp = nconnp; \ nconnp = nconnp->conn_next; \ } \ @@ -977,16 +902,14 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ conn_t **list, *prev, *next; \ boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ - IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ - "connp %p", (void 
*)(connfp), (void *)(connp))); \ + IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ list = &(connfp)->connf_head; \ prev = NULL; \ while ((next = *list) != NULL) { \ if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ + IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ connp->conn_zoneid == next->conn_zoneid) { \ (connp)->conn_next = next; \ if (prev != NULL) \ @@ -1012,44 +935,13 @@ ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) IPCL_HASH_INSERT_WILDCARD(connfp, connp); } -void -ipcl_proto_insert(conn_t *connp, uint8_t protocol) -{ - connf_t *connfp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); - ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - protocol == IPPROTO_AH || protocol == IPPROTO_ESP); - - connp->conn_ulp = protocol; - - /* Insert it in the protocol hash */ - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; - IPCL_HASH_INSERT_WILDCARD(connfp, connp); -} - -void -ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) -{ - connf_t *connfp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); - ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - protocol == IPPROTO_AH || protocol == IPPROTO_ESP); - - connp->conn_ulp = protocol; - - /* Insert it in the Bind Hash */ - connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; - IPCL_HASH_INSERT_WILDCARD(connfp, connp); -} - /* * Because the classifier is used to classify inbound packets, the destination * address is meant to be our local tunnel address (tunnel source), and the * source the remote tunnel address (tunnel destination). + * + * Note that conn_proto can't be used for fanout since the upper protocol + * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
*/ conn_t * ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) @@ -1128,13 +1020,13 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) oconnp = oconnp->conn_next) { if (oconnp->conn_lport == lport && oconnp->conn_zoneid == connp->conn_zoneid && - oconnp->conn_af_isv6 == connp->conn_af_isv6 && - ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || - IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, - &connp->conn_srcv6))) { + oconnp->conn_family == connp->conn_family && + ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || + IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, + &connp->conn_laddr_v6))) { break; } } @@ -1142,10 +1034,10 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) if (oconnp != NULL) return (EADDRNOTAVAIL); - if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { - if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } else { IPCL_HASH_INSERT_BOUND(connfp, connp); @@ -1157,17 +1049,18 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) } static int -ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst, - ip_stack_t *ipst) +ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) { connf_t *connfp; conn_t *tconnp; + ipaddr_t laddr = connp->conn_laddr_v4; + ipaddr_t faddr = connp->conn_faddr_v4; - connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)]; + 
connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_IPTUN_MATCH(tconnp, src, dst)) { + if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { /* A tunnel is already bound to these addresses. */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1179,17 +1072,18 @@ ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst, } static int -ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src, - const in6_addr_t *dst, ip_stack_t *ipst) +ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) { connf_t *connfp; conn_t *tconnp; + in6_addr_t *laddr = &connp->conn_laddr_v6; + in6_addr_t *faddr = &connp->conn_faddr_v6; - connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)]; + connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) { + if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { /* A tunnel is already bound to these addresses. 
*/ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1213,12 +1107,12 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) connf_t *connfp; conn_t *tconn; - connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; mutex_enter(&connfp->connf_lock); for (tconn = connfp->connf_head; tconn != NULL; tconn = tconn->conn_next) { /* We don't allow v4 fallback for v6 raw socket */ - if (connp->conn_af_isv6 != tconn->conn_af_isv6) + if (connp->conn_family != tconn->conn_family) continue; /* If neither is exempt, then there's no conflict */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && @@ -1228,9 +1122,9 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) if (connp->conn_zoneid == tconn->conn_zoneid) continue; /* If both are bound to different specific addrs, ok */ - if (connp->conn_src != INADDR_ANY && - tconn->conn_src != INADDR_ANY && - connp->conn_src != tconn->conn_src) + if (connp->conn_laddr_v4 != INADDR_ANY && + tconn->conn_laddr_v4 != INADDR_ANY && + connp->conn_laddr_v4 != tconn->conn_laddr_v4) continue; /* These two conflict; fail */ break; @@ -1245,12 +1139,12 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) connf_t *connfp; conn_t *tconn; - connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; + connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; mutex_enter(&connfp->connf_lock); for (tconn = connfp->connf_head; tconn != NULL; tconn = tconn->conn_next) { /* We don't allow v4 fallback for v6 raw socket */ - if (connp->conn_af_isv6 != tconn->conn_af_isv6) + if (connp->conn_family != tconn->conn_family) continue; /* If neither is exempt, then there's no conflict */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && @@ -1260,9 +1154,10 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) if (connp->conn_zoneid == tconn->conn_zoneid) continue; /* If both are bound to different addrs, ok */ - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && - 
!IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && - !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && + !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &tconn->conn_laddr_v6)) continue; /* These two conflict; fail */ break; @@ -1273,28 +1168,29 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) /* * (v4, v6) bind hash insertion routines + * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) */ + +int +ipcl_bind_insert(conn_t *connp) +{ + if (connp->conn_ipversion == IPV6_VERSION) + return (ipcl_bind_insert_v6(connp)); + else + return (ipcl_bind_insert_v4(connp)); +} + int -ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) +ipcl_bind_insert_v4(conn_t *connp) { connf_t *connfp; -#ifdef IPCL_DEBUG - char buf[INET_NTOA_BUFSIZE]; -#endif int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp); - - IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " - "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); - - connp->conn_ulp = protocol; - IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); - connp->conn_lport = lport; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst)); + return (ipcl_iptun_hash_insert(connp, ipst)); switch (protocol) { default: @@ -1304,45 +1200,40 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) /* FALLTHROUGH */ case IPPROTO_UDP: if (protocol == IPPROTO_UDP) { - IPCL_DEBUG_LVL(64, - ("ipcl_bind_insert: connp %p - udp\n", - (void *)connp)); connfp = &ipst->ips_ipcl_udp_fanout[ IPCL_UDP_HASH(lport, ipst)]; } else { - IPCL_DEBUG_LVL(64, - ("ipcl_bind_insert: connp %p - protocol\n", - (void *)connp)); - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = 
&ipst->ips_ipcl_proto_fanout_v4[protocol]; } - if (connp->conn_rem != INADDR_ANY) { + if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (connp->conn_src != INADDR_ANY) { + } else if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } + if (protocol == IPPROTO_RSVP) + ill_set_inputfn_all(ipst); break; case IPPROTO_TCP: - /* Insert it in the Bind Hash */ ASSERT(connp->conn_zoneid != ALL_ZONES); connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; - if (connp->conn_src != INADDR_ANY) { + if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } if (cl_inet_listen != NULL) { - ASSERT(!connp->conn_pkt_isv6); + ASSERT(connp->conn_ipversion == IPV4_VERSION); connp->conn_flags |= IPCL_CL_LISTENER; (*cl_inet_listen)( connp->conn_netstack->netstack_stackid, IPPROTO_TCP, AF_INET, - (uint8_t *)&connp->conn_bound_source, lport, NULL); + (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); } break; @@ -1355,20 +1246,16 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) } int -ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, - uint16_t lport) +ipcl_bind_insert_v6(conn_t *connp) { connf_t *connfp; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); connp->conn_ulp = protocol; - connp->conn_srcv6 = *src; - connp->conn_lport = lport; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) { - return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros, - ipst)); + return (ipcl_iptun_hash_insert_v6(connp, ipst)); } switch (protocol) { @@ -1379,21 +1266,15 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, /* FALLTHROUGH */ case IPPROTO_UDP: if (protocol == IPPROTO_UDP) { - IPCL_DEBUG_LVL(128, - 
("ipcl_bind_insert_v6: connp %p - udp\n", - (void *)connp)); connfp = &ipst->ips_ipcl_udp_fanout[ IPCL_UDP_HASH(lport, ipst)]; } else { - IPCL_DEBUG_LVL(128, - ("ipcl_bind_insert_v6: connp %p - protocol\n", - (void *)connp)); connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; } - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1401,13 +1282,11 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, break; case IPPROTO_TCP: - /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */ - /* Insert it in the Bind Hash */ ASSERT(connp->conn_zoneid != ALL_ZONES); connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1416,13 +1295,13 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, sa_family_t addr_family; uint8_t *laddrp; - if (connp->conn_pkt_isv6) { + if (connp->conn_ipversion == IPV6_VERSION) { addr_family = AF_INET6; laddrp = - (uint8_t *)&connp->conn_bound_source_v6; + (uint8_t *)&connp->conn_bound_addr_v6; } else { addr_family = AF_INET; - laddrp = (uint8_t *)&connp->conn_bound_source; + laddrp = (uint8_t *)&connp->conn_bound_addr_v4; } connp->conn_flags |= IPCL_CL_LISTENER; (*cl_inet_listen)( @@ -1441,43 +1320,35 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, /* * ipcl_conn_hash insertion routines. + * The caller has already set conn_proto and the addresses/ports in the conn_t. 
*/ + int -ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, - ipaddr_t rem, uint32_t ports) +ipcl_conn_insert(conn_t *connp) +{ + if (connp->conn_ipversion == IPV6_VERSION) + return (ipcl_conn_insert_v6(connp)); + else + return (ipcl_conn_insert_v4(connp)); +} + +int +ipcl_conn_insert_v4(conn_t *connp) { connf_t *connfp; - uint16_t *up; conn_t *tconnp; -#ifdef IPCL_DEBUG - char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; -#endif - in_port_t lport; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " - "dst = %s, ports = %x, protocol = %x", (void *)connp, - inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), - ports, protocol)); + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert(connp, src, rem, ipst)); + return (ipcl_iptun_hash_insert(connp, ipst)); switch (protocol) { case IPPROTO_TCP: - if (!(connp->conn_flags & IPCL_EAGER)) { - /* - * for a eager connection, i.e connections which - * have just been created, the initialization is - * already done in ip at conn_creation time, so - * we can skip the checks here. - */ - IPCL_CONN_INIT(connp, protocol, src, rem, ports); - } - /* - * For tcp, we check whether the connection tuple already + * For TCP, we check whether the connection tuple already * exists before allowing the connection to proceed. We * also allow indexing on the zoneid. This is to allow * multiple shared stack zones to have the same tcp @@ -1486,16 +1357,15 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, * doesn't have to be unique. 
*/ connfp = &ipst->ips_ipcl_conn_fanout[ - IPCL_CONN_HASH(connp->conn_rem, + IPCL_CONN_HASH(connp->conn_faddr_v4, connp->conn_ports, ipst)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp, - connp->conn_rem, connp->conn_src, - connp->conn_ports)) && - (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { - + if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, + connp->conn_faddr_v4, connp->conn_laddr_v4, + connp->conn_ports) && + IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { /* Already have a conn. bail out */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1512,6 +1382,7 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, } ASSERT(connp->conn_recv != NULL); + ASSERT(connp->conn_recvicmp != NULL); IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); mutex_exit(&connfp->connf_lock); @@ -1523,7 +1394,6 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, * from the hash first. 
*/ IPCL_HASH_REMOVE(connp); - lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); ret = ipcl_sctp_hash_insert(connp, lport); break; @@ -1540,18 +1410,16 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, /* FALLTHROUGH */ case IPPROTO_UDP: - up = (uint16_t *)&ports; - IPCL_CONN_INIT(connp, protocol, src, rem, ports); if (protocol == IPPROTO_UDP) { connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(up[1], ipst)]; + IPCL_UDP_HASH(lport, ipst)]; } else { - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; } - if (connp->conn_rem != INADDR_ANY) { + if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (connp->conn_src != INADDR_ANY) { + } else if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1563,25 +1431,21 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, } int -ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, - const in6_addr_t *rem, uint32_t ports, uint_t ifindex) +ipcl_conn_insert_v6(conn_t *connp) { connf_t *connfp; - uint16_t *up; conn_t *tconnp; - in_port_t lport; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; + uint_t ifindex = connp->conn_bound_if; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst)); + return (ipcl_iptun_hash_insert_v6(connp, ipst)); switch (protocol) { case IPPROTO_TCP: - /* Just need to insert a conn struct */ - if (!(connp->conn_flags & IPCL_EAGER)) { - IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); - } /* * For tcp, we check whether the connection tuple already @@ -1593,17 +1457,18 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, * doesn't have to be unique. 
*/ connfp = &ipst->ips_ipcl_conn_fanout[ - IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, + IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, ipst)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, - connp->conn_remv6, connp->conn_srcv6, + /* NOTE: need to match zoneid. Bug in onnv-gate */ + if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, + connp->conn_faddr_v6, connp->conn_laddr_v6, connp->conn_ports) && - (tconnp->conn_tcp->tcp_bound_if == 0 || - tconnp->conn_tcp->tcp_bound_if == ifindex) && - (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { + (tconnp->conn_bound_if == 0 || + tconnp->conn_bound_if == ifindex) && + IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { /* Already have a conn. bail out */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1624,7 +1489,6 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, case IPPROTO_SCTP: IPCL_HASH_REMOVE(connp); - lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); ret = ipcl_sctp_hash_insert(connp, lport); break; @@ -1634,18 +1498,16 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, return (EADDRINUSE); /* FALLTHROUGH */ case IPPROTO_UDP: - up = (uint16_t *)&ports; - IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); if (protocol == IPPROTO_UDP) { connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(up[1], ipst)]; + IPCL_UDP_HASH(lport, ipst)]; } else { connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; } - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1667,8 +1529,8 @@ ipcl_conn_insert_v6(conn_t 
*connp, uint8_t protocol, const in6_addr_t *src, * zone, then label checks are omitted. */ conn_t * -ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, - ip_stack_t *ipst) +ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, + ip_recv_attr_t *ira, ip_stack_t *ipst) { ipha_t *ipha; connf_t *connfp, *bind_connfp; @@ -1677,8 +1539,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, uint32_t ports; conn_t *connp; uint16_t *up; - boolean_t shared_addr; - boolean_t unlabeled; + zoneid_t zoneid = ira->ira_zoneid; ipha = (ipha_t *)mp->b_rptr; up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); @@ -1692,11 +1553,14 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if ((IPCL_CONN_MATCH(connp, protocol, - ipha->ipha_src, ipha->ipha_dst, ports)) && - (IPCL_ZONE_MATCH(connp, zoneid))) { + if (IPCL_CONN_MATCH(connp, protocol, + ipha->ipha_src, ipha->ipha_dst, ports) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; - } } if (connp != NULL) { @@ -1713,48 +1577,19 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } mutex_exit(&connfp->connf_lock); - lport = up[1]; - unlabeled = B_FALSE; - /* Cred cannot be null on IPv4 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - unlabeled = (crgetlabel(cr)->tsl_flags & - TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. 
- */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; mutex_enter(&bind_connfp->connf_lock); for (connp = bind_connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, - lport) && (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + lport) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } @@ -1762,16 +1597,17 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * If the matching connection is SLP on a private address, then * the label on the packet must match the local zone's label. * Otherwise, it must be in the label range defined by tnrh. - * This is ensured by tsol_receive_label. + * This is ensured by tsol_receive_local. + * + * Note that we don't check tsol_receive_local for + * the connected case. 
*/ - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, - shared_addr, connp)) { - DTRACE_PROBE3( - tx__ip__log__info__classify__tcp, - char *, - "connp(1) could not receive mp(2)", - conn_t *, connp, mblk_t *, mp); + ira, connp)) { + DTRACE_PROBE3(tx__ip__log__info__classify__tcp, + char *, "connp(1) could not receive mp(2)", + conn_t *, connp, mblk_t *, mp); connp = NULL; } @@ -1783,61 +1619,27 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } mutex_exit(&bind_connfp->connf_lock); - - IPCL_DEBUG_LVL(512, - ("ipcl_classify: couldn't classify mp = %p\n", - (void *)mp)); break; case IPPROTO_UDP: lport = up[1]; - unlabeled = B_FALSE; - /* Cred cannot be null on IPv4 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - unlabeled = (crgetlabel(cr)->tsl_flags & - TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. 
- */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } fport = up[0]; - IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, fport, ipha->ipha_src) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__udp, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -1854,9 +1656,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * We shouldn't come here for multicast/broadcast packets */ mutex_exit(&connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify: cant find udp conn_t for ports : %x %x", - lport, fport)); + break; case IPPROTO_ENCAP: @@ -1869,26 +1669,25 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } conn_t * -ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, - ip_stack_t *ipst) +ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, + ip_recv_attr_t *ira, ip_stack_t *ipst) { ip6_t *ip6h; connf_t *connfp, *bind_connfp; uint16_t lport; uint16_t fport; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t ports; conn_t *connp; uint16_t *up; - boolean_t shared_addr; - boolean_t unlabeled; + zoneid_t zoneid = ira->ira_zoneid; ip6h = (ip6_t *)mp->b_rptr; switch (protocol) { case 
IPPROTO_TCP: - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - up = (uint16_t *)tcph->th_lport; + tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; + up = &tcpha->tha_lport; ports = *(uint32_t *)up; connfp = @@ -1897,11 +1696,14 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if ((IPCL_CONN_MATCH_V6(connp, protocol, - ip6h->ip6_src, ip6h->ip6_dst, ports)) && - (IPCL_ZONE_MATCH(connp, zoneid))) { + if (IPCL_CONN_MATCH_V6(connp, protocol, + ip6h->ip6_src, ip6h->ip6_dst, ports) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; - } } if (connp != NULL) { @@ -1920,37 +1722,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_exit(&connfp->connf_lock); lport = up[1]; - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. 
- */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; mutex_enter(&bind_connfp->connf_lock); @@ -1958,15 +1729,17 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, connp = connp->conn_next) { if (IPCL_BIND_MATCH_V6(connp, protocol, ip6h->ip6_dst, lport) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -1977,72 +1750,33 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, /* Have a listner at least */ CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: found listner " - "connp = %p\n", (void *)connp)); - return (connp); } mutex_exit(&bind_connfp->connf_lock); - - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: couldn't classify mp = %p\n", - (void *)mp)); break; case IPPROTO_UDP: up = (uint16_t *)&mp->b_rptr[hdr_len]; lport = up[1]; - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. 
- */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - fport = up[0]; - IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, - fport)); connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, fport, ip6h->ip6_src) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__udp6, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -2059,9 +1793,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * We shouldn't come here for multicast/broadcast packets */ mutex_exit(&connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x", - lport, fport)); break; case IPPROTO_ENCAP: case IPPROTO_IPV6: @@ -2076,125 +1807,80 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * wrapper around 
ipcl_classify_(v4,v6) routines. */ conn_t * -ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) +ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - uint16_t hdr_len; - ipha_t *ipha; - uint8_t *nexthdrp; - - if (MBLKL(mp) < sizeof (ipha_t)) - return (NULL); - - switch (IPH_HDR_VERSION(mp->b_rptr)) { - case IPV4_VERSION: - ipha = (ipha_t *)mp->b_rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, - zoneid, ipst)); - case IPV6_VERSION: - if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, - &hdr_len, &nexthdrp)) - return (NULL); - - return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); + if (ira->ira_flags & IRAF_IS_IPV4) { + return (ipcl_classify_v4(mp, ira->ira_protocol, + ira->ira_ip_hdr_length, ira, ipst)); + } else { + return (ipcl_classify_v6(mp, ira->ira_protocol, + ira->ira_ip_hdr_length, ira, ipst)); } - - return (NULL); } +/* + * Only used to classify SCTP RAW sockets + */ conn_t * -ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, - uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) +ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) { connf_t *connfp; conn_t *connp; in_port_t lport; - int af; - boolean_t shared_addr; - boolean_t unlabeled; + int ipversion; const void *dst; + zoneid_t zoneid = ira->ira_zoneid; lport = ((uint16_t *)&ports)[1]; - - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. 
In that case, we assume it's SLP, and search for - * the zone based on the packet label. - * - * If there is such a zone, we prefer to find a connection in - * it. Otherwise, we look for a MAC-exempt connection in any - * zone whose label dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; + if (ira->ira_flags & IRAF_IS_IPV4) { + dst = (const void *)&ipha->ipha_dst; + ipversion = IPV4_VERSION; + } else { + dst = (const void *)&ip6h->ip6_dst; + ipversion = IPV6_VERSION; } - af = IPH_HDR_VERSION(hdr); - dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : - (const void *)&((ip6_t *)hdr)->ip6_dst; connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; - mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { /* We don't allow v4 fallback for v6 raw socket. */ - if (af == (connp->conn_af_isv6 ? IPV4_VERSION : - IPV6_VERSION)) + if (ipversion != connp->conn_ipversion) continue; - if (connp->conn_fully_bound) { - if (af == IPV4_VERSION) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + if (ipversion == IPV4_VERSION) { if (!IPCL_CONN_MATCH(connp, protocol, - hdr->ipha_src, hdr->ipha_dst, ports)) + ipha->ipha_src, ipha->ipha_dst, ports)) continue; } else { if (!IPCL_CONN_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_src, - ((ip6_t *)hdr)->ip6_dst, ports)) + ip6h->ip6_src, ip6h->ip6_dst, ports)) continue; } } else { - if (af == IPV4_VERSION) { + if (ipversion == IPV4_VERSION) { if (!IPCL_BIND_MATCH(connp, protocol, - hdr->ipha_dst, lport)) + ipha->ipha_dst, lport)) continue; } else { if (!IPCL_BIND_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_dst, lport)) + ip6h->ip6_dst, lport)) continue; } } - if (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && - (connp->conn_mac_mode != CONN_MAC_DEFAULT) && - shared_addr)) + if (connp->conn_zoneid == 
zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR))) break; } - /* - * If the connection is fully-bound and connection-oriented (TCP or - * SCTP), then we've already validated the remote system's label. - * There's no need to do it again for every packet. - */ - if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || - !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && - !tsol_receive_local(mp, dst, af, shared_addr, connp)) { + + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_receive_local(mp, dst, ipversion, ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__rawip, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -2205,22 +1891,22 @@ ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, goto found; mutex_exit(&connfp->connf_lock); - /* Try to look for a wildcard match. */ + /* Try to look for a wildcard SCTP RAW socket match. */ connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { /* We don't allow v4 fallback for v6 raw socket. */ - if ((af == (connp->conn_af_isv6 ? 
IPV4_VERSION : - IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { + if (ipversion != connp->conn_ipversion) continue; - } - if (af == IPV4_VERSION) { - if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) + if (!IPCL_ZONE_MATCH(connp, zoneid)) + continue; + + if (ipversion == IPV4_VERSION) { + if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) break; } else { - if (IPCL_RAW_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_dst)) { + if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { break; } } @@ -2253,11 +1939,23 @@ tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); - tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); + tcp->tcp_timercache = tcp_timermp_alloc(kmflags); + if (tcp->tcp_timercache == NULL) + return (ENOMEM); connp->conn_tcp = tcp; connp->conn_flags = IPCL_TCPCONN; - connp->conn_ulp = IPPROTO_TCP; + connp->conn_proto = IPPROTO_TCP; tcp->tcp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) { + tcp_timermp_free(tcp); + return (ENOMEM); + } + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2276,6 +1974,15 @@ tcp_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); cv_destroy(&connp->conn_sq_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2289,7 +1996,13 @@ ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 
mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_flags = IPCL_IPCCONN; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2304,6 +2017,15 @@ ip_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_priv == NULL); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2321,8 +2043,15 @@ udp_conn_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_udp = udp; connp->conn_flags = IPCL_UDPCONN; - connp->conn_ulp = IPPROTO_UDP; + connp->conn_proto = IPPROTO_UDP; udp->udp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2339,6 +2068,15 @@ udp_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_udp == udp); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2356,8 +2094,15 @@ 
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_icmp = icmp; connp->conn_flags = IPCL_RAWIPCONN; - connp->conn_ulp = IPPROTO_ICMP; + connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2374,6 +2119,15 @@ rawip_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_icmp == icmp); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2392,6 +2146,12 @@ rts_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_rts = rts; connp->conn_flags = IPCL_RTSCONN; rts->rts_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2408,71 +2168,35 @@ rts_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_rts == rts); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); -} + rw_destroy(&connp->conn_ilg_lock); -/* ARGSUSED */ -int -ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) -{ - int error; - netstack_t *ns; - int ret; - tcp_stack_t *tcps; - ip_helper_stream_info_t *ip_helper_str; - ip_stack_t *ipst; - - ns = netstack_find_by_cred(kcred); - ASSERT(ns != 
NULL); - tcps = ns->netstack_tcp; - ipst = ns->netstack_ip; - ASSERT(tcps != NULL); - ip_helper_str = (ip_helper_stream_info_t *)buf; - - do { - error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, - &ip_helper_str->iphs_handle, ipst->ips_ldi_ident); - } while (error == EINTR); - - if (error == 0) { - do { - error = ldi_ioctl( - ip_helper_str->iphs_handle, SIOCSQPTR, - (intptr_t)buf, FKIOCTL, kcred, &ret); - } while (error == EINTR); - - if (error != 0) { - (void) ldi_close( - ip_helper_str->iphs_handle, 0, kcred); - } + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); } - - netstack_rele(ipst->ips_netstack); - - return (error); } -/* ARGSUSED */ -static void -ip_helper_stream_destructor(void *buf, void *cdrarg) -{ - ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; - - ip_helper_str->iphs_rq->q_ptr = - ip_helper_str->iphs_wq->q_ptr = - ip_helper_str->iphs_minfo; - (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred); -} - - /* * Called as part of ipcl_conn_destroy to assert and clear any pointers * in the conn_t. + * + * Below we list all the pointers in the conn_t as a documentation aid. + * The ones that we can not ASSERT to be NULL are #ifdef'ed out. + * If you add any pointers to the conn_t please add an ASSERT here + * and #ifdef it out if it can't be actually asserted to be NULL. + * In any case, we bzero most of the conn_t at the end of the function. 
*/ void ipcl_conn_cleanup(conn_t *connp) { - ASSERT(connp->conn_ire_cache == NULL); + ip_xmit_attr_t *ixa; + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_latch_in_policy == NULL); + ASSERT(connp->conn_latch_in_action == NULL); #ifdef notdef ASSERT(connp->conn_rq == NULL); ASSERT(connp->conn_wq == NULL); @@ -2485,18 +2209,6 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_fanout == NULL); ASSERT(connp->conn_next == NULL); ASSERT(connp->conn_prev == NULL); -#ifdef notdef - /* - * The ill and ipif pointers are not cleared before the conn_t - * goes away since they do not hold a reference on the ill/ipif. - * We should replace these pointers with ifindex/ipaddr_t to - * make the code less complex. - */ - ASSERT(connp->conn_outgoing_ill == NULL); - ASSERT(connp->conn_incoming_ill == NULL); - ASSERT(connp->conn_multicast_ipif == NULL); - ASSERT(connp->conn_multicast_ill == NULL); -#endif ASSERT(connp->conn_oper_pending_ill == NULL); ASSERT(connp->conn_ilg == NULL); ASSERT(connp->conn_drain_next == NULL); @@ -2506,10 +2218,19 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_idl == NULL); #endif ASSERT(connp->conn_ipsec_opt_mp == NULL); - ASSERT(connp->conn_effective_cred == NULL); +#ifdef notdef + /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ ASSERT(connp->conn_netstack == NULL); +#endif ASSERT(connp->conn_helper_info == NULL); + ASSERT(connp->conn_ixa != NULL); + ixa = connp->conn_ixa; + ASSERT(ixa->ixa_refcnt == 1); + /* Need to preserve ixa_protocol */ + ixa_cleanup(ixa); + ixa->ixa_flags = 0; + /* Clear out the conn_t fields that are not preserved */ bzero(&connp->conn_start_clr, sizeof (conn_t) - @@ -2602,10 +2323,11 @@ ipcl_globalhash_remove(conn_t *connp) /* * Walk the list of all conn_t's in the system, calling the function provided - * with the specified argument for each. + * With the specified argument for each. * Applies to both IPv4 and IPv6. * - * IPCs may hold pointers to ipif/ill. 
To guard against stale pointers + * CONNs may hold pointers to ills (conn_dhcpinit_ill and + * conn_oper_pending_ill). To guard against stale pointers * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is * unplumbed or removed. New conn_t's that are created while we are walking * may be missed by this walk, because they are not necessarily inserted @@ -2657,7 +2379,7 @@ ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) * (peer tcp in ESTABLISHED state). */ conn_t * -ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, +ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, ip_stack_t *ipst) { uint32_t ports; @@ -2675,8 +2397,8 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || ipha->ipha_dst == htonl(INADDR_LOOPBACK)); - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports, ipst)]; @@ -2707,7 +2429,7 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, * (peer tcp in ESTABLISHED state). 
*/ conn_t * -ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, +ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, ip_stack_t *ipst) { uint32_t ports; @@ -2728,8 +2450,8 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports, ipst)]; @@ -2738,7 +2460,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - /* We skip tcp_bound_if check here as this is loopback tcp */ + /* We skip conn_bound_if check here as this is loopback tcp */ if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, ip6h->ip6_dst, ip6h->ip6_src, ports) && tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && @@ -2760,7 +2482,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, * Only checks for connected entries i.e. no INADDR_ANY checks. 
*/ conn_t * -ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, +ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, ip_stack_t *ipst) { uint32_t ports; @@ -2769,8 +2491,8 @@ ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, conn_t *tconnp; pports = (uint16_t *)&ports; - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports, ipst)]; @@ -2823,8 +2545,8 @@ ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, ip6h->ip6_dst, ip6h->ip6_src, ports) && tcp->tcp_state >= min_state && - (tcp->tcp_bound_if == 0 || - tcp->tcp_bound_if == ifindex)) { + (tconnp->conn_bound_if == 0 || + tconnp->conn_bound_if == ifindex)) { CONN_INC_REF(tconnp); mutex_exit(&connfp->connf_lock); @@ -2901,8 +2623,8 @@ ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, tcp = connp->conn_tcp; if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && IPCL_ZONE_MATCH(connp, zoneid) && - (tcp->tcp_bound_if == 0 || - tcp->tcp_bound_if == ifindex) && + (connp->conn_bound_if == 0 || + connp->conn_bound_if == ifindex) && tcp->tcp_listener == NULL) { CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); diff --git a/usr/src/uts/common/inet/ip/ipdrop.c b/usr/src/uts/common/inet/ip/ipdrop.c index 6d08ec9d60..0f257d6cd2 100644 --- a/usr/src/uts/common/inet/ip/ipdrop.c +++ b/usr/src/uts/common/inet/ip/ipdrop.c @@ -29,11 +29,11 @@ #include <sys/sunddi.h> #include <sys/kstat.h> #include <sys/kmem.h> +#include <sys/sdt.h> #include <net/pfkeyv2.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> -#include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/ipdrop.h> @@ -246,16 +246,11 @@ ip_drop_unregister(ipdropper_t *ipd) * 
Actually drop a packet. Many things could happen here, but at the least, * the packet will be freemsg()ed. */ -/* ARGSUSED */ void -ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, - ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called) +ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *ill, + struct kstat_named *counter, ipdropper_t *who_called) { - mblk_t *ipsec_mp = NULL; - ipsec_in_t *ii = NULL; - ipsec_out_t *io = NULL; - ipsec_info_t *in; - uint8_t vers; + char *str; if (mp == NULL) { /* @@ -265,41 +260,7 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, return; } - if (DB_TYPE(mp) == M_CTL) { - in = (ipsec_info_t *)mp->b_rptr; - - if (in->ipsec_info_type == IPSEC_IN) - ii = (ipsec_in_t *)in; - else if (in->ipsec_info_type == IPSEC_OUT) - io = (ipsec_out_t *)in; - - /* See if this is an ICMP packet (check for v4/v6). */ - vers = (*mp->b_rptr) >> 4; - if (vers != IPV4_VERSION && vers != IPV6_VERSION) { - /* - * If not, it's some other sort of M_CTL to be freed. - * For now, treat it like an ordinary packet. - */ - ipsec_mp = mp; - mp = mp->b_cont; - } - } - - /* Reality checks */ - if (inbound && io != NULL) - cmn_err(CE_WARN, - "ip_drop_packet: inbound packet with IPSEC_OUT"); - - if (outbound_ire != NULL && ii != NULL) - cmn_err(CE_WARN, - "ip_drop_packet: outbound packet with IPSEC_IN"); - - /* At this point, mp always points to the data. */ - /* - * Can't make the assertion yet - It could be an inbound ICMP - * message, which is M_CTL but with data in it. - */ - /* ASSERT(mp->b_datap->db_type == M_DATA); */ + ASSERT(mp->b_datap->db_type == M_DATA); /* Increment the bean counter, if available. */ if (counter != NULL) { @@ -318,16 +279,22 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, break; /* Other types we can't handle for now. */ } - - /* TODO? Copy out kstat name for use in logging. */ } - /* TODO: log the packet details if logging is called for. 
*/ + if (counter != NULL) + str = counter->name; + else if (who_called != NULL) + str = who_called->ipd_name; + else + str = "Unspecified IPsec drop"; + + if (inbound) + ip_drop_input(str, mp, ill); + else + ip_drop_output(str, mp, ill); + /* TODO: queue the packet onto a snoop-friendly queue. */ - /* If I haven't queued the packet or some such nonsense, free it. */ - if (ipsec_mp != NULL) - freeb(ipsec_mp); /* * ASSERT this isn't a b_next linked mblk chain where a * chained dropper should be used instead @@ -335,3 +302,50 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, ASSERT(mp->b_prev == NULL && mp->b_next == NULL); freemsg(mp); } + +/* + * This is just a convinient place for dtrace to see dropped packets + */ +/*ARGSUSED*/ +void +ip_drop_input(char *str, mblk_t *mp, ill_t *ill) +{ + if (mp == NULL) + return; + + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, + ip6_t *, ip6h, int, 0); + } +} + +/*ARGSUSED*/ +void +ip_drop_output(char *str, mblk_t *mp, ill_t *ill) +{ + if (mp == NULL) + return; + + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, + ip6_t *, ip6h, int, 0); + } +} diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c index ea8b4a73bb..b89171ed2b 100644 --- a/usr/src/uts/common/inet/ip/ipmp.c +++ 
b/usr/src/uts/common/inet/ip/ipmp.c @@ -22,12 +22,12 @@ * Use is subject to license terms. */ -#include <inet/arp.h> #include <inet/ip.h> #include <inet/ip6.h> #include <inet/ip_if.h> #include <inet/ip_ire.h> #include <inet/ip_multi.h> +#include <inet/ip_ndp.h> #include <inet/ip_rts.h> #include <inet/mi.h> #include <net/if_types.h> @@ -52,20 +52,6 @@ #define IPMP_GRP_HASH_SIZE 64 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ -/* - * Templates for IPMP ARP messages. - */ -static const arie_t ipmp_aract_template = { - AR_IPMP_ACTIVATE, - sizeof (arie_t), /* Name offset */ - sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ -}; - -static const arie_t ipmp_ardeact_template = { - AR_IPMP_DEACTIVATE, - sizeof (arie_t), /* Name offset */ - sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ -}; /* * IPMP meta-interface kstats (based on those in PSARC/1997/198). @@ -497,7 +483,7 @@ ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) * An ill must strictly be using ARP and/or ND for address * resolution for it to be allowed into a group. */ - if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP)) return (ENOTSUP); /* @@ -752,7 +738,7 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) if (illg->ig_next_ill == NULL) illg->ig_next_ill = list_head(&illg->ig_actif); - if (ill_check_and_refhold(ill) == 0) { + if (ill_check_and_refhold(ill)) { rw_exit(&ipst->ips_ipmp_lock); return (ill); } @@ -763,17 +749,6 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) } /* - * Return a pointer to the nominated multicast ill in `illg', or NULL if one - * doesn't exist. Caller must be inside the IPSQ. - */ -ill_t * -ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) -{ - ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); - return (illg->ig_cast_ill); -} - -/* * Return a held pointer to the nominated multicast ill in `illg', or NULL if * one doesn't exist. Caller need not be inside the IPSQ. 
*/ @@ -785,7 +760,7 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) rw_enter(&ipst->ips_ipmp_lock, RW_READER); castill = illg->ig_cast_ill; - if (castill != NULL && ill_check_and_refhold(castill) == 0) { + if (castill != NULL && ill_check_and_refhold(castill)) { rw_exit(&ipst->ips_ipmp_lock); return (castill); } @@ -794,6 +769,20 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) } /* + * Callback routine for ncec_walk() that deletes `nce' if it is associated with + * the `(ill_t *)arg' and it is not one of the local addresses. Caller must be + * inside the IPSQ. + */ +static void +ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg) +{ + if ((ncec != NULL) && !NCE_MYADDR(ncec) && + ncec->ncec_ill == (ill_t *)arg) { + ncec_delete(ncec); + } +} + +/* * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, * any existing nomination is removed. Caller must be inside the IPSQ. */ @@ -820,6 +809,14 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) */ if (ipmp_ill->ill_dl_up) ill_leave_multicast(ipmp_ill); + + /* + * Delete any NCEs tied to the old nomination. We must do this + * last since ill_leave_multicast() may trigger IREs to be + * built using ig_cast_ill. + */ + ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill, + ocastill->ill_ipst); } /* @@ -829,16 +826,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) illg->ig_cast_ill = castill; rw_exit(&ipst->ips_ipmp_lock); - if (ocastill != NULL) { - /* - * Delete any IREs tied to the old nomination. We must do - * this after the new castill is set and has reached global - * visibility since the datapath has not been quiesced. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - ill_stq_cache_delete, ocastill, ocastill); - } - /* * Enable new nominated ill (if any). */ @@ -855,15 +842,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) if (ipmp_ill->ill_dl_up) ill_recover_multicast(ipmp_ill); } - - /* - * For IPv4, refresh our broadcast IREs. 
This needs to be done even - * if there's no new nomination since ill_refresh_bcast() still must - * update the IPMP meta-interface's broadcast IREs to point back at - * the IPMP meta-interface itself. - */ - if (!ipmp_ill->ill_isv6) - ill_refresh_bcast(ipmp_ill); } /* @@ -872,33 +850,33 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) * created IPMP ARP entry, or NULL on failure. */ ipmp_arpent_t * -ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) +ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp, + ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags) { - uchar_t *addrp; - area_t *area = (area_t *)mp->b_rptr; ipmp_arpent_t *entp, *oentp; ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); - ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); - if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) + if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len, + KM_NOSLEEP)) == NULL) return (NULL); - if ((mp = copyb(mp)) == NULL) { - kmem_free(entp, sizeof (ipmp_arpent_t)); - return (NULL); - } - - DB_TYPE(mp) = M_PROTO; - entp->ia_area_mp = mp; - entp->ia_proxyarp = proxyarp; - addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, - sizeof (ipaddr_t)); - bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); - + /* + * Delete any existing ARP entry for this address. + */ if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) ipmp_illgrp_destroy_arpent(illg, oentp); + /* + * Prepend the new entry. 
+ */ + entp->ia_ipaddr = ipaddr; + entp->ia_flags = flags; + entp->ia_lladdr_len = lladdr_len; + entp->ia_lladdr = (uchar_t *)&entp[1]; + bcopy(lladdr, entp->ia_lladdr, lladdr_len); + entp->ia_proxyarp = proxyarp; + entp->ia_notified = B_TRUE; list_insert_head(&illg->ig_arpent, entp); return (entp); } @@ -912,8 +890,7 @@ ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); list_remove(&illg->ig_arpent, entp); - freeb(entp->ia_area_mp); - kmem_free(entp, sizeof (ipmp_arpent_t)); + kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len); } /* @@ -957,10 +934,9 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) { ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; uint_t paddrlen = ipmp_ill->ill_phys_addr_length; - area_t *area; - mblk_t *area_mp; - uchar_t *physaddr; ipmp_arpent_t *entp; + ncec_t *ncec; + nce_t *nce; ASSERT(IAM_WRITER_ILL(ipmp_ill)); ASSERT(!ipmp_ill->ill_isv6); @@ -973,11 +949,7 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) continue; } - area = (area_t *)entp->ia_area_mp->b_rptr; ASSERT(paddrlen == ill->ill_phys_addr_length); - ASSERT(paddrlen == area->area_hw_addr_length); - physaddr = mi_offset_paramc(entp->ia_area_mp, - area->area_hw_addr_offset, paddrlen); /* * If this is a proxy ARP entry, we can skip notifying ARP if @@ -985,18 +957,25 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) * update the entry's hardware address before notifying ARP. 
*/ if (entp->ia_proxyarp) { - if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && - entp->ia_notified) + if (bcmp(ill->ill_phys_addr, entp->ia_lladdr, + paddrlen) == 0 && entp->ia_notified) continue; - bcopy(ill->ill_phys_addr, physaddr, paddrlen); + bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen); } - if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { - entp->ia_notified = B_FALSE; + (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr, + paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED, + &nce); + if (nce == NULL || !entp->ia_proxyarp) { + if (nce != NULL) + nce_refrele(nce); continue; } - - putnext(ipmp_ill->ill_rq, area_mp); + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr); + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); ipmp_illgrp_mark_arpent(illg, entp); if ((ill = list_next(&illg->ig_actif, ill)) == NULL) @@ -1061,16 +1040,16 @@ ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) ASSERT(IAM_WRITER_ILL(ipmp_ill)); /* - * Since ill_max_mtu can only change under ill_lock, we hold ill_lock + * Since ill_mtu can only change under ill_lock, we hold ill_lock * for each ill as we iterate through the list. Any changes to the - * ill_max_mtu will also trigger an update, so even if we missed it + * ill_mtu will also trigger an update, so even if we missed it * this time around, the update will catch it. */ ill = list_head(&illg->ig_if); for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { mutex_enter(&ill->ill_lock); - if (mtu == 0 || ill->ill_max_mtu < mtu) - mtu = ill->ill_max_mtu; + if (mtu == 0 || ill->ill_mtu < mtu) + mtu = ill->ill_mtu; mutex_exit(&ill->ill_lock); } @@ -1171,13 +1150,12 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) * This may seem odd, but it's consistent with the application view * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). 
*/ + update_conn_ill(ill, ill->ill_ipst); if (ill->ill_isv6) { - reset_conn_ill(ill); reset_mrt_ill(ill); } else { ipif = ill->ill_ipif; for (; ipif != NULL; ipif = ipif->ipif_next) { - reset_conn_ipif(ipif); reset_mrt_vif_ipif(ipif); } } @@ -1206,7 +1184,7 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) ipmp_ill->ill_flags |= ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } - ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + ipmp_illgrp_set_mtu(illg, ill->ill_mtu); } else { ASSERT(ipmp_ill->ill_phys_addr_length == ill->ill_phys_addr_length); @@ -1217,8 +1195,8 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } - if (illg->ig_mtu > ill->ill_max_mtu) - ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + if (illg->ig_mtu > ill->ill_mtu) + ipmp_illgrp_set_mtu(illg, ill->ill_mtu); } rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); @@ -1232,12 +1210,6 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) */ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); - /* - * Merge any broadcast IREs, if need be. - */ - if (!ill->ill_isv6) - ill_refresh_bcast(ill); - ipmp_ill_refresh_active(ill); } @@ -1301,12 +1273,6 @@ ipmp_ill_leave_illgrp(ill_t *ill) rw_exit(&ipst->ips_ill_g_lock); /* - * Recreate any broadcast IREs that had been shared, if need be. - */ - if (!ill->ill_isv6) - ill_refresh_bcast(ill); - - /* * Re-establish multicast memberships that were previously being * handled by the IPMP meta-interface. 
*/ @@ -1456,10 +1422,8 @@ static boolean_t ipmp_ill_activate(ill_t *ill) { ipif_t *ipif; - mblk_t *actmp = NULL, *deactmp = NULL; mblk_t *linkupmp = NULL, *linkdownmp = NULL; ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; - const char *grifname = grp->gr_ifname; ipmp_illgrp_t *illg = ill->ill_grp; ill_t *maxill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); @@ -1478,20 +1442,6 @@ ipmp_ill_activate(ill_t *ill) goto fail; } - /* - * For IPv4, allocate the activate/deactivate messages, and tell ARP. - */ - if (!ill->ill_isv6) { - actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); - deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); - if (actmp == NULL || deactmp == NULL) - goto fail; - - ASSERT(ill->ill_ardeact_mp == NULL); - ill->ill_ardeact_mp = deactmp; - putnext(illg->ig_ipmp_ill->ill_rq, actmp); - } - if (list_is_empty(&illg->ig_actif)) { /* * Now that we have an active ill, nominate it for multicast @@ -1524,12 +1474,6 @@ ipmp_ill_activate(ill_t *ill) ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); } - - /* - * TODO: explore whether it's advantageous to flush IRE_CACHE - * bindings to force existing connections to be redistributed - * to the new ill. - */ } /* @@ -1542,7 +1486,7 @@ ipmp_ill_activate(ill_t *ill) rw_exit(&ipst->ips_ipmp_lock); /* - * Refresh ARP entries to use `ill', if need be. + * Refresh static/proxy ARP entries to use `ill', if need be. */ if (!ill->ill_isv6) ipmp_illgrp_refresh_arpent(illg); @@ -1557,8 +1501,6 @@ ipmp_ill_activate(ill_t *ill) } return (B_TRUE); fail: - freemsg(actmp); - freemsg(deactmp); freemsg(linkupmp); freemsg(linkdownmp); return (B_FALSE); @@ -1581,18 +1523,6 @@ ipmp_ill_deactivate(ill_t *ill) ASSERT(IS_UNDER_IPMP(ill)); /* - * Delete all IRE_CACHE entries for the group. 
(We cannot restrict - * ourselves to entries with ire_stq == ill since there may be other - * IREs that are backed by ACEs that are tied to this ill -- and thus - * when those ACEs are deleted, the IREs will be adrift without any - * AR_CN_ANNOUNCE notification from ARP.) - */ - if (ill->ill_isv6) - ire_walk_v6(ill_grp_cache_delete, ill, ALL_ZONES, ipst); - else - ire_walk_v4(ill_grp_cache_delete, ill, ALL_ZONES, ipst); - - /* * Pull the interface out of the active list. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); @@ -1609,6 +1539,12 @@ ipmp_ill_deactivate(ill_t *ill) ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); /* + * Delete all nce_t entries using this ill, so that the next attempt + * to send data traffic will revalidate cached nce's. + */ + nce_flush(ill, B_TRUE); + + /* * Unbind all of the ipifs bound to this ill, and save 'em in a list; * we'll rebind them after we tell the resolver the ill is no longer * active. We must do things in this order or the resolver could @@ -1620,18 +1556,10 @@ ipmp_ill_deactivate(ill_t *ill) ipif->ipif_bound_next = ubheadipif; ubheadipif = ipif; } - if (!ill->ill_isv6) { - /* - * Tell ARP `ill' is no longer active in the group. - */ - mp = ill->ill_ardeact_mp; - ill->ill_ardeact_mp = NULL; - ASSERT(mp != NULL); - putnext(illg->ig_ipmp_ill->ill_rq, mp); /* - * Refresh any ARP entries that had been using `ill'. + * Refresh static/proxy ARP entries that had been using `ill'. */ ipmp_illgrp_refresh_arpent(illg); } @@ -1649,6 +1577,20 @@ ipmp_ill_deactivate(ill_t *ill) ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); } + if (list_is_empty(&illg->ig_actif)) { + ill_t *ipmp_ill = illg->ig_ipmp_ill; + + ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill, + (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst); + } + + /* + * Remove any IRE_IF_CLONE for this ill since they might have + * an ire_nce_cache/nce_common which refers to another ill in the group. 
+ */ + ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, + ill, ill); + /* * Finally, mark the group link down, if necessary. */ @@ -1725,7 +1667,7 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) /* * If necessary, tell ARP/NDP about the new mapping. Note that - * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. + * ipif_resolver_up() cannot fail for IPv6 ills. */ if (act != Res_act_none) { if (ill->ill_isv6) { @@ -1756,15 +1698,12 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) static ipif_t * ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) { - ill_t *ipmp_ill; ipif_t *previpif; ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); - ipmp_ill = ill->ill_grp->ig_ipmp_ill; - /* * If necessary, find an ipif to unbind. */ @@ -1803,13 +1742,10 @@ ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) * If requested, notify the resolvers (provided we're bound). */ if (notifyres && ipif->ipif_bound) { - if (ill->ill_isv6) { + if (ill->ill_isv6) ipif_ndp_down(ipif); - } else { - ASSERT(ipif->ipif_arp_del_mp != NULL); - putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - } + else + (void) ipif_arp_down(ipif); } ipif->ipif_bound = B_FALSE; @@ -1845,8 +1781,8 @@ ipmp_ill_is_active(ill_t *ill) } /* - * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet - * IREs with a source address on `ill_arg'. + * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated + * with `ill_arg'. 
*/ static void ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) @@ -1856,27 +1792,18 @@ ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); - if (ire->ire_ipif->ipif_ill != ill) + if (ire->ire_ill != ill) return; - switch (ire->ire_type) { - case IRE_HOST: - case IRE_PREFIX: - case IRE_DEFAULT: - case IRE_CACHE: - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: + if (IRE_HIDDEN_TYPE(ire->ire_type)) { DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); - ire->ire_marks |= IRE_MARK_TESTHIDDEN; - break; - default: - break; + ire->ire_testhidden = B_TRUE; } } /* - * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source - * address on `ill_arg'. + * IRE walker callback: clear ire_testhidden if the IRE has a source address + * on `ill_arg'. */ static void ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) @@ -1886,9 +1813,9 @@ ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); - if (ire->ire_ipif->ipif_ill == ill) { + if (ire->ire_ill == ill) { DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); - ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; + ire->ire_testhidden = B_FALSE; } } @@ -1909,7 +1836,7 @@ ipmp_ill_hold_ipmp_ill(ill_t *ill) rw_enter(&ipst->ips_ipmp_lock, RW_READER); illg = ill->ill_grp; - if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill) == 0) { + if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) { rw_exit(&ipst->ips_ipmp_lock); return (illg->ig_ipmp_ill); } @@ -2135,7 +2062,7 @@ ipmp_ipif_hold_bound_ill(const ipif_t *ipif) rw_enter(&ipst->ips_ipmp_lock, RW_READER); boundill = ipif->ipif_bound_ill; - if (boundill != NULL && ill_check_and_refhold(boundill) == 0) { + if (boundill != NULL && ill_check_and_refhold(boundill)) { rw_exit(&ipst->ips_ipmp_lock); return (boundill); } @@ -2192,3 +2119,182 @@ ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) { return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags 
& IPIF_UP)); } + +/* + * Check if `mp' contains a probe packet by verifying if the IP source address + * is a test address on an underlying interface `ill'. Caller need not be inside + * the IPSQ. + */ +boolean_t +ipmp_packet_is_probe(mblk_t *mp, ill_t *ill) +{ + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(DB_TYPE(mp) != M_CTL); + + if (!IS_UNDER_IPMP(ill)) + return (B_FALSE); + + if (ill->ill_isv6) { + if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && + ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) + return (B_TRUE); + } else { + if ((ipha->ipha_src != INADDR_ANY) && + ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL)) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Pick out an appropriate underlying interface for packet transmit. This + * function may be called from the data path, so we need to verify that the + * IPMP group associated with `ill' is non-null after holding the ill_g_lock. + * Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast) +{ + ill_t *xmit_ill; + ip_stack_t *ipst = ill->ill_ipst; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (ill->ill_grp == NULL) { + /* + * The interface was taken out of the group. 
Return ill itself, + * but take a ref so that callers will always be able to do + * ill_refrele(ill); + */ + rw_exit(&ipst->ips_ill_g_lock); + ill_refhold(ill); + return (ill); + } + if (!is_unicast) + xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + else + xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp); + rw_exit(&ipst->ips_ill_g_lock); + return (xmit_ill); +} + +/* + * Flush out any nce that points at `ncec' from an underlying interface + */ +void +ipmp_ncec_flush_nce(ncec_t *ncec) +{ + ill_t *ncec_ill = ncec->ncec_ill; + ill_t *ill; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ncec_ill->ill_ipst; + list_t dead; + nce_t *nce; + + if (!IS_IPMP(ncec_ill)) + return; + + illg = ncec_ill->ill_grp; + list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + nce_fastpath_list_delete(ill, ncec, &dead); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * we may now nce_refrele() all dead entries since all locks have been + * dropped. + */ + while ((nce = list_head(&dead)) != NULL) { + list_remove(&dead, nce); + nce_refrele(nce); + } + ASSERT(list_is_empty(&dead)); + list_destroy(&dead); +} + +/* + * For each interface in the IPMP group, if there are nce_t entries for the IP + * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath + * information must be updated to match the link-layer address information in + * `ncec'. + */ +void +ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill) +{ + ill_t *ill; + ipmp_illgrp_t *illg = ipmp_ill->ill_grp; + ip_stack_t *ipst = ipmp_ill->ill_ipst; + nce_t *nce, *nce_next; + list_t replace; + + ASSERT(IS_IPMP(ipmp_ill)); + + /* + * if ncec itself is not reachable, there is no use in creating nce_t + * entries on the underlying interfaces in the group. 
+ */ + if (!NCE_ISREACHABLE(ncec)) + return; + + list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node)); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + /* + * For each underlying interface, we first check if there is an + * nce_t for the address in ncec->ncec_addr. If one exists, + * we should trigger nce_fastpath for that nce_t. However, the + * catch is that we are holding the ips_ipmp_lock to prevent + * changes to the IPMP group membership, so that we cannot + * putnext() to the driver. So we nce_delete the + * list nce_t entries that need to be updated into the + * `replace' list, and then process the `replace' list + * after dropping the ips_ipmp_lock. + */ + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr, + &ncec->ncec_addr)) { + nce = nce_next; + continue; + } + nce_refhold(nce); + nce_delete(nce); + list_insert_tail(&replace, nce); + nce = nce_next; + } + mutex_exit(&ill->ill_lock); + } + rw_exit(&ipst->ips_ipmp_lock); + /* + * `replace' now has the list of nce's on which we should be triggering + * nce_fastpath(). We now retrigger fastpath by setting up the nce + * again. 
The code in nce_lookup_then_add_v* ensures that nce->nce_ill + * is still in the group for ncec->ncec_ill + */ + while ((nce = list_head(&replace)) != NULL) { + list_remove(&replace, nce); + if (ncec->ncec_ill->ill_isv6) { + (void) nce_lookup_then_add_v6(nce->nce_ill, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED, + NULL); + } else { + ipaddr_t ipaddr; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr); + (void) nce_lookup_then_add_v4(nce->nce_ill, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL); + } + nce_refrele(nce); + } + ASSERT(list_is_empty(&replace)); + list_destroy(&replace); +} diff --git a/usr/src/uts/common/inet/ip/ipsec_loader.c b/usr/src/uts/common/inet/ip/ipsec_loader.c index 6609146fd1..7f5c434359 100644 --- a/usr/src/uts/common/inet/ip/ipsec_loader.c +++ b/usr/src/uts/common/inet/ip/ipsec_loader.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -121,8 +121,6 @@ ipsec_loader(void *arg) } mutex_exit(&ipss->ipsec_loader_lock); - ip_ipsec_load_complete(ipss); - mutex_enter(&ipss->ipsec_loader_lock); if (!ipsec_failure) { CALLB_CPR_EXIT(&cprinfo); diff --git a/usr/src/uts/common/inet/ip/ipsecah.c b/usr/src/uts/common/inet/ip/ipsecah.c index c130dac490..a511b85ff4 100644 --- a/usr/src/uts/common/inet/ip/ipsecah.c +++ b/usr/src/uts/common/inet/ip/ipsecah.c @@ -54,6 +54,8 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/nd.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/sadb.h> @@ -62,7 +64,6 @@ #include <inet/ipdrop.h> #include <sys/taskq.h> #include <sys/policy.h> -#include <sys/iphada.h> #include <sys/strsun.h> #include <sys/crypto/common.h> @@ -132,32 +133,27 @@ static ipsecahparam_t lcl_param_arr[] = { #define AH_MSGSIZE(mp) ((mp)->b_cont != NULL ? msgdsize(mp) : MBLKL(mp)) -static ipsec_status_t ah_auth_out_done(mblk_t *); -static ipsec_status_t ah_auth_in_done(mblk_t *); +static mblk_t *ah_auth_out_done(mblk_t *, ip_xmit_attr_t *, ipsec_crypto_t *); +static mblk_t *ah_auth_in_done(mblk_t *, ip_recv_attr_t *, ipsec_crypto_t *); static mblk_t *ah_process_ip_options_v4(mblk_t *, ipsa_t *, int *, uint_t, boolean_t, ipsecah_stack_t *); static mblk_t *ah_process_ip_options_v6(mblk_t *, ipsa_t *, int *, uint_t, boolean_t, ipsecah_stack_t *); static void ah_getspi(mblk_t *, keysock_in_t *, ipsecah_stack_t *); -static ipsec_status_t ah_inbound_accelerated(mblk_t *, boolean_t, ipsa_t *, - uint32_t); -static ipsec_status_t ah_outbound_accelerated_v4(mblk_t *, ipsa_t *); -static ipsec_status_t ah_outbound_accelerated_v6(mblk_t *, ipsa_t *); -static ipsec_status_t ah_outbound(mblk_t *); +static void ah_inbound_restart(mblk_t *, ip_recv_attr_t *); + +static mblk_t *ah_outbound(mblk_t *, ip_xmit_attr_t *); +static void ah_outbound_finish(mblk_t *, ip_xmit_attr_t *); static int ipsecah_open(queue_t *, dev_t *, int, int, cred_t 
*); static int ipsecah_close(queue_t *); -static void ipsecah_rput(queue_t *, mblk_t *); static void ipsecah_wput(queue_t *, mblk_t *); static void ah_send_acquire(ipsacq_t *, mblk_t *, netstack_t *); static boolean_t ah_register_out(uint32_t, uint32_t, uint_t, ipsecah_stack_t *, - mblk_t *); + cred_t *); static void *ipsecah_stack_init(netstackid_t stackid, netstack_t *ns); static void ipsecah_stack_fini(netstackid_t stackid, void *arg); -extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, - void *); - /* Setable in /etc/system */ uint32_t ah_hash_size = IPSEC_DEFAULT_HASH_SIZE; @@ -168,7 +164,7 @@ static struct module_info info = { }; static struct qinit rinit = { - (pfi_t)ipsecah_rput, NULL, ipsecah_open, ipsecah_close, NULL, &info, + (pfi_t)putnext, NULL, ipsecah_open, ipsecah_close, NULL, &info, NULL }; @@ -215,9 +211,6 @@ ah_kstat_init(ipsecah_stack_t *ahstack, netstackid_t stackid) KI(acquire_requests); KI(bytes_expired); KI(out_discards); - KI(in_accelerated); - KI(out_accelerated); - KI(noaccel); KI(crypto_sync); KI(crypto_async); KI(crypto_failures); @@ -275,9 +268,9 @@ ah_ager(void *arg) hrtime_t begin = gethrtime(); sadb_ager(&ahstack->ah_sadb.s_v4, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns); + ahstack->ipsecah_reap_delay, ns); sadb_ager(&ahstack->ah_sadb.s_v6, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns); + ahstack->ipsecah_reap_delay, ns); ahstack->ah_event = sadb_retimeout(begin, ahstack->ah_pfkey_q, ah_ager, ahstack, @@ -474,7 +467,13 @@ ipsecah_stack_fini(netstackid_t stackid, void *arg) } /* - * AH module open routine. The module should be opened by keysock. + * AH module open routine, which is here for keysock plumbing. + * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old + * Days of export control, and fears that ESP would not be allowed + * to be shipped at all by default. 
Eventually, keysock should + * either access AH and ESP via modstubs or krtld dependencies, or + * perhaps be folded in with AH and ESP into a single IPsec/netsec + * module ("netsec" if PF_KEY provides more than AH/ESP keying tables). */ /* ARGSUSED */ static int @@ -497,57 +496,10 @@ ipsecah_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ahstack = ns->netstack_ipsecah; ASSERT(ahstack != NULL); - /* - * ASSUMPTIONS (because I'm MT_OCEXCL): - * - * * I'm being pushed on top of IP for all my opens (incl. #1). - * * Only ipsecah_open() can write into ah_sadb.s_ip_q. - * * Because of this, I can check lazily for ah_sadb.s_ip_q. - * - * If these assumptions are wrong, I'm in BIG trouble... - */ - q->q_ptr = ahstack; WR(q)->q_ptr = q->q_ptr; - if (ahstack->ah_sadb.s_ip_q == NULL) { - struct T_unbind_req *tur; - - ahstack->ah_sadb.s_ip_q = WR(q); - /* Allocate an unbind... */ - ahstack->ah_ip_unbind = allocb(sizeof (struct T_unbind_req), - BPRI_HI); - - /* - * Send down T_BIND_REQ to bind IPPROTO_AH. - * Handle the ACK here in AH. - */ - qprocson(q); - if (ahstack->ah_ip_unbind == NULL || - !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) { - if (ahstack->ah_ip_unbind != NULL) { - freeb(ahstack->ah_ip_unbind); - ahstack->ah_ip_unbind = NULL; - } - q->q_ptr = NULL; - qprocsoff(q); - netstack_rele(ahstack->ipsecah_netstack); - return (ENOMEM); - } - - ahstack->ah_ip_unbind->b_datap->db_type = M_PROTO; - tur = (struct T_unbind_req *)ahstack->ah_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } else { - qprocson(q); - } - - /* - * For now, there's not much I can do. I'll be getting a message - * passed down to me from keysock (in my wput), and a T_BIND_ACK - * up from IP (in my rput). 
- */ - + qprocson(q); return (0); } @@ -560,17 +512,6 @@ ipsecah_close(queue_t *q) ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr; /* - * If ah_sadb.s_ip_q is attached to this instance, send a - * T_UNBIND_REQ to IP for the instance before doing - * a qprocsoff(). - */ - if (WR(q) == ahstack->ah_sadb.s_ip_q && - ahstack->ah_ip_unbind != NULL) { - putnext(WR(q), ahstack->ah_ip_unbind); - ahstack->ah_ip_unbind = NULL; - } - - /* * Clean up q_ptr, if needed. */ qprocsoff(q); @@ -585,98 +526,16 @@ ipsecah_close(queue_t *q) (void) quntimeout(q, ahstack->ah_event); } - if (WR(q) == ahstack->ah_sadb.s_ip_q) { - /* - * If the ah_sadb.s_ip_q is attached to this instance, find - * another. The OCEXCL outer perimeter helps us here. - */ - - ahstack->ah_sadb.s_ip_q = NULL; - - /* - * Find a replacement queue for ah_sadb.s_ip_q. - */ - if (ahstack->ah_pfkey_q != NULL && - ahstack->ah_pfkey_q != RD(q)) { - /* - * See if we can use the pfkey_q. - */ - ahstack->ah_sadb.s_ip_q = WR(ahstack->ah_pfkey_q); - } - - if (ahstack->ah_sadb.s_ip_q == NULL || - !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) { - ah1dbg(ahstack, - ("ipsecah: Can't reassign ah_sadb.s_ip_q.\n")); - ahstack->ah_sadb.s_ip_q = NULL; - } else { - ahstack->ah_ip_unbind = - allocb(sizeof (struct T_unbind_req), BPRI_HI); - - if (ahstack->ah_ip_unbind != NULL) { - struct T_unbind_req *tur; - - ahstack->ah_ip_unbind->b_datap->db_type = - M_PROTO; - tur = (struct T_unbind_req *) - ahstack->ah_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } - /* If it's NULL, I can't do much here. */ - } - } - netstack_rele(ahstack->ipsecah_netstack); return (0); } /* - * AH module read put routine. - */ -/* ARGSUSED */ -static void -ipsecah_rput(queue_t *q, mblk_t *mp) -{ - ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr; - - ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */ - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* TPI message of some sort. 
*/ - switch (*((t_scalar_t *)mp->b_rptr)) { - case T_BIND_ACK: - /* We expect this. */ - ah3dbg(ahstack, - ("Thank you IP from AH for T_BIND_ACK\n")); - break; - case T_ERROR_ACK: - cmn_err(CE_WARN, - "ipsecah: AH received T_ERROR_ACK from IP."); - break; - case T_OK_ACK: - /* Probably from a (rarely sent) T_UNBIND_REQ. */ - break; - default: - ah1dbg(ahstack, ("Unknown M_{,PC}PROTO message.\n")); - } - freemsg(mp); - break; - default: - /* For now, passthru message. */ - ah2dbg(ahstack, ("AH got unknown mblk type %d.\n", - mp->b_datap->db_type)); - putnext(q, mp); - } -} - -/* * Construct an SADB_REGISTER message with the current algorithms. */ static boolean_t ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, - ipsecah_stack_t *ahstack, mblk_t *in_mp) + ipsecah_stack_t *ahstack, cred_t *cr) { mblk_t *mp; boolean_t rc = B_TRUE; @@ -691,7 +550,7 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, sadb_sens_t *sens; size_t sens_len = 0; sadb_ext_t *nextext; - cred_t *sens_cr = NULL; + ts_label_t *sens_tsl = NULL; /* Allocate the KEYSOCK_OUT. 
*/ mp = sadb_keysock_out(serial); @@ -700,11 +559,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, return (B_FALSE); } - if (is_system_labeled() && (in_mp != NULL)) { - sens_cr = msg_getcred(in_mp, NULL); - - if (sens_cr != NULL) { - sens_len = sadb_sens_len_from_cred(sens_cr); + if (is_system_labeled() && (cr != NULL)) { + sens_tsl = crgetlabel(cr); + if (sens_tsl != NULL) { + sens_len = sadb_sens_len_from_label(sens_tsl); allocsize += sens_len; } } @@ -786,10 +644,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, mutex_exit(&ipss->ipsec_alg_lock); - if (sens_cr != NULL) { + if (sens_tsl != NULL) { sens = (sadb_sens_t *)nextext; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - sens_cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + sens_tsl, sens_len); nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len); } @@ -847,40 +705,61 @@ ipsecah_algs_changed(netstack_t *ns) /* * Stub function that taskq_dispatch() invokes to take the mblk (in arg) - * and put() it into AH and STREAMS again. + * and send it into AH and IP again. */ static void inbound_task(void *arg) { - ah_t *ah; - mblk_t *mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - int ipsec_rc; - netstack_t *ns; - ipsecah_stack_t *ahstack; - - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns == NULL || ns != ii->ipsec_in_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + ip_recv_attr_t iras; + + async_mp = mp; + mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); freemsg(mp); - return; + goto done; } - ahstack = ns->netstack_ipsecah; + ah_inbound_restart(mp, &iras); +done: + ira_cleanup(&iras, B_TRUE); +} - ah2dbg(ahstack, ("in AH inbound_task")); +/* + * Restart ESP after the SA has been added. 
+ */ +static void +ah_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira) +{ + ah_t *ah; + netstack_t *ns; + ipsecah_stack_t *ahstack; + + ns = ira->ira_ill->ill_ipst->ips_netstack; + ahstack = ns->netstack_ipsecah; ASSERT(ahstack != NULL); - ah = ipsec_inbound_ah_sa(mp, ns); - if (ah != NULL) { - ASSERT(ii->ipsec_in_ah_sa != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(mp, ah); - if (ipsec_rc == IPSEC_STATUS_SUCCESS) - ip_fanout_proto_again(mp, NULL, NULL, NULL); + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; } - netstack_rele(ns); + ip_input_post_ipsec(mp, ira); } /* @@ -1051,60 +930,96 @@ ah_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, if (larval != NULL) lpkt = sadb_clear_lpkt(larval); - rc = sadb_common_add(ahstack->ah_sadb.s_ip_q, ahstack->ah_pfkey_q, mp, + rc = sadb_common_add(ahstack->ah_pfkey_q, mp, samsg, ksi, primary, secondary, larval, clone, is_inbound, diagnostic, ns, &ahstack->ah_sadb); + if (lpkt != NULL) { + if (rc == 0) { + rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt, + TQ_NOSLEEP); + } + if (rc != 0) { + lpkt = ip_recv_attr_free_mblk(lpkt); + ip_drop_packet(lpkt, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &ahstack->ah_dropper); + } + } + /* * How much more stack will I create with all of these - * ah_inbound_* and ah_outbound_*() calls? + * ah_outbound_*() calls? 
*/ - if (rc == 0 && lpkt != NULL) - rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt, TQ_NOSLEEP); - - if (rc != 0) { - ip_drop_packet(lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &ahstack->ah_dropper); - } - + /* Handle the packets queued waiting for the SA */ while (acq_msgs != NULL) { - mblk_t *mp = acq_msgs; + mblk_t *asyncmp; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ill_t *ill; + asyncmp = acq_msgs; acq_msgs = acq_msgs->b_next; - mp->b_next = NULL; - if (rc == 0) { - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - - ASSERT(ahstack->ah_sadb.s_ip_q != NULL); - if (ipsec_outbound_sa(mp, IPPROTO_AH)) { - io->ipsec_out_ah_done = B_TRUE; - if (ah_outbound(mp) == IPSEC_STATUS_SUCCESS) { - ipha_t *ipha = (ipha_t *) - mp->b_cont->b_rptr; - if (sq.af == AF_INET) { - ip_wput_ipsec_out(NULL, mp, - ipha, NULL, NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - - ASSERT(sq.af == AF_INET6); - - ip_wput_ipsec_out_v6(NULL, - mp, ip6h, NULL, NULL); - } - } - continue; - } + asyncmp->b_next = NULL; + + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while iked was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. 
+ */ + data_mp = asyncmp->b_cont; + asyncmp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) { + AH_BUMP_STAT(ahstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, NULL, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &ahstack->ah_dropper); + } else if (rc != 0) { + ill = ixas.ixa_nce->nce_ill; + AH_BUMP_STAT(ahstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + } else { + ah_outbound_finish(data_mp, &ixas); } + ixa_cleanup(&ixas); + } + + return (rc); +} + + +/* + * Process one of the queued messages (from ipsacq_mp) once the SA + * has been added. + */ +static void +ah_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa) +{ + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) { AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_sadb_acquire_timeout), &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return; } - return (rc); + data_mp = ah_outbound(data_mp, ixa); + if (data_mp == NULL) + return; + + (void) ip_output_post_ipsec(data_mp, ixa); } /* @@ -1300,8 +1215,7 @@ ah_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, } return (sadb_purge_sa(mp, ksi, (sin->sin_family == AF_INET6) ? &ahstack->ah_sadb.s_v6 : - &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q)); + &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q)); } return (sadb_delget_sa(mp, ksi, &ahstack->ah_sadb, diagnostic, @@ -1449,7 +1363,7 @@ ah_parse_pfkey(mblk_t *mp, ipsecah_stack_t *ahstack) * Keysock takes care of the PF_KEY bookkeeping for this. 
*/ if (ah_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid, - ksi->ks_in_serial, ahstack, mp)) { + ksi->ks_in_serial, ahstack, msg_getcred(mp, NULL))) { freemsg(mp); } else { /* @@ -1534,8 +1448,7 @@ ah_keysock_no_socket(mblk_t *mp, ipsecah_stack_t *ahstack) samsg->sadb_msg_errno = kse->ks_err_errno; samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg)); /* - * Use the write-side of the ah_pfkey_q, in case there is - * no ahstack->ah_sadb.s_ip_q. + * Use the write-side of the ah_pfkey_q */ sadb_in_acquire(samsg, &ahstack->ah_sadb, WR(ahstack->ah_pfkey_q), ahstack->ipsecah_netstack); @@ -1825,22 +1738,15 @@ ah_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound) * Called while holding the algorithm lock. */ static void -ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) +ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs, + netstack_t *ns) { sadb_comb_t *comb = (sadb_comb_t *)(prop + 1); - ipsec_out_t *io; ipsec_action_t *ap; ipsec_prot_t *prot; - ipsecah_stack_t *ahstack; - netstack_t *ns; - ipsec_stack_t *ipss; - - io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; - ns = io->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock)); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; @@ -1851,9 +1757,9 @@ ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) /* * Based upon algorithm properties, and what-not, prioritize a - * proposal, based on the ordering of the ah algorithms in the - * alternatives presented in the policy rule passed down - * through the ipsec_out_t and attached to the acquire record. + * proposal, based on the ordering of the AH algorithms in the + * alternatives in the policy rule or socket that was placed + * in the acquire record. 
*/ for (ap = acqrec->ipsacq_act; ap != NULL; @@ -1961,7 +1867,7 @@ ah_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns) /* Insert proposal here. */ prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len); - ah_insert_prop(prop, acqrec, combs); + ah_insert_prop(prop, acqrec, combs, ns); samsg->sadb_msg_len += prop->sadb_prop_len; msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len); @@ -2117,11 +2023,12 @@ ah_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecah_stack_t *ahstack) /* * IPv6 sends up the ICMP errors for validation and the removal of the AH * header. + * If succesful, the mp has been modified to not include the AH header so + * that the caller can fanout to the ULP's icmp error handler. */ -static ipsec_status_t -ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) +static mblk_t * +ah_icmp_error_v6(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack) { - mblk_t *mp; ip6_t *ip6h, *oip6h; uint16_t hdr_length, ah_length; uint8_t *nexthdrp; @@ -2132,14 +2039,6 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) uint8_t *post_ah_ptr; ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; - mp = ipsec_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_CTL); - - /* - * Change the type to M_DATA till we finish pullups. - */ - mp->b_datap->db_type = M_DATA; - /* * Eat the cost of a pullupmsg() for now. It makes the rest of this * code far less convoluted. 
@@ -2150,10 +2049,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) mp->b_rptr + hdr_length + sizeof (icmp6_t) + sizeof (ip6_t) + sizeof (ah_t) > mp->b_wptr) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } oip6h = (ip6_t *)mp->b_rptr; @@ -2161,10 +2060,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ip6h = (ip6_t *)(icmp6 + 1); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_v6_hdrs), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } ah = (ah_t *)((uint8_t *)ip6h + hdr_length); @@ -2186,10 +2085,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ah->ah_spi, &oip6h->ip6_src, AF_INET6, ahstack->ipsecah_netstack); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } IPSA_REFRELE(assoc); @@ -2208,10 +2107,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) if (post_ah_ptr > mp->b_wptr) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - ah_length); @@ -2219,20 +2118,19 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ovbcopy(post_ah_ptr, ah, (size_t)((uintptr_t)mp->b_wptr - (uintptr_t)post_ah_ptr)); mp->b_wptr -= ah_length; - /* Rewhack to be an ICMP error. 
*/ - mp->b_datap->db_type = M_CTL; - return (IPSEC_STATUS_SUCCESS); + return (mp); } /* * IP sends up the ICMP errors for validation and the removal of * the AH header. + * If succesful, the mp has been modified to not include the AH header so + * that the caller can fanout to the ULP's icmp error handler. */ -static ipsec_status_t -ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) +static mblk_t * +ah_icmp_error_v4(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack) { - mblk_t *mp; mblk_t *mp1; icmph_t *icmph; int iph_hdr_length; @@ -2248,14 +2146,6 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) uint8_t nexthdr; ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; - mp = ipsec_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_CTL); - - /* - * Change the type to M_DATA till we finish pullups. - */ - mp->b_datap->db_type = M_DATA; - oipha = ipha = (ipha_t *)mp->b_rptr; iph_hdr_length = IPH_HDR_LENGTH(ipha); icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; @@ -2274,10 +2164,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) SL_WARN | SL_ERROR, "ICMP error: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; @@ -2304,10 +2194,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ah->ah_spi, &oipha->ipha_src, AF_INET, ahstack->ipsecah_netstack); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } IPSA_REFRELE(assoc); @@ -2343,10 +2233,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) * We tried hard, give up now. 
*/ IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; @@ -2354,8 +2244,8 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) done: /* * Remove the AH header and change the protocol. - * Don't update the spi fields in the ipsec_in - * message as we are called just to validate the + * Don't update the spi fields in the ip_recv_attr_t + * as we are called just to validate the * message attached to the ICMP message. * * If we never pulled up since all of the message @@ -2368,14 +2258,11 @@ done: if ((mp1 = allocb(alloc_size, BPRI_LO)) == NULL) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - /* ICMP errors are M_CTL messages */ - mp1->b_datap->db_type = M_CTL; - ipsec_mp->b_cont = mp1; bcopy(mp->b_rptr, mp1->b_rptr, alloc_size); mp1->b_wptr += alloc_size; @@ -2402,24 +2289,23 @@ done: ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); - return (IPSEC_STATUS_SUCCESS); + return (mp1); } /* * IP calls this to validate the ICMP errors that * we got from the network. 
*/ -ipsec_status_t -ipsecah_icmp_error(mblk_t *mp) +mblk_t * +ipsecah_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - netstack_t *ns = ii->ipsec_in_ns; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - if (ii->ipsec_in_v4) - return (ah_icmp_error_v4(mp, ahstack)); + if (ira->ira_flags & IRAF_IS_IPV4) + return (ah_icmp_error_v4(data_mp, ira, ahstack)); else - return (ah_icmp_error_v6(mp, ahstack)); + return (ah_icmp_error_v6(data_mp, ira, ahstack)); } static int @@ -2546,7 +2432,7 @@ ah_fix_phdr_v6(ip6_t *ip6h, ip6_t *oip6h, boolean_t outbound, prev_nexthdr = (uint8_t *)&ip6h->ip6_nxt; nexthdr = oip6h->ip6_nxt; /* Assume IP has already stripped it */ - ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW); + ASSERT(nexthdr != IPPROTO_FRAGMENT); ah = NULL; dsthdr = NULL; for (;;) { @@ -2741,19 +2627,19 @@ ah_finish_up(ah_t *phdr_ah, ah_t *inbound_ah, ipsa_t *assoc, * argument is freed. 
*/ static void -ah_log_bad_auth(mblk_t *ipsec_in) +ah_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - mblk_t *mp = ipsec_in->b_cont->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - boolean_t isv4 = ii->ipsec_in_v4; - ipsa_t *assoc = ii->ipsec_in_ah_sa; - int af; - void *addr; - netstack_t *ns = ii->ipsec_in_ns; + boolean_t isv4 = (ira->ira_flags & IRAF_IS_IPV4); + ipsa_t *assoc = ira->ira_ipsec_ah_sa; + int af; + void *addr; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsec_stack_t *ipss = ns->netstack_ipsec; - mp->b_rptr -= ii->ipsec_in_skip_len; + ASSERT(mp->b_datap->db_type == M_DATA); + + mp->b_rptr -= ic->ic_skip_len; if (isv4) { ipha_t *ipha = (ipha_t *)mp->b_rptr; @@ -2776,110 +2662,163 @@ ah_log_bad_auth(mblk_t *ipsec_in) assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_auth), &ahstack->ah_dropper); } /* * Kernel crypto framework callback invoked after completion of async - * crypto requests. + * crypto requests for outbound packets. */ static void -ah_kcf_callback(void *arg, int status) +ah_kcf_callback_outbound(void *arg, int status) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstackid_t stackid; - netstack_t *ns, *ns_arg; + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; ipsec_stack_t *ipss; ipsecah_stack_t *ahstack; - ipsec_out_t *io = (ipsec_out_t *)ii; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ipsec_crypto_t *ic; + ill_t *ill; - ASSERT(ipsec_mp->b_cont != NULL); + /* + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. 
+ */ + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); - if (is_inbound) { - stackid = ii->ipsec_in_stackid; - ns_arg = ii->ipsec_in_ns; + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) { + /* Disappeared on us - no ill/ipst for MIB */ + if (ixas.ixa_nce != NULL) { + ill = ixas.ixa_nce->nce_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + } + freemsg(data_mp); + goto done; + } + ns = ixas.ixa_ipst->ips_netstack; + ahstack = ns->netstack_ipsecah; + ipss = ns->netstack_ipsec; + ill = ixas.ixa_nce->nce_ill; + + if (status == CRYPTO_SUCCESS) { + data_mp = ah_auth_out_done(data_mp, &ixas, ic); + if (data_mp == NULL) + goto done; + + (void) ip_output_post_ipsec(data_mp, &ixas); } else { - stackid = io->ipsec_out_stackid; - ns_arg = io->ipsec_out_ns; + /* Outbound shouldn't see invalid MAC */ + ASSERT(status != CRYPTO_INVALID_MAC); + + ah1dbg(ahstack, + ("ah_kcf_callback_outbound: crypto failed with 0x%x\n", + status)); + AH_BUMP_STAT(ahstack, crypto_failures); + AH_BUMP_STAT(ahstack, out_discards); + + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_ah_crypto_failed), + &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); } +done: + ixa_cleanup(&ixas); + (void) ipsec_free_crypto_data(mp); +} + +/* + * Kernel crypto framework callback invoked after completion of async + * crypto requests for inbound packets. 
+ */ +static void +ah_kcf_callback_inbound(void *arg, int status) +{ + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; + ipsec_stack_t *ipss; + ipsecah_stack_t *ahstack; + mblk_t *data_mp; + ip_recv_attr_t iras; + ipsec_crypto_t *ic; + /* - * Verify that the netstack is still around; could have vanished - * while kEf was doing its work. + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. */ - ns = netstack_find_by_stackid(stackid); - if (ns == NULL || ns != ns_arg) { - /* Disappeared on us */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); - return; - } + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + goto done; + } + ns = iras.ira_ill->ill_ipst->ips_netstack; ahstack = ns->netstack_ipsecah; ipss = ns->netstack_ipsec; if (status == CRYPTO_SUCCESS) { - if (is_inbound) { - if (ah_auth_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL); - } else { - ipha_t *ipha; + data_mp = ah_auth_in_done(data_mp, &iras, ic); + if (data_mp == NULL) + goto done; - if (ah_auth_out_done(ipsec_mp) != - IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - - /* finish IPsec processing */ - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, - NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h, - NULL, NULL); - } - } + /* finish 
IPsec processing */ + ip_input_post_ipsec(data_mp, &iras); } else if (status == CRYPTO_INVALID_MAC) { - ah_log_bad_auth(ipsec_mp); + ah_log_bad_auth(data_mp, &iras, ic); } else { - ah1dbg(ahstack, ("ah_kcf_callback: crypto failed with 0x%x\n", + ah1dbg(ahstack, + ("ah_kcf_callback_inbound: crypto failed with 0x%x\n", status)); AH_BUMP_STAT(ahstack, crypto_failures); - if (is_inbound) - IP_AH_BUMP_STAT(ipss, in_discards); - else - AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL, + IP_AH_BUMP_STAT(ipss, in_discards); + ip_drop_packet(data_mp, B_TRUE, iras.ira_ill, DROPPER(ipss, ipds_ah_crypto_failed), &ahstack->ah_dropper); + BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards); } - netstack_rele(ns); +done: + ira_cleanup(&iras, B_TRUE); + (void) ipsec_free_crypto_data(mp); } /* * Invoked on kernel crypto failure during inbound and outbound processing. */ static void -ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, - ipsecah_stack_t *ahstack) +ah_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc, + ill_t *ill, ipsecah_stack_t *ahstack) { ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; ah1dbg(ahstack, ("crypto failed for %s AH with 0x%x\n", is_inbound ? "inbound" : "outbound", kef_rc)); - ip_drop_packet(mp, is_inbound, NULL, NULL, + ip_drop_packet(data_mp, is_inbound, ill, DROPPER(ipss, ipds_ah_crypto_failed), &ahstack->ah_dropper); AH_BUMP_STAT(ahstack, crypto_failures); @@ -2893,14 +2832,14 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, * Helper macros for the ah_submit_req_{inbound,outbound}() functions. 
*/ -#define AH_INIT_CALLREQ(_cr, _ipss) { \ - (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \ - if ((_ipss)->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == \ - IPSEC_ALGS_EXEC_ASYNC) \ - (_cr)->cr_flag |= CRYPTO_ALWAYS_QUEUE; \ - (_cr)->cr_callback_arg = ipsec_mp; \ - (_cr)->cr_callback_func = ah_kcf_callback; \ -} +/* + * A statement-equivalent macro, _cr MUST point to a modifiable + * crypto_call_req_t. + */ +#define AH_INIT_CALLREQ(_cr, _mp, _callback) \ + (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \ + (_cr)->cr_callback_arg = (_mp); \ + (_cr)->cr_callback_func = (_callback) #define AH_INIT_CRYPTO_DATA(data, msglen, mblk) { \ (data)->cd_format = CRYPTO_DATA_MBLK; \ @@ -2920,124 +2859,185 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, /* * Submit an inbound packet for processing by the crypto framework. */ -static ipsec_status_t -ah_submit_req_inbound(mblk_t *ipsec_mp, size_t skip_len, uint32_t ah_offset, - ipsa_t *assoc) +static mblk_t * +ah_submit_req_inbound(mblk_t *phdr_mp, ip_recv_attr_t *ira, + size_t skip_len, uint32_t ah_offset, ipsa_t *assoc) { int kef_rc; - mblk_t *phdr_mp; - crypto_call_req_t call_req; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; + mblk_t *mp; + crypto_call_req_t call_req, *callrp; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t ctx_tmpl; - netstack_t *ns = ii->ipsec_in_ns; - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecah_stack_t *ahstack; + ipsec_crypto_t *ic, icstack; + boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC); + + ahstack = ira->ira_ill->ill_ipst->ips_netstack->netstack_ipsecah; - phdr_mp = ipsec_mp->b_cont; ASSERT(phdr_mp != NULL); - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ASSERT(phdr_mp->b_datap->db_type == M_DATA); + + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_recv_attr_to_mblk(ira)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + 
BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", phdr_mp, + ira->ira_ill); + freemsg(phdr_mp); + return (NULL); + } - /* - * In case kEF queues and calls back, make sure we have the - * netstackid_t for verification that the IP instance is still around - * in esp_kcf_callback(). - */ - ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid); + linkb(mp, phdr_mp); + callrp = &call_req; + AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_inbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. + */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } /* init arguments for the crypto framework */ - AH_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, AH_MSGSIZE(phdr_mp), + AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp), phdr_mp); - AH_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac, icv_len, + AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, (char *)phdr_mp->b_cont->b_rptr - skip_len + ah_offset + sizeof (ah_t)); - AH_INIT_CALLREQ(&call_req, ipss); - - ii->ipsec_in_skip_len = skip_len; + ic->ic_skip_len = skip_len; IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, ctx_tmpl); /* call KEF to do the MAC operation */ kef_rc = crypto_mac_verify(&assoc->ipsa_amech, - &ii->ipsec_in_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl, - &ii->ipsec_in_crypto_mac, &call_req); + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl, + &ic->ic_crypto_mac, callrp); switch (kef_rc) { case CRYPTO_SUCCESS: AH_BUMP_STAT(ahstack, crypto_sync); - return (ah_auth_in_done(ipsec_mp)); + phdr_mp = ah_auth_in_done(phdr_mp, ira, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (phdr_mp); case CRYPTO_QUEUED: - /* ah_kcf_callback() will be invoked on completion */ + /* ah_kcf_callback_inbound() will be invoked on completion */ AH_BUMP_STAT(ahstack, crypto_async); - return (IPSEC_STATUS_PENDING); 
+ return (NULL); case CRYPTO_INVALID_MAC: + /* Free mp after we are done with ic */ AH_BUMP_STAT(ahstack, crypto_sync); - ah_log_bad_auth(ipsec_mp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ah_log_bad_auth(phdr_mp, ira, ic); + /* phdr_mp was passed to ip_drop_packet */ + if (force) { + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (NULL); } - ah_crypto_failed(ipsec_mp, B_TRUE, kef_rc, ahstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + phdr_mp = ip_recv_attr_free_mblk(mp); + } + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ah_crypto_failed(phdr_mp, B_TRUE, kef_rc, ira->ira_ill, ahstack); + /* phdr_mp was passed to ip_drop_packet */ + return (NULL); } /* * Submit an outbound packet for processing by the crypto framework. */ -static ipsec_status_t -ah_submit_req_outbound(mblk_t *ipsec_mp, size_t skip_len, ipsa_t *assoc) +static mblk_t * +ah_submit_req_outbound(mblk_t *phdr_mp, ip_xmit_attr_t *ixa, + size_t skip_len, ipsa_t *assoc) { int kef_rc; - mblk_t *phdr_mp; - crypto_call_req_t call_req; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; + mblk_t *mp; + crypto_call_req_t call_req, *callrp; uint_t icv_len = assoc->ipsa_mac_len; - netstack_t *ns = io->ipsec_out_ns; - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecah_stack_t *ahstack; + ipsec_crypto_t *ic, icstack; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC); - phdr_mp = ipsec_mp->b_cont; - ASSERT(phdr_mp != NULL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); + ahstack = ill->ill_ipst->ips_netstack->netstack_ipsecah; - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * ah_kcf_callback(). 
- */ - io->ipsec_out_stackid = ns->netstack_stackid; + ASSERT(phdr_mp != NULL); + ASSERT(phdr_mp->b_datap->db_type == M_DATA); + + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", phdr_mp, ill); + freemsg(phdr_mp); + return (NULL); + } + linkb(mp, phdr_mp); + callrp = &call_req; + AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_outbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. + */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } /* init arguments for the crypto framework */ - AH_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, AH_MSGSIZE(phdr_mp), + AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp), phdr_mp); - AH_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac, icv_len, + AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, (char *)phdr_mp->b_wptr); - AH_INIT_CALLREQ(&call_req, ipss); + ic->ic_skip_len = skip_len; - io->ipsec_out_skip_len = skip_len; - - ASSERT(io->ipsec_out_ah_sa != NULL); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); /* call KEF to do the MAC operation */ - kef_rc = crypto_mac(&assoc->ipsa_amech, &io->ipsec_out_crypto_data, + kef_rc = crypto_mac(&assoc->ipsa_amech, &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, assoc->ipsa_authtmpl, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); switch (kef_rc) { case CRYPTO_SUCCESS: AH_BUMP_STAT(ahstack, crypto_sync); - return (ah_auth_out_done(ipsec_mp)); + phdr_mp = ah_auth_out_done(phdr_mp, ixa, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_xmit_attr_free_mblk(mp); + } + return (phdr_mp); case CRYPTO_QUEUED: - /* ah_kcf_callback() will be invoked on completion */ + /* ah_kcf_callback_outbound() will be invoked on completion */ 
AH_BUMP_STAT(ahstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); } - ah_crypto_failed(ipsec_mp, B_FALSE, kef_rc, ahstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + phdr_mp = ip_xmit_attr_free_mblk(mp); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ah_crypto_failed(phdr_mp, B_FALSE, kef_rc, NULL, ahstack); + /* phdr_mp was passed to ip_drop_packet */ + return (NULL); } /* @@ -3056,7 +3056,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip, uint_t ah_align_sz; uint_t ah_offset; int hdr_size; - ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; /* * Allocate space for the authentication data also. It is @@ -3135,9 +3134,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip, ah_offset = ah_fix_phdr_v6(ip6h, oip6h, outbound, B_FALSE); if (ah_offset == 0) { - ip_drop_packet(phdr_mp, !outbound, NULL, NULL, - DROPPER(ipss, ipds_ah_bad_v6_hdrs), - &ahstack->ah_dropper); return (NULL); } } @@ -3375,65 +3371,67 @@ ah_hdr: /* * Authenticate an outbound datagram. This function is called * whenever IP sends an outbound datagram that needs authentication. + * Returns a modified packet if done. Returns NULL if error or queued. + * If error return then ipIfStatsOutDiscards has been increased. */ -static ipsec_status_t -ah_outbound(mblk_t *ipsec_out) +static mblk_t * +ah_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - mblk_t *mp; mblk_t *phdr_mp; - ipsec_out_t *oi; ipsa_t *assoc; int length_to_skip; uint_t ah_align_sz; uint_t age_bytes; - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t need_refrele = B_FALSE; /* * Construct the chain of mblks * - * IPSEC_OUT->PSEUDO_HDR->DATA + * PSEUDO_HDR->DATA * * one by one. 
*/ - ASSERT(ipsec_out->b_datap->db_type == M_CTL); - - ASSERT(MBLKL(ipsec_out) >= sizeof (ipsec_info_t)); - - mp = ipsec_out->b_cont; - oi = (ipsec_out_t *)ipsec_out->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - AH_BUMP_STAT(ahstack, out_requests); - ASSERT(mp->b_datap->db_type == M_DATA); + ASSERT(data_mp->b_datap->db_type == M_DATA); - assoc = oi->ipsec_out_ah_sa; + assoc = ixa->ixa_ipsec_ah_sa; ASSERT(assoc != NULL); /* * Get the outer IP header in shape to escape this system.. */ - if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) { - int whack; - - mblk_setcred(mp, assoc->ipsa_ocred, NOPID); - if (oi->ipsec_out_v4) - whack = sadb_whack_label(&mp, assoc); - else - whack = sadb_whack_label_v6(&mp, assoc); - if (whack != 0) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, - NULL, DROPPER(ipss, ipds_ah_nomem), + if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) { + /* + * Need to update packet with any CIPSO option and update + * ixa_tsl to capture the new label. + * We allocate a separate ixa for that purpose. + */ + ixa = ip_xmit_attr_duplicate(ixa); + if (ixa == NULL) { + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); + } + need_refrele = B_TRUE; + + label_hold(assoc->ipsa_otsl); + ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl); + + data_mp = sadb_whack_label(data_mp, assoc, ixa, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); + if (data_mp == NULL) { + /* Packet dropped by sadb_whack_label */ + ixa_refrele(ixa); + return (NULL); } - ipsec_out->b_cont = mp; } /* @@ -3441,14 +3439,14 @@ ah_outbound(mblk_t *ipsec_out) * adding the AH header, ICV, and padding to the packet. 
*/ - if (oi->ipsec_out_v4) { - ipha_t *ipha = (ipha_t *)mp->b_rptr; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)data_mp->b_rptr; ah_align_sz = P2ALIGN(assoc->ipsa_mac_len + IPV4_PADDING_ALIGN - 1, IPV4_PADDING_ALIGN); age_bytes = ntohs(ipha->ipha_length) + sizeof (ah_t) + ah_align_sz; } else { - ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; ah_align_sz = P2ALIGN(assoc->ipsa_mac_len + IPV6_PADDING_ALIGN - 1, IPV6_PADDING_ALIGN); age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) + @@ -3461,8 +3459,12 @@ ah_outbound(mblk_t *ipsec_out) "AH association 0x%x, dst %s had bytes expire.\n", ntohl(assoc->ipsa_spi), assoc->ipsa_dstaddr, AF_INET, ahstack->ipsecah_netstack); - freemsg(ipsec_out); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + freemsg(data_mp); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* @@ -3470,64 +3472,59 @@ ah_outbound(mblk_t *ipsec_out) * (AH is computing the checksum over the outer label). 
*/ - if (oi->ipsec_out_is_capab_ill) { - ah3dbg(ahstack, ("ah_outbound: pkt can be accelerated\n")); - if (oi->ipsec_out_v4) - return (ah_outbound_accelerated_v4(ipsec_out, assoc)); - else - return (ah_outbound_accelerated_v6(ipsec_out, assoc)); - } - AH_BUMP_STAT(ahstack, noaccel); - /* * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP + * [IP, ULP] => [IP, AH, ICV] -> ULP */ - if (oi->ipsec_out_v4) { - phdr_mp = ah_process_ip_options_v4(mp, assoc, &length_to_skip, - assoc->ipsa_mac_len, B_TRUE, ahstack); + if (ixa->ixa_flags & IXAF_IS_IPV4) { + phdr_mp = ah_process_ip_options_v4(data_mp, assoc, + &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack); } else { - phdr_mp = ah_process_ip_options_v6(mp, assoc, &length_to_skip, - assoc->ipsa_mac_len, B_TRUE, ahstack); + phdr_mp = ah_process_ip_options_v6(data_mp, assoc, + &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack); } if (phdr_mp == NULL) { AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ixa->ixa_nce->nce_ill, DROPPER(ipss, ipds_ah_bad_v4_opts), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } - ipsec_out->b_cont = phdr_mp; - phdr_mp->b_cont = mp; - mp->b_rptr += length_to_skip; + phdr_mp->b_cont = data_mp; + data_mp->b_rptr += length_to_skip; + data_mp = phdr_mp; /* - * At this point ipsec_out points to the IPSEC_OUT, new_mp - * points to an mblk containing the pseudo header (IP header, + * At this point data_mp points to + * an mblk containing the pseudo header (IP header, * AH header, and ICV with mutable fields zero'ed out). * mp points to the mblk containing the ULP data. The original - * IP header is kept before the ULP data in mp. + * IP header is kept before the ULP data in data_mp. 
*/ /* submit MAC request to KCF */ - return (ah_submit_req_outbound(ipsec_out, length_to_skip, assoc)); + data_mp = ah_submit_req_outbound(data_mp, ixa, length_to_skip, assoc); + if (need_refrele) + ixa_refrele(ixa); + return (data_mp); } -static ipsec_status_t -ah_inbound(mblk_t *ipsec_in_mp, void *arg) +static mblk_t * +ah_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira) { - mblk_t *data_mp = ipsec_in_mp->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - ah_t *ah = (ah_t *)arg; - ipsa_t *assoc = ii->ipsec_in_ah_sa; - int length_to_skip; - int ah_length; - mblk_t *phdr_mp; - uint32_t ah_offset; - netstack_t *ns = ii->ipsec_in_ns; + ah_t *ah = (ah_t *)arg; + ipsa_t *assoc = ira->ira_ipsec_ah_sa; + int length_to_skip; + int ah_length; + mblk_t *phdr_mp; + uint32_t ah_offset; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -3547,10 +3544,11 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) if (!sadb_replay_peek(assoc, ah->ah_replay)) { AH_BUMP_STAT(ahstack, replay_early_failures); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_early_replay), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -3561,19 +3559,6 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) ah_offset = (uchar_t *)ah - data_mp->b_rptr; /* - * Has this packet already been processed by a hardware - * IPsec accelerator? - */ - if (ii->ipsec_in_accelerated) { - ah3dbg(ahstack, - ("ah_inbound_v6: pkt processed by ill=%d isv6=%d\n", - ii->ipsec_in_ill_index, !ii->ipsec_in_v4)); - return (ah_inbound_accelerated(ipsec_in_mp, ii->ipsec_in_v4, - assoc, ah_offset)); - } - AH_BUMP_STAT(ahstack, noaccel); - - /* * We need to pullup until the ICV before we call * ah_process_ip_options_v6. 
*/ @@ -3590,18 +3575,19 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) SL_WARN | SL_ERROR, "ah_inbound: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } } /* * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP + * [IP, ULP] => [IP, AH, ICV] -> ULP */ - if (ii->ipsec_in_v4) { + if (ira->ira_flags & IRAF_IS_IPV4) { phdr_mp = ah_process_ip_options_v4(data_mp, assoc, &length_to_skip, assoc->ipsa_mac_len, B_FALSE, ahstack); } else { @@ -3611,483 +3597,33 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) if (phdr_mp == NULL) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, - (ii->ipsec_in_v4 ? + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, + ((ira->ira_flags & IRAF_IS_IPV4) ? DROPPER(ipss, ipds_ah_bad_v4_opts) : DROPPER(ipss, ipds_ah_bad_v6_hdrs)), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } - ipsec_in_mp->b_cont = phdr_mp; phdr_mp->b_cont = data_mp; data_mp->b_rptr += length_to_skip; + data_mp = phdr_mp; /* submit request to KCF */ - return (ah_submit_req_inbound(ipsec_in_mp, length_to_skip, ah_offset, + return (ah_submit_req_inbound(data_mp, ira, length_to_skip, ah_offset, assoc)); } /* - * ah_inbound_accelerated: - * Called from ah_inbound() to process IPsec packets that have been - * accelerated by hardware. - * - * Basically does what ah_auth_in_done() with some changes since - * no pseudo-headers are involved, i.e. the passed message is a - * IPSEC_INFO->DATA. - * - * It is assumed that only packets that have been successfully - * processed by the adapter come here. - * - * 1. get algorithm structure corresponding to association - * 2. 
calculate pointers to authentication header and ICV - * 3. compare ICV in AH header with ICV in data attributes - * 3.1 if different: - * 3.1.1 generate error - * 3.1.2 discard message - * 3.2 if ICV matches: - * 3.2.1 check replay - * 3.2.2 remove AH header - * 3.2.3 age SA byte - * 3.2.4 send to IP - */ -ipsec_status_t -ah_inbound_accelerated(mblk_t *ipsec_in, boolean_t isv4, ipsa_t *assoc, - uint32_t ah_offset) -{ - mblk_t *mp; - ipha_t *ipha; - ah_t *ah; - ipsec_in_t *ii; - uint32_t icv_len; - uint32_t align_len; - uint32_t age_bytes; - ip6_t *ip6h; - uint8_t *in_icv; - mblk_t *hada_mp; - uint32_t next_hdr; - da_ipsec_t *hada; - kstat_named_t *counter; - ipsecah_stack_t *ahstack; - netstack_t *ns; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_in->b_rptr; - ns = ii->ipsec_in_ns; - ahstack = ns->netstack_ipsecah; - ipss = ns->netstack_ipsec; - - mp = ipsec_in->b_cont; - hada_mp = ii->ipsec_in_da; - ASSERT(hada_mp != NULL); - hada = (da_ipsec_t *)hada_mp->b_rptr; - - AH_BUMP_STAT(ahstack, in_accelerated); - - /* - * We only support one level of decapsulation in hardware, so - * nuke the pointer. - */ - ii->ipsec_in_da = NULL; - ii->ipsec_in_accelerated = B_FALSE; - - /* - * Extract ICV length from attributes M_CTL and sanity check - * its value. We allow the mblk to be smaller than da_ipsec_t - * for a small ICV, as long as the entire ICV fits within the mblk. - * Also ensures that the ICV length computed by Provider - * corresponds to the ICV length of the algorithm specified by the SA. 
- */ - icv_len = hada->da_icv_len; - if ((icv_len != assoc->ipsa_mac_len) || - (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) < - (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) { - ah0dbg(("ah_inbound_accelerated: " - "ICV len (%u) incorrect or mblk too small (%u)\n", - icv_len, (uint32_t)(MBLKL(hada_mp)))); - counter = DROPPER(ipss, ipds_ah_bad_length); - goto ah_in_discard; - } - ASSERT(icv_len != 0); - - /* compute the padded AH ICV len */ - if (isv4) { - ipha = (ipha_t *)mp->b_rptr; - align_len = (icv_len + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - } else { - ip6h = (ip6_t *)mp->b_rptr; - align_len = (icv_len + IPV6_PADDING_ALIGN - 1) & - -IPV6_PADDING_ALIGN; - } - - ah = (ah_t *)(mp->b_rptr + ah_offset); - in_icv = (uint8_t *)ah + sizeof (ah_t); - - /* compare ICV in AH header vs ICV computed by adapter */ - if (bcmp(hada->da_icv, in_icv, icv_len)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. Don't print to the console, block - * potential denial-of-service attack. - */ - AH_BUMP_STAT(ahstack, bad_auth); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "AH Authentication failed spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); - counter = DROPPER(ipss, ipds_ah_bad_auth); - goto ah_in_discard; - } - - ah3dbg(ahstack, ("AH succeeded, checking replay\n")); - AH_BUMP_STAT(ahstack, good_auth); - - if (!sadb_replay_check(assoc, ah->ah_replay)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. As of now we print out an event. - * Do not print the replay failure number, or else - * syslog cannot collate the error messages. Printing - * the replay number that failed (or printing to the - * console) opens a denial-of-service attack. 
- */ - AH_BUMP_STAT(ahstack, replay_failures); - ipsec_assocfailure(info.mi_idnum, 0, 0, - SL_ERROR | SL_WARN, - "Replay failed for AH spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); - counter = DROPPER(ipss, ipds_ah_replay); - goto ah_in_discard; - } - - /* - * Remove AH header. We do this by copying everything before - * the AH header onto the AH header+ICV. - */ - /* overwrite AH with what was preceeding it (IP header) */ - next_hdr = ah->ah_nexthdr; - ovbcopy(mp->b_rptr, mp->b_rptr + sizeof (ah_t) + align_len, - ah_offset); - mp->b_rptr += sizeof (ah_t) + align_len; - if (isv4) { - /* adjust IP header next protocol */ - ipha = (ipha_t *)mp->b_rptr; - ipha->ipha_protocol = next_hdr; - - age_bytes = ipha->ipha_length; - - /* adjust length in IP header */ - ipha->ipha_length -= (sizeof (ah_t) + align_len); - - /* recalculate checksum */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); - } else { - /* adjust IP header next protocol */ - ip6h = (ip6_t *)mp->b_rptr; - ip6h->ip6_nxt = next_hdr; - - age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) + - sizeof (ah_t); - - /* adjust length in IP header */ - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - - (sizeof (ah_t) + align_len)); - } - - /* age SA */ - if (!ah_age_bytes(assoc, age_bytes, B_TRUE)) { - /* The ipsa has hit hard expiration, LOG and AUDIT. 
*/ - ipsec_assocfailure(info.mi_idnum, 0, 0, - SL_ERROR | SL_WARN, - "AH Association 0x%x, dst %s had bytes expire.\n", - assoc->ipsa_spi, assoc->ipsa_dstaddr, - AF_INET, ahstack->ipsecah_netstack); - AH_BUMP_STAT(ahstack, bytes_expired); - counter = DROPPER(ipss, ipds_ah_bytes_expire); - goto ah_in_discard; - } - - freeb(hada_mp); - return (IPSEC_STATUS_SUCCESS); - -ah_in_discard: - IP_AH_BUMP_STAT(ipss, in_discards); - freeb(hada_mp); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); -} - -/* - * ah_outbound_accelerated_v4: - * Called from ah_outbound_v4() and once it is determined that the - * packet is elligible for hardware acceleration. - * - * We proceed as follows: - * 1. allocate and initialize attributes mblk - * 2. mark IPSEC_OUT to indicate that pkt is accelerated - * 3. insert AH header - */ -static ipsec_status_t -ah_outbound_accelerated_v4(mblk_t *ipsec_mp, ipsa_t *assoc) -{ - mblk_t *mp, *new_mp; - ipsec_out_t *oi; - uint_t ah_data_sz; /* ICV length, algorithm dependent */ - uint_t ah_align_sz; /* ICV length + padding */ - uint32_t v_hlen_tos_len; /* from original IP header */ - ipha_t *oipha; /* original IP header */ - ipha_t *nipha; /* new IP header */ - uint_t option_length = 0; - uint_t new_hdr_len; /* new header length */ - uint_t iphdr_length; - ah_t *ah_hdr; /* ptr to AH header */ - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - - mp = ipsec_mp->b_cont; - - AH_BUMP_STAT(ahstack, out_accelerated); - - oipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)oipha)[0]; - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(oi->ipsec_out_accelerated == B_FALSE); - oi->ipsec_out_accelerated = B_TRUE; - - /* calculate authentication data length, i.e. 
ICV + padding */ - ah_data_sz = assoc->ipsa_mac_len; - ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - - /* - * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP - */ - - /* IP + AH + authentication + padding data length */ - new_hdr_len = IP_SIMPLE_HDR_LENGTH + sizeof (ah_t) + ah_align_sz; - if (V_HLEN != IP_SIMPLE_HDR_VERSION) { - option_length = oipha->ipha_version_and_hdr_length - - (uint8_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - option_length <<= 2; - new_hdr_len += option_length; - } - - /* allocate pseudo-header mblk */ - if ((new_mp = allocb(new_hdr_len, BPRI_HI)) == NULL) { - /* IPsec kstats: bump bean counter here */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - new_mp->b_cont = mp; - ipsec_mp->b_cont = new_mp; - new_mp->b_wptr += new_hdr_len; - - /* copy original IP header to new header */ - bcopy(mp->b_rptr, new_mp->b_rptr, IP_SIMPLE_HDR_LENGTH + - option_length); - - /* update IP header */ - nipha = (ipha_t *)new_mp->b_rptr; - nipha->ipha_protocol = IPPROTO_AH; - iphdr_length = ntohs(nipha->ipha_length); - iphdr_length += sizeof (ah_t) + ah_align_sz; - nipha->ipha_length = htons(iphdr_length); - nipha->ipha_hdr_checksum = 0; - nipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(nipha); - - /* skip original IP header in mp */ - mp->b_rptr += IP_SIMPLE_HDR_LENGTH + option_length; - - /* initialize AH header */ - ah_hdr = (ah_t *)(new_mp->b_rptr + IP_SIMPLE_HDR_LENGTH + - option_length); - ah_hdr->ah_nexthdr = oipha->ipha_protocol; - if (!ah_finish_up(ah_hdr, NULL, assoc, ah_data_sz, ah_align_sz, - ahstack)) { - /* Only way this fails is if outbound replay counter wraps. 
*/ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_replay), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* - * ah_outbound_accelerated_v6: - * - * Called from ah_outbound_v6() once it is determined that the packet - * is eligible for hardware acceleration. - * - * We proceed as follows: - * 1. allocate and initialize attributes mblk - * 2. mark IPSEC_OUT to indicate that pkt is accelerated - * 3. insert AH header - */ -static ipsec_status_t -ah_outbound_accelerated_v6(mblk_t *ipsec_mp, ipsa_t *assoc) -{ - mblk_t *mp, *phdr_mp; - ipsec_out_t *oi; - uint_t ah_data_sz; /* ICV length, algorithm dependent */ - uint_t ah_align_sz; /* ICV length + padding */ - ip6_t *oip6h; /* original IP header */ - ip6_t *ip6h; /* new IP header */ - uint_t option_length = 0; - uint_t hdr_size; - uint_t ah_offset; - ah_t *ah_hdr; /* ptr to AH header */ - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - - mp = ipsec_mp->b_cont; - - AH_BUMP_STAT(ahstack, out_accelerated); - - oip6h = (ip6_t *)mp->b_rptr; - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(oi->ipsec_out_accelerated == B_FALSE); - oi->ipsec_out_accelerated = B_TRUE; - - /* calculate authentication data length, i.e. 
ICV + padding */ - ah_data_sz = assoc->ipsa_mac_len; - ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - - ASSERT(ah_align_sz >= ah_data_sz); - - hdr_size = ipsec_ah_get_hdr_size_v6(mp, B_FALSE); - option_length = hdr_size - IPV6_HDR_LEN; - - /* This was not included in ipsec_ah_get_hdr_size_v6() */ - hdr_size += (sizeof (ah_t) + ah_align_sz); - - if ((phdr_mp = allocb(hdr_size, BPRI_HI)) == NULL) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - phdr_mp->b_wptr += hdr_size; - - /* - * Form the basic IP header first. We always assign every bit - * of the v6 basic header, so a separate bzero is unneeded. - */ - ip6h = (ip6_t *)phdr_mp->b_rptr; - ip6h->ip6_vcf = oip6h->ip6_vcf; - ip6h->ip6_hlim = oip6h->ip6_hlim; - ip6h->ip6_src = oip6h->ip6_src; - ip6h->ip6_dst = oip6h->ip6_dst; - /* - * Include the size of AH and authentication data. - * This is how our recipient would compute the - * authentication data. Look at what we do in the - * inbound case below. 
- */ - ip6h->ip6_plen = htons(ntohs(oip6h->ip6_plen) + sizeof (ah_t) + - ah_align_sz); - - /* - * Insert pseudo header: - * IPSEC_INFO -> [IP6, LLH, ULP] => - * IPSEC_INFO -> [IP, LLH, AH, ICV] -> ULP - */ - - if (option_length == 0) { - /* Form the AH header */ - ip6h->ip6_nxt = IPPROTO_AH; - ((ah_t *)(ip6h + 1))->ah_nexthdr = oip6h->ip6_nxt; - ah_offset = IPV6_HDR_LEN; - } else { - ip6h->ip6_nxt = oip6h->ip6_nxt; - /* option_length does not include the AH header's size */ - ah_offset = ah_fix_phdr_v6(ip6h, oip6h, B_TRUE, B_FALSE); - if (ah_offset == 0) { - freemsg(phdr_mp); - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_bad_v6_hdrs), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - } - - phdr_mp->b_cont = mp; - ipsec_mp->b_cont = phdr_mp; - - /* skip original IP header in mp */ - mp->b_rptr += IPV6_HDR_LEN + option_length; - - /* initialize AH header */ - ah_hdr = (ah_t *)(phdr_mp->b_rptr + IPV6_HDR_LEN + option_length); - ah_hdr->ah_nexthdr = oip6h->ip6_nxt; - - if (!ah_finish_up(((ah_t *)((uint8_t *)ip6h + ah_offset)), NULL, - assoc, ah_data_sz, ah_align_sz, ahstack)) { - /* Only way this fails is if outbound replay counter wraps. */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_replay), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* * Invoked after processing of an inbound packet by the * kernel crypto framework. Called by ah_submit_req() for a sync request, * or by the kcf callback for an async request. - * Returns IPSEC_STATUS_SUCCESS on success, IPSEC_STATUS_FAILED on failure. - * On failure, the mblk chain ipsec_in is freed by this function. + * Returns NULL if the mblk chain is consumed. 
*/ -static ipsec_status_t -ah_auth_in_done(mblk_t *ipsec_in) +static mblk_t * +ah_auth_in_done(mblk_t *phdr_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - mblk_t *phdr_mp; ipha_t *ipha; uint_t ah_offset = 0; mblk_t *mp; @@ -4096,41 +3632,36 @@ ah_auth_in_done(mblk_t *ipsec_in) uint32_t length; uint32_t *dest32; uint8_t *dest; - ipsec_in_t *ii; boolean_t isv4; ip6_t *ip6h; uint_t icv_len; ipsa_t *assoc; kstat_named_t *counter; - netstack_t *ns; - ipsecah_stack_t *ahstack; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_in->b_rptr; - ns = ii->ipsec_in_ns; - ahstack = ns->netstack_ipsecah; - ipss = ns->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; - isv4 = ii->ipsec_in_v4; - assoc = ii->ipsec_in_ah_sa; - icv_len = (uint_t)ii->ipsec_in_crypto_mac.cd_raw.iov_len; + isv4 = (ira->ira_flags & IRAF_IS_IPV4); + assoc = ira->ira_ipsec_ah_sa; + icv_len = (uint_t)ic->ic_crypto_mac.cd_raw.iov_len; - phdr_mp = ipsec_in->b_cont; if (phdr_mp == NULL) { - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } mp = phdr_mp->b_cont; if (mp == NULL) { - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } - mp->b_rptr -= ii->ipsec_in_skip_len; + mp->b_rptr -= ic->ic_skip_len; ah_set_usetime(assoc, B_TRUE); @@ -4256,8 +3787,7 @@ ah_auth_in_done(mblk_t *ipsec_in) while (*nexthdr != IPPROTO_AH) { whereptr += hdrlen; /* Assume IP has already stripped it */ - ASSERT(*nexthdr != IPPROTO_FRAGMENT && - *nexthdr != IPPROTO_RAW); + ASSERT(*nexthdr != IPPROTO_FRAGMENT); 
switch (*nexthdr) { case IPPROTO_HOPOPTS: hbhhdr = (ip6_hbh_t *)whereptr; @@ -4292,20 +3822,18 @@ ah_auth_in_done(mblk_t *ipsec_in) while (--dest >= mp->b_rptr) *dest = *(dest - newpos); } - ipsec_in->b_cont = mp; - phdr_mp->b_cont = NULL; - /* - * If a db_credp exists in phdr_mp, it must also exist in mp. - */ - ASSERT(DB_CRED(phdr_mp) == NULL || - msg_getcred(mp, NULL) != NULL); freeb(phdr_mp); /* * If SA is labelled, use its label, else inherit the label */ - if (is_system_labeled() && (assoc->ipsa_cred != NULL)) { - mblk_setcred(mp, assoc->ipsa_cred, NOPID); + if (is_system_labeled() && (assoc->ipsa_tsl != NULL)) { + if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) { + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); + } } if (assoc->ipsa_state == IPSA_STATE_IDLE) { @@ -4313,17 +3841,18 @@ ah_auth_in_done(mblk_t *ipsec_in) * Cluster buffering case. Tell caller that we're * handling the packet. */ - sadb_buf_pkt(assoc, ipsec_in, ns); - return (IPSEC_STATUS_PENDING); + sadb_buf_pkt(assoc, mp, ira); + return (NULL); } - return (IPSEC_STATUS_SUCCESS); + return (mp); ah_in_discard: IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, counter, &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -4332,49 +3861,37 @@ ah_in_discard: * executed syncrhonously, or by the KEF callback for a request * executed asynchronously. 
*/ -static ipsec_status_t -ah_auth_out_done(mblk_t *ipsec_out) +static mblk_t * +ah_auth_out_done(mblk_t *phdr_mp, ip_xmit_attr_t *ixa, ipsec_crypto_t *ic) { - mblk_t *phdr_mp; mblk_t *mp; int align_len; uint32_t hdrs_length; uchar_t *ptr; uint32_t length; boolean_t isv4; - ipsec_out_t *io; size_t icv_len; - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - io = (ipsec_out_t *)ipsec_out->b_rptr; - ns = io->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; - isv4 = io->ipsec_out_v4; - icv_len = io->ipsec_out_crypto_mac.cd_raw.iov_len; - - phdr_mp = ipsec_out->b_cont; - if (phdr_mp == NULL) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } + isv4 = (ixa->ixa_flags & IXAF_IS_IPV4); + icv_len = ic->ic_crypto_mac.cd_raw.iov_len; mp = phdr_mp->b_cont; if (mp == NULL) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, + ip_drop_packet(phdr_mp, B_FALSE, ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (NULL); } - mp->b_rptr -= io->ipsec_out_skip_len; + mp->b_rptr -= ic->ic_skip_len; - ASSERT(io->ipsec_out_ah_sa != NULL); - ah_set_usetime(io->ipsec_out_ah_sa, B_FALSE); + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); + ah_set_usetime(ixa->ixa_ipsec_ah_sa, B_FALSE); if (isv4) { ipha_t *ipha; @@ -4454,7 +3971,7 @@ ah_auth_out_done(mblk_t *ipsec_out) freeb(mp); } - return (IPSEC_STATUS_SUCCESS); + return (phdr_mp); } /* Refactor me */ @@ -4464,16 +3981,18 @@ ah_auth_out_done(mblk_t *ipsec_out) */ void ipsecah_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt, - uint32_t spi, void *addr, int af, ipsecah_stack_t 
*ahstack) + uint32_t spi, void *addr, int af, ip_recv_attr_t *ira) { - ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; if (ahstack->ipsecah_log_unknown_spi) { ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi, addr, af, ahstack->ipsecah_netstack); } - ip_drop_packet(mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); } diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index 089e23e937..8af449384f 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -53,6 +53,8 @@ #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/sadb.h> #include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> @@ -67,8 +69,6 @@ #include <sys/taskq.h> #include <sys/note.h> -#include <sys/iphada.h> - #include <sys/tsol/tnet.h> /* @@ -130,26 +130,23 @@ static ipsecespparam_t lcl_param_arr[] = { static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *); static int ipsecesp_close(queue_t *); -static void ipsecesp_rput(queue_t *, mblk_t *); static void ipsecesp_wput(queue_t *, mblk_t *); static void *ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns); static void ipsecesp_stack_fini(netstackid_t stackid, void *arg); static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *); static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *); -static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t); -static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *, - boolean_t, ipsa_t *); +static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *); +static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *); static boolean_t esp_register_out(uint32_t, uint32_t, uint_t, - 
ipsecesp_stack_t *, mblk_t *); + ipsecesp_stack_t *, cred_t *); static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t, kstat_named_t **, ipsecesp_stack_t *); -static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t); -static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *, - uint_t); -extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, - void *); +static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *, + ipsa_t *, uint_t); +static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *, + ipsa_t *, uchar_t *, uint_t); /* Setable in /etc/system */ uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE; @@ -159,7 +156,7 @@ static struct module_info info = { }; static struct qinit rinit = { - (pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info, + (pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info, NULL }; @@ -201,9 +198,6 @@ typedef struct esp_kstats_s { kstat_named_t esp_stat_acquire_requests; kstat_named_t esp_stat_bytes_expired; kstat_named_t esp_stat_out_discards; - kstat_named_t esp_stat_in_accelerated; - kstat_named_t esp_stat_out_accelerated; - kstat_named_t esp_stat_noaccel; kstat_named_t esp_stat_crypto_sync; kstat_named_t esp_stat_crypto_async; kstat_named_t esp_stat_crypto_failures; @@ -266,9 +260,6 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) KI(acquire_requests); KI(bytes_expired); KI(out_discards); - KI(in_accelerated); - KI(out_accelerated); - KI(noaccel); KI(crypto_sync); KI(crypto_async); KI(crypto_failures); @@ -384,9 +375,9 @@ esp_ager(void *arg) hrtime_t begin = gethrtime(); sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q, - espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns); + espstack->ipsecesp_reap_delay, ns); sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q, - espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns); + espstack->ipsecesp_reap_delay, ns); espstack->esp_event = 
sadb_retimeout(begin, espstack->esp_pfkey_q, esp_ager, espstack, @@ -583,7 +574,13 @@ ipsecesp_stack_fini(netstackid_t stackid, void *arg) } /* - * ESP module open routine. + * ESP module open routine, which is here for keysock plumbing. + * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old + * Days of export control, and fears that ESP would not be allowed + * to be shipped at all by default. Eventually, keysock should + * either access AH and ESP via modstubs or krtld dependencies, or + * perhaps be folded in with AH and ESP into a single IPsec/netsec + * module ("netsec" if PF_KEY provides more than AH/ESP keying tables). */ /* ARGSUSED */ static int @@ -606,56 +603,10 @@ ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) espstack = ns->netstack_ipsecesp; ASSERT(espstack != NULL); - /* - * ASSUMPTIONS (because I'm MT_OCEXCL): - * - * * I'm being pushed on top of IP for all my opens (incl. #1). - * * Only ipsecesp_open() can write into esp_sadb.s_ip_q. - * * Because of this, I can check lazily for esp_sadb.s_ip_q. - * - * If these assumptions are wrong, I'm in BIG trouble... - */ - q->q_ptr = espstack; WR(q)->q_ptr = q->q_ptr; - if (espstack->esp_sadb.s_ip_q == NULL) { - struct T_unbind_req *tur; - - espstack->esp_sadb.s_ip_q = WR(q); - /* Allocate an unbind... */ - espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req), - BPRI_HI); - - /* - * Send down T_BIND_REQ to bind IPPROTO_ESP. - * Handle the ACK here in ESP. 
- */ - qprocson(q); - if (espstack->esp_ip_unbind == NULL || - !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) { - if (espstack->esp_ip_unbind != NULL) { - freeb(espstack->esp_ip_unbind); - espstack->esp_ip_unbind = NULL; - } - q->q_ptr = NULL; - netstack_rele(espstack->ipsecesp_netstack); - return (ENOMEM); - } - - espstack->esp_ip_unbind->b_datap->db_type = M_PROTO; - tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } else { - qprocson(q); - } - - /* - * For now, there's not much I can do. I'll be getting a message - * passed down to me from keysock (in my wput), and a T_BIND_ACK - * up from IP (in my rput). - */ - + qprocson(q); return (0); } @@ -668,17 +619,6 @@ ipsecesp_close(queue_t *q) ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr; /* - * If esp_sadb.s_ip_q is attached to this instance, send a - * T_UNBIND_REQ to IP for the instance before doing - * a qprocsoff(). - */ - if (WR(q) == espstack->esp_sadb.s_ip_q && - espstack->esp_ip_unbind != NULL) { - putnext(WR(q), espstack->esp_ip_unbind); - espstack->esp_ip_unbind = NULL; - } - - /* * Clean up q_ptr, if needed. */ qprocsoff(q); @@ -693,45 +633,6 @@ ipsecesp_close(queue_t *q) (void) quntimeout(q, espstack->esp_event); } - if (WR(q) == espstack->esp_sadb.s_ip_q) { - /* - * If the esp_sadb.s_ip_q is attached to this instance, find - * another. The OCEXCL outer perimeter helps us here. - */ - espstack->esp_sadb.s_ip_q = NULL; - - /* - * Find a replacement queue for esp_sadb.s_ip_q. - */ - if (espstack->esp_pfkey_q != NULL && - espstack->esp_pfkey_q != RD(q)) { - /* - * See if we can use the pfkey_q. 
- */ - espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q); - } - - if (espstack->esp_sadb.s_ip_q == NULL || - !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) { - esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n")); - espstack->esp_sadb.s_ip_q = NULL; - } else { - espstack->esp_ip_unbind = - allocb(sizeof (struct T_unbind_req), BPRI_HI); - - if (espstack->esp_ip_unbind != NULL) { - struct T_unbind_req *tur; - - espstack->esp_ip_unbind->b_datap->db_type = - M_PROTO; - tur = (struct T_unbind_req *) - espstack->esp_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } - /* If it's NULL, I can't do much here. */ - } - } - netstack_rele(espstack->ipsecesp_netstack); return (0); } @@ -834,26 +735,27 @@ esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound) /* * Do incoming NAT-T manipulations for packet. + * Returns NULL if the mblk chain is consumed. */ -static ipsec_status_t +static mblk_t * esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc) { ipha_t *ipha = (ipha_t *)data_mp->b_rptr; - tcpha_t *tcph; + tcpha_t *tcpha; udpha_t *udpha; /* Initialize to our inbound cksum adjustment... 
*/ uint32_t sum = assoc->ipsa_inbound_cksum; switch (ipha->ipha_protocol) { case IPPROTO_TCP: - tcph = (tcpha_t *)(data_mp->b_rptr + + tcpha = (tcpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha)); #define DOWN_SUM(x) (x) = ((x) & 0xFFFF) + ((x) >> 16) - sum += ~ntohs(tcph->tha_sum) & 0xFFFF; + sum += ~ntohs(tcpha->tha_sum) & 0xFFFF; DOWN_SUM(sum); DOWN_SUM(sum); - tcph->tha_sum = ~htons(sum); + tcpha->tha_sum = ~htons(sum); break; case IPPROTO_UDP: udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha)); @@ -876,7 +778,7 @@ esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc) */ break; } - return (IPSEC_STATUS_SUCCESS); + return (data_mp); } @@ -968,10 +870,11 @@ esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen, if (ip6h->ip6_nxt == IPPROTO_ESP) { ip6h->ip6_nxt = nexthdr; } else { - ip6_pkt_t ipp; + ip_pkt_t ipp; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, + NULL); if (ipp.ipp_dstopts != NULL) { ipp.ipp_dstopts->ip6d_nxt = nexthdr; } else if (ipp.ipp_rthdr != NULL) { @@ -1227,16 +1130,14 @@ esp_set_usetime(ipsa_t *assoc, boolean_t inbound) /* * Handle ESP inbound data for IPv4 and IPv6. * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * mblk chain data_mp. 
*/ -ipsec_status_t -esp_inbound(mblk_t *ipsec_in_mp, void *arg) +mblk_t * +esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira) { - mblk_t *data_mp = ipsec_in_mp->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; esph_t *esph = (esph_t *)arg; - ipsa_t *ipsa = ii->ipsec_in_esp_sa; - netstack_t *ns = ii->ipsec_in_ns; + ipsa_t *ipsa = ira->ira_ipsec_esp_sa; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1254,36 +1155,18 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) if (!sadb_replay_peek(ipsa, esph->esph_replay)) { ESP_BUMP_STAT(espstack, replay_early_failures); IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_early_replay), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* - * Has this packet already been processed by a hardware - * IPsec accelerator? - */ - if (ii->ipsec_in_accelerated) { - ipsec_status_t rv; - esp3dbg(espstack, - ("esp_inbound: pkt processed by ill=%d isv6=%d\n", - ii->ipsec_in_ill_index, !ii->ipsec_in_v4)); - rv = esp_inbound_accelerated(ipsec_in_mp, - data_mp, ii->ipsec_in_v4, ipsa); - return (rv); - } - ESP_BUMP_STAT(espstack, noaccel); - - /* * Adjust the IP header's payload length to reflect the removal * of the ICV. 
*/ - if (!ii->ipsec_in_v4) { + if (!(ira->ira_flags & IRAF_IS_IPV4)) { ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - ipsa->ipsa_mac_len); @@ -1294,7 +1177,7 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) } /* submit the request to the crypto framework */ - return (esp_submit_req_inbound(ipsec_in_mp, ipsa, + return (esp_submit_req_inbound(data_mp, ira, ipsa, (uint8_t *)esph - data_mp->b_rptr)); } @@ -1303,21 +1186,15 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) * Called while holding the algorithm lock. */ static void -esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) +esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs, + netstack_t *ns) { sadb_comb_t *comb = (sadb_comb_t *)(prop + 1); - ipsec_out_t *io; ipsec_action_t *ap; ipsec_prot_t *prot; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; - io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock)); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; @@ -1327,9 +1204,10 @@ esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) prop->sadb_prop_replay = espstack->ipsecesp_replay_size; /* - * Based upon algorithm properties, and what-not, prioritize - * a proposal. If the IPSEC_OUT message has an algorithm specified, - * use it first and foremost. + * Based upon algorithm properties, and what-not, prioritize a + * proposal, based on the ordering of the ESP algorithms in the + * alternatives in the policy rule or socket that was placed + * in the acquire record. * * For each action in policy list * Add combination. If I've hit limit, return. @@ -1456,7 +1334,7 @@ esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns) /* Insert proposal here. 
*/ prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len); - esp_insert_prop(prop, acqrec, combs); + esp_insert_prop(prop, acqrec, combs, ns); samsg->sadb_msg_len += prop->sadb_prop_len; msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len); @@ -1756,13 +1634,11 @@ esp_port_freshness(uint32_t ports, ipsa_t *assoc) * If authentication was performed on the packet, this function is called * only if the authentication succeeded. * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * mblk chain data_mp. */ -static ipsec_status_t -esp_in_done(mblk_t *ipsec_in_mp) +static mblk_t * +esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - mblk_t *data_mp; ipsa_t *assoc; uint_t espstart; uint32_t ivlen = 0; @@ -1770,11 +1646,11 @@ esp_in_done(mblk_t *ipsec_in_mp) esph_t *esph; kstat_named_t *counter; boolean_t is_natt; - netstack_t *ns = ii->ipsec_in_ns; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; - assoc = ii->ipsec_in_esp_sa; + assoc = ira->ira_ipsec_esp_sa; ASSERT(assoc != NULL); is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0); @@ -1782,26 +1658,25 @@ esp_in_done(mblk_t *ipsec_in_mp) /* get the pointer to the ESP header */ if (assoc->ipsa_encr_alg == SADB_EALG_NULL) { /* authentication-only ESP */ - espstart = ii->ipsec_in_crypto_data.cd_offset; - processed_len = ii->ipsec_in_crypto_data.cd_length; + espstart = ic->ic_crypto_data.cd_offset; + processed_len = ic->ic_crypto_data.cd_length; } else { /* encryption present */ ivlen = assoc->ipsa_iv_len; if (assoc->ipsa_auth_alg == SADB_AALG_NONE) { /* encryption-only ESP */ - espstart = ii->ipsec_in_crypto_data.cd_offset - + espstart = ic->ic_crypto_data.cd_offset - sizeof (esph_t) - assoc->ipsa_iv_len; - processed_len = ii->ipsec_in_crypto_data.cd_length + + processed_len = 
ic->ic_crypto_data.cd_length + ivlen; } else { /* encryption with authentication */ - espstart = ii->ipsec_in_crypto_dual_data.dd_offset1; - processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 + + espstart = ic->ic_crypto_dual_data.dd_offset1; + processed_len = ic->ic_crypto_dual_data.dd_len2 + ivlen; } } - data_mp = ipsec_in_mp->b_cont; esph = (esph_t *)(data_mp->b_rptr + espstart); if (assoc->ipsa_auth_alg != IPSA_AALG_NONE || @@ -1840,8 +1715,11 @@ esp_in_done(mblk_t *ipsec_in_mp) goto drop_and_bail; } - if (is_natt) - esp_port_freshness(ii->ipsec_in_esp_udp_ports, assoc); + if (is_natt) { + ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS); + ASSERT(ira->ira_esp_udp_ports != 0); + esp_port_freshness(ira->ira_esp_udp_ports, assoc); + } } esp_set_usetime(assoc, B_TRUE); @@ -1863,44 +1741,41 @@ esp_in_done(mblk_t *ipsec_in_mp) * spews "branch, predict taken" code for this. */ - if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter, - espstack)) { - - if (is_system_labeled()) { - cred_t *cr = assoc->ipsa_cred; + if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4), + ivlen, &counter, espstack)) { - if (cr != NULL) { - mblk_setcred(data_mp, cr, NOPID); + if (is_system_labeled() && assoc->ipsa_tsl != NULL) { + if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) { + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_ah_nomem), + &espstack->esp_dropper); + BUMP_MIB(ira->ira_ill->ill_ip_mib, + ipIfStatsInDiscards); + return (NULL); } - } if (is_natt) return (esp_fix_natt_checksums(data_mp, assoc)); - ASSERT(!is_system_labeled() || (DB_CRED(data_mp) != NULL)); - if (assoc->ipsa_state == IPSA_STATE_IDLE) { /* * Cluster buffering case. Tell caller that we're * handling the packet. 
*/ - sadb_buf_pkt(assoc, ipsec_in_mp, ns); - return (IPSEC_STATUS_PENDING); + sadb_buf_pkt(assoc, data_mp, ira); + return (NULL); } - return (IPSEC_STATUS_SUCCESS); + return (data_mp); } esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n")); drop_and_bail: IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN message's - * ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter, &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -1908,11 +1783,10 @@ drop_and_bail: * argument is freed. */ static void -esp_log_bad_auth(mblk_t *ipsec_in) +esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - ipsa_t *assoc = ii->ipsec_in_esp_sa; - netstack_t *ns = ii->ipsec_in_ns; + ipsa_t *assoc = ira->ira_ipsec_esp_sa; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1928,11 +1802,7 @@ esp_log_bad_auth(mblk_t *ipsec_in) espstack->ipsecesp_netstack); IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_bad_auth), &espstack->esp_dropper); } @@ -1944,148 +1814,205 @@ esp_log_bad_auth(mblk_t *ipsec_in) * Returns B_TRUE if the AH processing was not needed or if it was * performed successfully. Returns B_FALSE and consumes the passed mblk * if AH processing was required but could not be performed. + * + * Returns data_mp unless data_mp was consumed/queued. 
*/ -static boolean_t -esp_do_outbound_ah(mblk_t *ipsec_mp) +static mblk_t * +esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - ipsec_status_t ipsec_rc; ipsec_action_t *ap; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - ipsec_policy_t *pp = io->ipsec_out_policy; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; ap = pp->ipsp_act; } if (!ap->ipa_want_ah) - return (B_TRUE); + return (data_mp); - ASSERT(io->ipsec_out_ah_done == B_FALSE); - - if (io->ipsec_out_ah_sa == NULL) { - if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) { - sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE); - return (B_FALSE); + /* + * Normally the AH SA would have already been put in place + * but it could have been flushed so we need to look for it. + */ + if (ixa->ixa_ipsec_ah_sa == NULL) { + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) { + sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE); + return (NULL); } } - ASSERT(io->ipsec_out_ah_sa != NULL); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); - io->ipsec_out_ah_done = B_TRUE; - ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); - return (ipsec_rc == IPSEC_STATUS_SUCCESS); + data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa); + return (data_mp); } /* * Kernel crypto framework callback invoked after completion of async - * crypto requests. + * crypto requests for outbound packets. 
*/ static void -esp_kcf_callback(void *arg, int status) +esp_kcf_callback_outbound(void *arg, int status) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstackid_t stackid; - netstack_t *ns, *ns_arg; - ipsecesp_stack_t *espstack; + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; ipsec_stack_t *ipss; + ipsecesp_stack_t *espstack; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ipsec_crypto_t *ic; + ill_t *ill; - ASSERT(ipsec_mp->b_cont != NULL); + /* + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. + */ + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); - if (is_inbound) { - stackid = ii->ipsec_in_stackid; - ns_arg = ii->ipsec_in_ns; + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) { + /* Disappeared on us - no ill/ipst for MIB */ + /* We have nowhere to do stats since ixa_ipst could be NULL */ + if (ixas.ixa_nce != NULL) { + ill = ixas.ixa_nce->nce_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + } + freemsg(data_mp); + goto done; + } + ns = ixas.ixa_ipst->ips_netstack; + espstack = ns->netstack_ipsecesp; + ipss = ns->netstack_ipsec; + ill = ixas.ixa_nce->nce_ill; + + if (status == CRYPTO_SUCCESS) { + /* + * If a ICV was computed, it was stored by the + * crypto framework at the end of the packet. 
+ */ + ipha_t *ipha = (ipha_t *)data_mp->b_rptr; + + esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE); + /* NAT-T packet. */ + if (IPH_HDR_VERSION(ipha) == IP_VERSION && + ipha->ipha_protocol == IPPROTO_UDP) + esp_prepare_udp(ns, data_mp, ipha); + + /* do AH processing if needed */ + data_mp = esp_do_outbound_ah(data_mp, &ixas); + if (data_mp == NULL) + goto done; + + (void) ip_output_post_ipsec(data_mp, &ixas); } else { - stackid = io->ipsec_out_stackid; - ns_arg = io->ipsec_out_ns; + /* Outbound shouldn't see invalid MAC */ + ASSERT(status != CRYPTO_INVALID_MAC); + + esp1dbg(espstack, + ("esp_kcf_callback_outbound: crypto failed with 0x%x\n", + status)); + ESP_BUMP_STAT(espstack, crypto_failures); + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_esp_crypto_failed), + &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); } +done: + ixa_cleanup(&ixas); + (void) ipsec_free_crypto_data(mp); +} + +/* + * Kernel crypto framework callback invoked after completion of async + * crypto requests for inbound packets. + */ +static void +esp_kcf_callback_inbound(void *arg, int status) +{ + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; + ipsecesp_stack_t *espstack; + ipsec_stack_t *ipss; + mblk_t *data_mp; + ip_recv_attr_t iras; + ipsec_crypto_t *ic; /* - * Verify that the netstack is still around; could have vanished - * while kEf was doing its work. + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. */ - ns = netstack_find_by_stackid(stackid); - if (ns == NULL || ns != ns_arg) { - /* Disappeared on us */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); - return; + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); + + /* + * Extract the ip_recv_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. 
+ */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + goto done; } + ns = iras.ira_ill->ill_ipst->ips_netstack; espstack = ns->netstack_ipsecesp; ipss = ns->netstack_ipsec; if (status == CRYPTO_SUCCESS) { - if (is_inbound) { - if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL); - } else { - /* - * If a ICV was computed, it was stored by the - * crypto framework at the end of the packet. - */ - ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - - esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE); - /* NAT-T packet. */ - if (ipha->ipha_protocol == IPPROTO_UDP) - esp_prepare_udp(ns, ipsec_mp->b_cont, ipha); - - /* do AH processing if needed */ - if (!esp_do_outbound_ah(ipsec_mp)) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, - NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h, - NULL, NULL); - } - } + data_mp = esp_in_done(data_mp, &iras, ic); + if (data_mp == NULL) + goto done; + /* finish IPsec processing */ + ip_input_post_ipsec(data_mp, &iras); } else if (status == CRYPTO_INVALID_MAC) { - esp_log_bad_auth(ipsec_mp); - + esp_log_bad_auth(data_mp, &iras); } else { esp1dbg(espstack, ("esp_kcf_callback: crypto failed with 0x%x\n", status)); ESP_BUMP_STAT(espstack, crypto_failures); - if (is_inbound) - IP_ESP_BUMP_STAT(ipss, in_discards); - else - ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL, + IP_ESP_BUMP_STAT(ipss, in_discards); + ip_drop_packet(data_mp, B_TRUE, iras.ira_ill, DROPPER(ipss, ipds_esp_crypto_failed), &espstack->esp_dropper); + 
BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards); } - netstack_rele(ns); +done: + ira_cleanup(&iras, B_TRUE); + (void) ipsec_free_crypto_data(mp); } /* * Invoked on crypto framework failure during inbound and outbound processing. */ static void -esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, - ipsecesp_stack_t *espstack) +esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc, + ill_t *ill, ipsecesp_stack_t *espstack) { ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec; esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n", is_inbound ? "inbound" : "outbound", kef_rc)); - ip_drop_packet(mp, is_inbound, NULL, NULL, + ip_drop_packet(data_mp, is_inbound, ill, DROPPER(ipss, ipds_esp_crypto_failed), &espstack->esp_dropper); ESP_BUMP_STAT(espstack, crypto_failures); @@ -2095,11 +2022,14 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, ESP_BUMP_STAT(espstack, out_discards); } -#define ESP_INIT_CALLREQ(_cr) { \ - (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \ - (_cr)->cr_callback_arg = ipsec_mp; \ - (_cr)->cr_callback_func = esp_kcf_callback; \ -} +/* + * A statement-equivalent macro, _cr MUST point to a modifiable + * crypto_call_req_t. + */ +#define ESP_INIT_CALLREQ(_cr, _mp, _callback) \ + (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \ + (_cr)->cr_callback_arg = (_mp); \ + (_cr)->cr_callback_func = (_callback) #define ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) { \ (mac)->cd_format = CRYPTO_DATA_RAW; \ @@ -2132,44 +2062,45 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, (data)->dd_offset2 = off2; \ } -static ipsec_status_t -esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) +/* + * Returns data_mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
+ */ +static mblk_t * +esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira, + ipsa_t *assoc, uint_t esph_offset) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t do_auth; uint_t auth_offset, msg_len, auth_len; - crypto_call_req_t call_req; - mblk_t *esp_mp; + crypto_call_req_t call_req, *callrp; + mblk_t *mp; esph_t *esph_ptr; - int kef_rc = CRYPTO_FAILED; + int kef_rc; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t auth_ctx_tmpl; - boolean_t do_encr; + boolean_t do_auth, do_encr, force; uint_t encr_offset, encr_len; uint_t iv_len = assoc->ipsa_iv_len; crypto_ctx_template_t encr_ctx_tmpl; - netstack_t *ns = ii->ipsec_in_ns; - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsec_crypto_t *ic, icstack; uchar_t *iv_ptr; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * esp_kcf_callback(). - */ - ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid); + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; + force = (assoc->ipsa_flags & IPSA_F_ASYNC); + +#ifdef IPSEC_LATENCY_TEST + kef_rc = CRYPTO_SUCCESS; +#else + kef_rc = CRYPTO_FAILED; +#endif /* * An inbound packet is of the form: - * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad] + * [IP,options,ESP,IV,data,ICV,pad] */ - esp_mp = ipsec_mp->b_cont; esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset); iv_ptr = (uchar_t *)(esph_ptr + 1); /* Packet length starting at IP header ending after ESP ICV. 
*/ @@ -2178,8 +2109,6 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) encr_offset = esph_offset + sizeof (esph_t) + iv_len; encr_len = msg_len - encr_offset; - ESP_INIT_CALLREQ(&call_req); - /* * Counter mode algs need a nonce. This is setup in sadb_common_add(). * If for some reason we are using a SA which does not have a nonce @@ -2187,23 +2116,40 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) */ if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) && (assoc->ipsa_nonce == NULL)) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, + ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - if (do_auth) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_recv_attr_to_mblk(ira)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", esp_mp, + ira->ira_ill); + return (NULL); + } + linkb(mp, esp_mp); + callrp = &call_req; + ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. 
+ */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } + if (do_auth) { /* authentication context template */ IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, auth_ctx_tmpl); /* ICV to be verified */ - ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac, + ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, esp_mp->b_wptr - icv_len); /* authentication starts at the ESP header */ @@ -2212,79 +2158,90 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) if (!do_encr) { /* authentication only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, auth_offset, auth_len); /* call the crypto framework */ kef_rc = crypto_mac_verify(&assoc->ipsa_amech, - &ii->ipsec_in_crypto_data, + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, auth_ctx_tmpl, - &ii->ipsec_in_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); } } if (do_encr) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; - /* encryption template */ IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR, encr_ctx_tmpl); /* Call the nonce update function. 
Also passes in IV */ (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len, - iv_ptr, &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data); + iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data); if (!do_auth) { /* decryption only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, encr_offset, encr_len); /* call the crypto framework */ kef_rc = crypto_decrypt((crypto_mechanism_t *) - &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data, + &ic->ic_cmm, &ic->ic_crypto_data, &assoc->ipsa_kcfencrkey, encr_ctx_tmpl, - NULL, &call_req); + NULL, callrp); } } if (do_auth && do_encr) { /* dual operation */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data, + ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data, esp_mp, auth_offset, auth_len, encr_offset, encr_len - icv_len); /* specify IV */ - ii->ipsec_in_crypto_dual_data.dd_miscdata = (char *)iv_ptr; + ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr; /* call the framework */ kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech, - &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data, + &assoc->ipsa_emech, &ic->ic_crypto_dual_data, &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey, - auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac, - NULL, &call_req); + auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac, + NULL, callrp); } switch (kef_rc) { case CRYPTO_SUCCESS: ESP_BUMP_STAT(espstack, crypto_sync); - return (esp_in_done(ipsec_mp)); + esp_mp = esp_in_done(esp_mp, ira, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (esp_mp); case CRYPTO_QUEUED: - /* esp_kcf_callback() will be invoked on completion */ + /* esp_kcf_callback_inbound() will be invoked on completion */ ESP_BUMP_STAT(espstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); case CRYPTO_INVALID_MAC: + if (force) { + mp = 
ipsec_free_crypto_data(mp); + esp_mp = ip_recv_attr_free_mblk(mp); + } ESP_BUMP_STAT(espstack, crypto_sync); - esp_log_bad_auth(ipsec_mp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + esp_log_bad_auth(esp_mp, ira); + /* esp_mp was passed to ip_drop_packet */ + return (NULL); } - esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack); - return (IPSEC_STATUS_FAILED); + mp = ipsec_free_crypto_data(mp); + esp_mp = ip_recv_attr_free_mblk(mp); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack); + /* esp_mp was passed to ip_drop_packet */ + return (NULL); } /* @@ -2293,6 +2250,9 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) * uses mblk-insertion to insert the UDP header. * TODO - If there is an easy way to prep a packet for HW checksums, make * it happen here. + * Note that this is used before both before calling ip_output_simple and + * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the + * latter. */ static void esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) @@ -2313,7 +2273,7 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) /* arr points to the IP header. */ arr = (uint16_t *)ipha; IP_STAT(ns->netstack_ip, ip_out_sw_cksum); - IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes, + IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes, ntohs(htons(ipha->ipha_length) - hlen)); /* arr[6-9] are the IP addresses. 
*/ cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] + @@ -2336,41 +2296,45 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) static void actually_send_keepalive(void *arg) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - ipha_t *ipha; - netstack_t *ns; - - ASSERT(DB_TYPE(ipsec_mp) == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(ipsec_mp->b_cont != NULL); - ASSERT(DB_TYPE(ipsec_mp->b_cont) == M_DATA); - - ns = netstack_find_by_stackid(io->ipsec_out_stackid); - if (ns == NULL || ns != io->ipsec_out_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); + mblk_t *mp = (mblk_t *)arg; + ip_xmit_attr_t ixas; + netstack_t *ns; + netstackid_t stackid; + + stackid = (netstackid_t)(uintptr_t)mp->b_prev; + mp->b_prev = NULL; + ns = netstack_find_by_stackid(stackid); + if (ns == NULL) { + /* Disappeared */ + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); return; } - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_ipst = ns->netstack_ip; + /* No ULP checksum; done by esp_prepare_udp */ + ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); netstack_rele(ns); } /* - * Send a one-byte UDP NAT-T keepalive. Construct an IPSEC_OUT too that'll - * get fed into esp_send_udp/ip_wput_ipsec_out. + * Send a one-byte UDP NAT-T keepalive. 
*/ void ipsecesp_send_keepalive(ipsa_t *assoc) { - mblk_t *mp = NULL, *ipsec_mp = NULL; - ipha_t *ipha; - udpha_t *udpha; - ipsec_out_t *io; + mblk_t *mp; + ipha_t *ipha; + udpha_t *udpha; + netstack_t *ns = assoc->ipsa_netstack; ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock)); @@ -2399,85 +2363,78 @@ ipsecesp_send_keepalive(ipsa_t *assoc) mp->b_wptr = (uint8_t *)(udpha + 1); *(mp->b_wptr++) = 0xFF; - ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack); - if (ipsec_mp == NULL) { - freeb(mp); - return; - } - ipsec_mp->b_cont = mp; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - io->ipsec_out_zoneid = - netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid); - io->ipsec_out_stackid = assoc->ipsa_netstack->netstack_stackid; + esp_prepare_udp(ns, mp, ipha); - esp_prepare_udp(assoc->ipsa_netstack, mp, ipha); /* * We're holding an isaf_t bucket lock, so pawn off the actual * packet transmission to another thread. Just in case syncq * processing causes a same-bucket packet to be processed. */ - if (taskq_dispatch(esp_taskq, actually_send_keepalive, ipsec_mp, + mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid; + + if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp, TQ_NOSLEEP) == 0) { /* Assume no memory if taskq_dispatch() fails. */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(assoc->ipsa_netstack->netstack_ipsec, - ipds_esp_nomem), - &assoc->ipsa_netstack->netstack_ipsecesp->esp_dropper); + mp->b_prev = NULL; + ip_drop_packet(mp, B_FALSE, NULL, + DROPPER(ns->netstack_ipsec, ipds_esp_nomem), + &ns->netstack_ipsecesp->esp_dropper); } } -static ipsec_status_t -esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, - uint_t payload_len) +/* + * Returns mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
+ */ +static mblk_t * +esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc, + uchar_t *icv_buf, uint_t payload_len) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; uint_t auth_len; - crypto_call_req_t call_req; - mblk_t *esp_mp, *data_mp, *ip_mp; + crypto_call_req_t call_req, *callrp; + mblk_t *esp_mp; esph_t *esph_ptr; + mblk_t *mp; int kef_rc = CRYPTO_FAILED; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t auth_ctx_tmpl; - boolean_t do_auth; - boolean_t do_encr; + boolean_t do_auth, do_encr, force; uint_t iv_len = assoc->ipsa_iv_len; crypto_ctx_template_t encr_ctx_tmpl; boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0); size_t esph_offset = (is_natt ? UDPH_SIZE : 0); - netstack_t *ns = io->ipsec_out_ns; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_crypto_t *ic, icstack; + uchar_t *iv_ptr; + crypto_data_t *cd_ptr = NULL; + ill_t *ill = ixa->ixa_nce->nce_ill; ipsec_stack_t *ipss = ns->netstack_ipsec; - uchar_t *iv_ptr; - crypto_data_t *cd_ptr = NULL; esp3dbg(espstack, ("esp_submit_req_outbound:%s", is_natt ? "natt" : "not natt")); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * esp_kcf_callback(). 
- */ - io->ipsec_out_stackid = ns->netstack_stackid; - do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; + force = (assoc->ipsa_flags & IPSA_F_ASYNC); + +#ifdef IPSEC_LATENCY_TEST + kef_rc = CRYPTO_SUCCESS; +#else + kef_rc = CRYPTO_FAILED; +#endif /* * Outbound IPsec packets are of the form: - * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV] + * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV] * unless it's NATT, then it's - * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV] + * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV] * Get a pointer to the mblk containing the ESP header. */ - ip_mp = ipsec_mp->b_cont; - esp_mp = ipsec_mp->b_cont->b_cont; - ASSERT(ip_mp != NULL && esp_mp != NULL); + ASSERT(data_mp->b_cont != NULL); + esp_mp = data_mp->b_cont; esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset); iv_ptr = (uchar_t *)(esph_ptr + 1); - data_mp = ipsec_mp->b_cont->b_cont->b_cont; /* * Combined mode algs need a nonce. This is setup in sadb_common_add(). @@ -2486,25 +2443,42 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, */ if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) && (assoc->ipsa_nonce == NULL)) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, NULL, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - ESP_INIT_CALLREQ(&call_req); + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + freemsg(data_mp); + return (NULL); + } + + linkb(mp, data_mp); + callrp = &call_req; + ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. 
+ */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } - if (do_auth) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; + if (do_auth) { /* authentication context template */ IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, auth_ctx_tmpl); /* where to store the computed mac */ - ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac, + ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, icv_buf); /* authentication starts at the ESP header */ @@ -2512,35 +2486,30 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, if (!do_encr) { /* authentication only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, esph_offset, auth_len); /* call the crypto framework */ kef_rc = crypto_mac(&assoc->ipsa_amech, - &io->ipsec_out_crypto_data, + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, auth_ctx_tmpl, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); } } if (do_encr) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; - /* encryption context template */ IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR, encr_ctx_tmpl); /* Call the nonce update function. 
*/ (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len, - iv_ptr, &io->ipsec_out_cmm, &io->ipsec_out_crypto_data); + iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data); if (!do_auth) { /* encryption only, skip mblk that contains ESP hdr */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, - data_mp, 0, payload_len); + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, + esp_mp->b_cont, 0, payload_len); /* * For combined mode ciphers, the ciphertext is the same @@ -2556,20 +2525,19 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, * for the cipher to use. */ if (assoc->ipsa_flags & IPSA_F_COMBINED) { - bcopy(&io->ipsec_out_crypto_data, - &io->ipsec_out_crypto_mac, + bcopy(&ic->ic_crypto_data, + &ic->ic_crypto_mac, sizeof (crypto_data_t)); - io->ipsec_out_crypto_mac.cd_length = + ic->ic_crypto_mac.cd_length = payload_len + icv_len; - cd_ptr = &io->ipsec_out_crypto_mac; + cd_ptr = &ic->ic_crypto_mac; } /* call the crypto framework */ kef_rc = crypto_encrypt((crypto_mechanism_t *) - &io->ipsec_out_cmm, - &io->ipsec_out_crypto_data, + &ic->ic_cmm, &ic->ic_crypto_data, &assoc->ipsa_kcfencrkey, encr_ctx_tmpl, - cd_ptr, &call_req); + cd_ptr, callrp); } } @@ -2584,49 +2552,58 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, * the authentication at the ESP header, i.e. use an * authentication offset of zero. 
*/ - ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data, + ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data, esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len); /* specify IV */ - io->ipsec_out_crypto_dual_data.dd_miscdata = (char *)iv_ptr; + ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr; /* call the framework */ kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech, &assoc->ipsa_amech, NULL, &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey, encr_ctx_tmpl, auth_ctx_tmpl, - &io->ipsec_out_crypto_dual_data, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_dual_data, + &ic->ic_crypto_mac, callrp); } switch (kef_rc) { case CRYPTO_SUCCESS: ESP_BUMP_STAT(espstack, crypto_sync); esp_set_usetime(assoc, B_FALSE); + if (force) { + mp = ipsec_free_crypto_data(mp); + data_mp = ip_xmit_attr_free_mblk(mp); + } if (is_natt) - esp_prepare_udp(ns, ipsec_mp->b_cont, - (ipha_t *)ipsec_mp->b_cont->b_rptr); - return (IPSEC_STATUS_SUCCESS); + esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr); + return (data_mp); case CRYPTO_QUEUED: - /* esp_kcf_callback() will be invoked on completion */ + /* esp_kcf_callback_outbound() will be invoked on completion */ ESP_BUMP_STAT(espstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); } - esp_crypto_failed(ipsec_mp, B_FALSE, kef_rc, espstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + data_mp = ip_xmit_attr_free_mblk(mp); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack); + /* data_mp was passed to ip_drop_packet */ + return (NULL); } /* * Handle outbound IPsec processing for IPv4 and IPv6 - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * + * Returns data_mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
*/ -static ipsec_status_t -esp_outbound(mblk_t *mp) +static mblk_t * +esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp; - ipsec_out_t *io; + mblk_t *espmp, *tailmp; ipha_t *ipha; ip6_t *ip6h; esph_t *esph_ptr, *iv_ptr; @@ -2640,17 +2617,11 @@ esp_outbound(mblk_t *mp) uchar_t *icv_buf; udpha_t *udpha; boolean_t is_natt = B_FALSE; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - ipsec_out_mp = mp; - data_mp = ipsec_out_mp->b_cont; - - io = (ipsec_out_t *)ipsec_out_mp->b_rptr; - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t need_refrele = B_FALSE; ESP_BUMP_STAT(espstack, out_requests); @@ -2662,65 +2633,73 @@ esp_outbound(mblk_t *mp) * we might as well make use of msgpullup() and get the mblk into one * contiguous piece! */ - ipsec_out_mp->b_cont = msgpullup(data_mp, -1); - if (ipsec_out_mp->b_cont == NULL) { + tailmp = msgpullup(data_mp, -1); + if (tailmp == NULL) { esp0dbg(("esp_outbound: msgpullup() failed, " "dropping packet.\n")); - ipsec_out_mp->b_cont = data_mp; - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } else { - freemsg(data_mp); - data_mp = ipsec_out_mp->b_cont; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (NULL); } + freemsg(data_mp); + data_mp = tailmp; - assoc = io->ipsec_out_esp_sa; + assoc = ixa->ixa_ipsec_esp_sa; ASSERT(assoc != NULL); /* * Get the outer IP header in shape to escape this system.. 
*/ - if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) { - int whack; - - mblk_setcred(data_mp, assoc->ipsa_ocred, NOPID); - if (io->ipsec_out_v4) - whack = sadb_whack_label(&data_mp, assoc); - else - whack = sadb_whack_label_v6(&data_mp, assoc); - if (whack != 0) { - ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, - NULL, DROPPER(ipss, ipds_esp_nomem), + if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) { + /* + * Need to update packet with any CIPSO option and update + * ixa_tsl to capture the new label. + * We allocate a separate ixa for that purpose. + */ + ixa = ip_xmit_attr_duplicate(ixa); + if (ixa == NULL) { + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - ipsec_out_mp->b_cont = data_mp; - } + need_refrele = B_TRUE; + label_hold(assoc->ipsa_otsl); + ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl); + + data_mp = sadb_whack_label(data_mp, assoc, ixa, + DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); + if (data_mp == NULL) { + /* Packet dropped by sadb_whack_label */ + ixa_refrele(ixa); + return (NULL); + } + } /* * Reality check.... */ ipha = (ipha_t *)data_mp->b_rptr; /* So we can call esp_acquire(). 
*/ - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + af = AF_INET; divpoint = IPH_HDR_LENGTH(ipha); datalen = ntohs(ipha->ipha_length) - divpoint; nhp = (uint8_t *)&ipha->ipha_protocol; } else { - ip6_pkt_t ipp; + ip_pkt_t ipp; + + ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); af = AF_INET6; ip6h = (ip6_t *)ipha; bzero(&ipp, sizeof (ipp)); - divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL); if (ipp.ipp_dstopts != NULL && ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) { /* @@ -2795,28 +2774,26 @@ esp_outbound(mblk_t *mp) */ if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) { - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_bytes_expire), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } espmp = allocb(esplen, BPRI_HI); if (espmp == NULL) { ESP_BUMP_STAT(espstack, out_discards); esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n")); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } espmp->b_wptr += esplen; esph_ptr = (esph_t *)espmp->b_rptr; @@ -2853,14 +2830,13 @@ esp_outbound(mblk_t *mp) ESP_BUMP_STAT(espstack, out_discards); sadb_replay_delete(assoc); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). 
- */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_replay), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } iv_ptr = (esph_ptr + 1); @@ -2887,9 +2863,11 @@ esp_outbound(mblk_t *mp) */ if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc, espstack)) { - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* Fix the IP header. */ @@ -2898,7 +2876,7 @@ esp_outbound(mblk_t *mp) protocol = *nhp; - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj); if (is_natt) { *nhp = IPPROTO_UDP; @@ -2922,15 +2900,14 @@ esp_outbound(mblk_t *mp) if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) { ESP_BUMP_STAT(espstack, out_discards); /* NOTE: esp_insert_esp() only fails if there's no memory. */ - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); freeb(espmp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* Append padding (and leave room for ICV). */ @@ -2941,14 +2918,13 @@ esp_outbound(mblk_t *mp) if (tailmp->b_cont == NULL) { ESP_BUMP_STAT(espstack, out_discards); esp0dbg(("esp_outbound: Can't allocate tailmp.\n")); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). 
- */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } tailmp = tailmp->b_cont; } @@ -2968,29 +2944,6 @@ esp_outbound(mblk_t *mp) esp2dbg(espstack, (dump_msg(data_mp))); /* - * The packet is eligible for hardware acceleration if the - * following conditions are satisfied: - * - * 1. the packet will not be fragmented - * 2. the provider supports the algorithms specified by SA - * 3. there is no pending control message being exchanged - * 4. snoop is not attached - * 5. the destination address is not a multicast address - * - * All five of these conditions are checked by IP prior to - * sending the packet to ESP. - * - * But We, and We Alone, can, nay MUST check if the packet - * is over NATT, and then disqualify it from hardware - * acceleration. - */ - - if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) { - return (esp_outbound_accelerated(ipsec_out_mp, mac_len)); - } - ESP_BUMP_STAT(espstack, noaccel); - - /* * Okay. I've set up the pre-encryption ESP. Let's do it! */ @@ -3002,32 +2955,23 @@ esp_outbound(mblk_t *mp) icv_buf = NULL; } - return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf, - datalen + padlen + 2)); + data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf, + datalen + padlen + 2); + if (need_refrele) + ixa_refrele(ixa); + return (data_mp); } /* * IP calls this to validate the ICMP errors that * we got from the network. 
*/ -ipsec_status_t -ipsecesp_icmp_error(mblk_t *ipsec_mp) +mblk_t * +ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - if (is_inbound) { - ns = ii->ipsec_in_ns; - } else { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - - ns = io->ipsec_out_ns; - } - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; /* * Unless we get an entire packet back, this function is useless. @@ -3044,55 +2988,10 @@ ipsecesp_icmp_error(mblk_t *ipsec_mp) * very small, we discard here. */ IP_ESP_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_icmp), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); -} - -/* - * ESP module read put routine. - */ -/* ARGSUSED */ -static void -ipsecesp_rput(queue_t *q, mblk_t *mp) -{ - ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr; - - ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */ - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* TPI message of some sort. */ - switch (*((t_scalar_t *)mp->b_rptr)) { - case T_BIND_ACK: - esp3dbg(espstack, - ("Thank you IP from ESP for T_BIND_ACK\n")); - break; - case T_ERROR_ACK: - cmn_err(CE_WARN, - "ipsecesp: ESP received T_ERROR_ACK from IP."); - /* - * Make esp_sadb.s_ip_q NULL, and in the - * future, perhaps try again. - */ - espstack->esp_sadb.s_ip_q = NULL; - break; - case T_OK_ACK: - /* Probably from a (rarely sent) T_UNBIND_REQ. */ - break; - default: - esp0dbg(("Unknown M_{,PC}PROTO message.\n")); - } - freemsg(mp); - break; - default: - /* For now, passthru message. 
*/ - esp2dbg(espstack, ("ESP got unknown mblk type %d.\n", - mp->b_datap->db_type)); - putnext(q, mp); - } + return (NULL); } /* @@ -3102,7 +3001,7 @@ ipsecesp_rput(queue_t *q, mblk_t *mp) */ static boolean_t esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, - ipsecesp_stack_t *espstack, mblk_t *in_mp) + ipsecesp_stack_t *espstack, cred_t *cr) { mblk_t *pfkey_msg_mp, *keysock_out_mp; sadb_msg_t *samsg; @@ -3121,7 +3020,7 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, sadb_sens_t *sens; size_t sens_len = 0; sadb_ext_t *nextext; - cred_t *sens_cr = NULL; + ts_label_t *sens_tsl = NULL; /* Allocate the KEYSOCK_OUT. */ keysock_out_mp = sadb_keysock_out(serial); @@ -3130,11 +3029,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, return (B_FALSE); } - if (is_system_labeled() && (in_mp != NULL)) { - sens_cr = msg_getcred(in_mp, NULL); - - if (sens_cr != NULL) { - sens_len = sadb_sens_len_from_cred(sens_cr); + if (is_system_labeled() && (cr != NULL)) { + sens_tsl = crgetlabel(cr); + if (sens_tsl != NULL) { + sens_len = sadb_sens_len_from_label(sens_tsl); allocsize += sens_len; } } @@ -3268,10 +3166,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, mutex_exit(&ipss->ipsec_alg_lock); - if (sens_cr != NULL) { + if (sens_tsl != NULL) { sens = (sadb_sens_t *)nextext; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - sens_cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + sens_tsl, sens_len); nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len); } @@ -3336,40 +3234,61 @@ ipsecesp_algs_changed(netstack_t *ns) /* * Stub function that taskq_dispatch() invokes to take the mblk (in arg) - * and put() it into AH and STREAMS again. + * and send() it into ESP and IP again. 
*/ static void inbound_task(void *arg) { - esph_t *esph; - mblk_t *mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - netstack_t *ns; - ipsecesp_stack_t *espstack; - int ipsec_rc; - - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns == NULL || ns != ii->ipsec_in_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + ip_recv_attr_t iras; + + async_mp = mp; + mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); freemsg(mp); - return; + goto done; } - espstack = ns->netstack_ipsecesp; + esp_inbound_restart(mp, &iras); +done: + ira_cleanup(&iras, B_TRUE); +} + +/* + * Restart ESP after the SA has been added. + */ +static void +esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira) +{ + esph_t *esph; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; esp2dbg(espstack, ("in ESP inbound_task")); ASSERT(espstack != NULL); - esph = ipsec_inbound_esp_sa(mp, ns); - if (esph != NULL) { - ASSERT(ii->ipsec_in_esp_sa != NULL); - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph); - if (ipsec_rc == IPSEC_STATUS_SUCCESS) - ip_fanout_proto_again(mp, NULL, NULL, NULL); + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. 
+ */ + return; } - netstack_rele(ns); + + ip_input_post_ipsec(mp, ira); } /* @@ -3533,17 +3452,21 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, if (larval != NULL) lpkt = sadb_clear_lpkt(larval); - rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q, + rc = sadb_common_add(espstack->esp_pfkey_q, mp, samsg, ksi, primary, secondary, larval, clone, is_inbound, diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb); - if (rc == 0 && lpkt != NULL) - rc = !taskq_dispatch(esp_taskq, inbound_task, lpkt, TQ_NOSLEEP); - - if (rc != 0) { - ip_drop_packet(lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &espstack->esp_dropper); + if (lpkt != NULL) { + if (rc == 0) { + rc = !taskq_dispatch(esp_taskq, inbound_task, + lpkt, TQ_NOSLEEP); + } + if (rc != 0) { + lpkt = ip_recv_attr_free_mblk(lpkt); + ip_drop_packet(lpkt, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &espstack->esp_dropper); + } } /* @@ -3551,45 +3474,78 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, * esp_outbound() calls? 
*/ + /* Handle the packets queued waiting for the SA */ while (acq_msgs != NULL) { - mblk_t *mp = acq_msgs; + mblk_t *asyncmp; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ill_t *ill; + asyncmp = acq_msgs; acq_msgs = acq_msgs->b_next; - mp->b_next = NULL; - if (rc == 0) { - if (ipsec_outbound_sa(mp, IPPROTO_ESP)) { - ((ipsec_out_t *)(mp->b_rptr))-> - ipsec_out_esp_done = B_TRUE; - if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) { - ipha_t *ipha; - - /* do AH processing if needed */ - if (!esp_do_outbound_ah(mp)) - continue; - - ipha = (ipha_t *)mp->b_cont->b_rptr; - - /* finish IPsec processing */ - if (IPH_HDR_VERSION(ipha) == - IP_VERSION) { - ip_wput_ipsec_out(NULL, mp, - ipha, NULL, NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, - mp, ip6h, NULL, NULL); - } - } - continue; - } + asyncmp->b_next = NULL; + + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while iked was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = asyncmp->b_cont; + asyncmp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) { + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, NULL, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &espstack->esp_dropper); + } else if (rc != 0) { + ill = ixas.ixa_nce->nce_ill; + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + } else { + esp_outbound_finish(data_mp, &ixas); } + ixa_cleanup(&ixas); + } + + return (rc); +} + +/* + * Process one of the queued messages (from ipsacq_mp) once the SA + * has been added. 
+ */ +static void +esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa) +{ + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) { ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_sadb_acquire_timeout), &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return; } - return (rc); + data_mp = esp_outbound(data_mp, ixa); + if (data_mp == NULL) + return; + + /* do AH processing if needed */ + data_mp = esp_do_outbound_ah(data_mp, ixa); + if (data_mp == NULL) + return; + + (void) ip_output_post_ipsec(data_mp, ixa); } /* @@ -3674,11 +3630,13 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns) return (EINVAL); } +#ifndef IPSEC_LATENCY_TEST if (assoc->sadb_sa_encrypt == SADB_EALG_NULL && assoc->sadb_sa_auth == SADB_AALG_NONE) { *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG; return (EINVAL); } +#endif if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) { *diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS; @@ -3734,7 +3692,11 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns) /* * First locate the authentication algorithm. */ +#ifdef IPSEC_LATENCY_TEST + if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) { +#else if (akey != NULL) { +#endif ipsec_alginfo_t *aalg; aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH] @@ -3883,7 +3845,7 @@ esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, return (sadb_purge_sa(mp, ksi, (sin->sin_family == AF_INET6) ? 
&espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4, diagnostic, - espstack->esp_pfkey_q, espstack->esp_sadb.s_ip_q)); + espstack->esp_pfkey_q)); } return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic, @@ -4024,7 +3986,7 @@ esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack) * Keysock takes care of the PF_KEY bookkeeping for this. */ if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid, - ksi->ks_in_serial, espstack, mp)) { + ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) { freemsg(mp); } else { /* @@ -4109,8 +4071,7 @@ esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack) samsg->sadb_msg_errno = kse->ks_err_errno; samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg)); /* - * Use the write-side of the esp_pfkey_q, in case there is - * no esp_sadb.s_ip_q. + * Use the write-side of the esp_pfkey_q */ sadb_in_acquire(samsg, &espstack->esp_sadb, WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack); @@ -4197,236 +4158,23 @@ ipsecesp_wput(queue_t *q, mblk_t *mp) } /* - * Process an outbound ESP packet that can be accelerated by a IPsec - * hardware acceleration capable Provider. - * The caller already inserted and initialized the ESP header. - * This function allocates a tagging M_CTL, and adds room at the end - * of the packet to hold the ICV if authentication is needed. - * - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_out. 
- */ -static ipsec_status_t -esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len) -{ - ipsec_out_t *io; - mblk_t *lastmp; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - io = (ipsec_out_t *)ipsec_out->b_rptr; - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; - - ESP_BUMP_STAT(espstack, out_accelerated); - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(io->ipsec_out_accelerated == B_FALSE); - io->ipsec_out_accelerated = B_TRUE; - - /* - * add room at the end of the packet for the ICV if needed - */ - if (icv_len > 0) { - /* go to last mblk */ - lastmp = ipsec_out; /* For following while loop. */ - do { - lastmp = lastmp->b_cont; - } while (lastmp->b_cont != NULL); - - /* if not enough available room, allocate new mblk */ - if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) { - lastmp->b_cont = allocb(icv_len, BPRI_HI); - if (lastmp->b_cont == NULL) { - ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_esp_nomem), - &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } - lastmp = lastmp->b_cont; - } - lastmp->b_wptr += icv_len; - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* - * Process an inbound accelerated ESP packet. - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in. 
- */ -static ipsec_status_t -esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4, - ipsa_t *assoc) -{ - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - mblk_t *hada_mp; - uint32_t icv_len = 0; - da_ipsec_t *hada; - ipha_t *ipha; - ip6_t *ip6h; - kstat_named_t *counter; - netstack_t *ns = ii->ipsec_in_ns; - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ESP_BUMP_STAT(espstack, in_accelerated); - - hada_mp = ii->ipsec_in_da; - ASSERT(hada_mp != NULL); - hada = (da_ipsec_t *)hada_mp->b_rptr; - - /* - * We only support one level of decapsulation in hardware, so - * nuke the pointer. - */ - ii->ipsec_in_da = NULL; - ii->ipsec_in_accelerated = B_FALSE; - - if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) { - /* - * ESP with authentication. We expect the Provider to have - * computed the ICV and placed it in the hardware acceleration - * data attributes. - * - * Extract ICV length from attributes M_CTL and sanity check - * its value. We allow the mblk to be smaller than da_ipsec_t - * for a small ICV, as long as the entire ICV fits within the - * mblk. - * - * Also ensures that the ICV length computed by Provider - * corresponds to the ICV length of the agorithm specified by - * the SA. - */ - icv_len = hada->da_icv_len; - if ((icv_len != assoc->ipsa_mac_len) || - (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) < - (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) { - esp0dbg(("esp_inbound_accelerated: " - "ICV len (%u) incorrect or mblk too small (%u)\n", - icv_len, (uint32_t)(MBLKL(hada_mp)))); - counter = DROPPER(ipss, ipds_esp_bad_auth); - goto esp_in_discard; - } - } - - /* get pointers to IP header */ - if (isv4) { - ipha = (ipha_t *)data_mp->b_rptr; - } else { - ip6h = (ip6_t *)data_mp->b_rptr; - } - - /* - * Compare ICV in ESP packet vs ICV computed by adapter. - * We also remove the ICV from the end of the packet since - * it will no longer be needed. 
- * - * Assume that esp_inbound() already ensured that the pkt - * was in one mblk. - */ - ASSERT(data_mp->b_cont == NULL); - data_mp->b_wptr -= icv_len; - /* adjust IP header */ - if (isv4) - ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len); - else - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len); - if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. Don't print to the console, block - * potential denial-of-service attack. - */ - ESP_BUMP_STAT(espstack, bad_auth); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "ESP Authentication failed spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack); - counter = DROPPER(ipss, ipds_esp_bad_auth); - goto esp_in_discard; - } - - esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication " - "succeeded, checking replay\n")); - - ipsec_in->b_cont = data_mp; - - /* - * Remove ESP header and padding from packet. - */ - if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len, - &counter, espstack)) { - esp1dbg(espstack, ("esp_inbound_accelerated: " - "esp_strip_header() failed\n")); - goto esp_in_discard; - } - - freeb(hada_mp); - - if (is_system_labeled() && (assoc->ipsa_cred != NULL)) - mblk_setcred(data_mp, assoc->ipsa_cred, NOPID); - - /* - * Account for usage.. - */ - if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) { - /* The ipsa has hit hard expiration, LOG and AUDIT. 
*/ - ESP_BUMP_STAT(espstack, bytes_expired); - IP_ESP_BUMP_STAT(ipss, in_discards); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "ESP association 0x%x, dst %s had bytes expire.\n", - assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam, - espstack->ipsecesp_netstack); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_esp_bytes_expire), - &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } - - /* done processing the packet */ - return (IPSEC_STATUS_SUCCESS); - -esp_in_discard: - IP_ESP_BUMP_STAT(ipss, in_discards); - freeb(hada_mp); - - ipsec_in->b_cont = data_mp; /* For ip_drop_packet()'s sake... */ - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, - &espstack->esp_dropper); - - return (IPSEC_STATUS_FAILED); -} - -/* * Wrapper to allow IP to trigger an ESP association failure message * during inbound SA selection. */ void ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt, - uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack) + uint32_t spi, void *addr, int af, ip_recv_attr_t *ira) { - ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; if (espstack->ipsecesp_log_unknown_spi) { ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi, addr, af, espstack->ipsecesp_netstack); } - ip_drop_packet(mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_no_sa), &espstack->esp_dropper); } diff --git a/usr/src/uts/common/inet/ip/keysock.c b/usr/src/uts/common/inet/ip/keysock.c index ca82eeece0..855af28bb2 100644 --- a/usr/src/uts/common/inet/ip/keysock.c +++ b/usr/src/uts/common/inet/ip/keysock.c @@ -852,7 +852,7 @@ keysock_opt_get(queue_t *q, int level, int name, uchar_t *ptr) int keysock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name, uint_t inlen, uchar_t *invalp, uint_t 
*outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)invalp, errno = 0; keysock_t *ks = (keysock_t *)q->q_ptr; @@ -936,11 +936,9 @@ keysock_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)mp->b_rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &keysock_opt_obj, B_FALSE); + svr4_optcom_req(q, mp, cr, &keysock_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &keysock_opt_obj, B_FALSE); + tpi_optcom_req(q, mp, cr, &keysock_opt_obj); } break; case T_DATA_REQ: diff --git a/usr/src/uts/common/inet/ip/keysock_opt_data.c b/usr/src/uts/common/inet/ip/keysock_opt_data.c index d8d9f1d0ad..4dee663d42 100644 --- a/usr/src/uts/common/inet/ip/keysock_opt_data.c +++ b/usr/src/uts/common/inet/ip/keysock_opt_data.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 1996-1998,2001-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 1 @@ -51,11 +48,11 @@ */ opdes_t keysock_opt_arr[] = { - { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, }; @@ -88,7 +85,6 @@ optdb_obj_t keysock_opt_obj = { NULL, /* KEYSOCK default value function pointer */ keysock_opt_get, /* KEYSOCK get function pointer */ keysock_opt_set, /* KEYSOCK set function pointer */ - B_TRUE, /* KEYSOCK is tpi provider */ KEYSOCK_OPT_ARR_CNT, /* KEYSOCK option database count of entries */ keysock_opt_arr, /* KEYSOCK option database */ KEYSOCK_VALID_LEVELS_CNT, /* KEYSOCK valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index ce3ac6faca..d5a1d84395 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -72,7 +72,6 @@ * Addresses are assigned to interfaces. * ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed. * No route is found while sending a packet. - * When TCP requests IP to remove an IRE_CACHE of a troubled destination. 
* * Since all we do is reformat the messages between routing socket and * ioctl forms, no synchronization is necessary in this module; all @@ -113,7 +112,8 @@ static rtsparam_t lcl_param_arr[] = { static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error); -static void rts_input(void *, mblk_t *, void *); +static void rts_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void rts_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static mblk_t *rts_ioctl_alloc(mblk_t *data); static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt); @@ -211,28 +211,28 @@ rts_common_close(queue_t *q, conn_t *connp) if (!IPCL_IS_NONSTR(connp)) { qprocsoff(q); + } - /* - * Now we are truly single threaded on this stream, and can - * delete the things hanging off the connp, and finally the - * connp. - * We removed this connp from the fanout list, it cannot be - * accessed thru the fanouts, and we already waited for the - * conn_ref to drop to 0. We are already in close, so - * there cannot be any other thread from the top. qprocsoff - * has completed, and service has completed or won't run in - * future. - */ + /* + * Now we are truly single threaded on this stream, and can + * delete the things hanging off the connp, and finally the connp. + * We removed this connp from the fanout list, it cannot be + * accessed thru the fanouts, and we already waited for the + * conn_ref to drop to 0. We are already in close, so + * there cannot be any other thread from the top. qprocsoff + * has completed, and service has completed or won't run in + * future. 
+ */ + ASSERT(connp->conn_ref == 1); + + if (!IPCL_IS_NONSTR(connp)) { inet_minor_free(connp->conn_minor_arena, connp->conn_dev); } else { ip_free_helper_stream(connp); } - ASSERT(connp->conn_ref == 1); - connp->conn_ref--; ipcl_conn_destroy(connp); - return (0); } @@ -256,7 +256,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { conn_t *connp; dev_t conn_dev; - rts_stack_t *rtss; rts_t *rts; /* If the stream is already open, return immediately. */ @@ -266,7 +265,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (sflag == MODOPEN) return (EINVAL); - /* * Since RTS is not used so heavily, allocating from the small * arena should be sufficient. @@ -278,44 +276,31 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp = rts_open(flag, credp); ASSERT(connp != NULL); - *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); rts = connp->conn_rts; - rw_enter(&rts->rts_rwlock, RW_WRITER); connp->conn_dev = conn_dev; connp->conn_minor_arena = ip_minor_arena_sa; - /* - * Initialize the rts_t structure for this stream. - */ q->q_ptr = connp; WR(q)->q_ptr = connp; connp->conn_rq = q; connp->conn_wq = WR(q); - rtss = rts->rts_rtss; - q->q_hiwat = rtss->rtss_recv_hiwat; - WR(q)->q_hiwat = rtss->rtss_xmit_hiwat; - WR(q)->q_lowat = rtss->rtss_xmit_lowat; - - + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); - - qprocson(q); rw_exit(&rts->rts_rwlock); - /* - * Indicate the down IP module that this is a routing socket - * client by sending an RTS IOCTL without any user data. Although - * this is just a notification message (without any real routing - * request), we pass in any credential for correctness sake. 
- */ + + /* Indicate to IP that this is a routing socket client */ ip_rts_register(connp); + qprocson(q); + return (0); } @@ -352,22 +337,38 @@ rts_open(int flag, cred_t *credp) */ netstack_rele(ns); - rw_enter(&rts->rts_rwlock, RW_WRITER); ASSERT(connp->conn_rts == rts); ASSERT(rts->rts_connp == connp); + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_zoneid = zoneid; connp->conn_flow_cntrld = B_FALSE; - connp->conn_ulp_labeled = is_system_labeled(); - rts->rts_rtss = rtss; - rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; + + connp->conn_rcvbuf = rtss->rtss_recv_hiwat; + connp->conn_sndbuf = rtss->rtss_xmit_hiwat; + connp->conn_sndlowat = rtss->rtss_xmit_lowat; + connp->conn_rcvlowat = rts_mod_info.mi_lowat; + + connp->conn_family = PF_ROUTE; + connp->conn_so_type = SOCK_RAW; + /* SO_PROTOTYPE is always sent down by sockfs setting conn_proto */ connp->conn_recv = rts_input; + connp->conn_recvicmp = rts_icmp_input; + crhold(credp); connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* * rts sockets start out as bound and connected @@ -429,7 +430,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; - mblk_t *mp1; struct T_bind_req *tbr; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { @@ -444,16 +444,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) rts_err_ack(q, mp, TOUTSTATE, 0); return; } - /* - * Reallocate the message to make sure we have enough room for an - * address and the protocol type. 
- */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1); - if (mp1 == NULL) { - rts_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - mp = mp1; tbr = (struct T_bind_req *)mp->b_rptr; if (tbr->ADDR_length != 0) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, @@ -465,6 +455,7 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req); tbr->ADDR_length = 0; tbr->PRIM_type = T_BIND_ACK; + mp->b_datap->db_type = M_PCPROTO; rts->rts_state = TS_IDLE; qreply(q, mp); } @@ -545,70 +536,30 @@ static int rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { rts_t *rts = connp->conn_rts; - int *i1 = (int *)ptr; + conn_opt_arg_t coas; + int retval; ASSERT(RW_READ_HELD(&rts->rts_rwlock)); switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = rts->rts_debug; - break; - case SO_REUSEADDR: - *i1 = rts->rts_reuseaddr; - break; - case SO_TYPE: - *i1 = SOCK_RAW; - break; - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - *i1 = rts->rts_dontroute; - break; - case SO_USELOOPBACK: - *i1 = rts->rts_useloopback; - break; - case SO_BROADCAST: - *i1 = rts->rts_broadcast; - break; - case SO_PROTOTYPE: - *i1 = rts->rts_proto; - break; - /* - * The following two items can be manipulated, - * but changing them should do nothing. - */ - case SO_SNDBUF: - ASSERT(rts->rts_xmit_hiwat <= INT_MAX); - *i1 = (int)(rts->rts_xmit_hiwat); - break; - case SO_RCVBUF: - ASSERT(rts->rts_recv_hiwat <= INT_MAX); - *i1 = (int)(rts->rts_recv_hiwat); - break; - case SO_DOMAIN: - *i1 = PF_ROUTE; - break; - default: - return (-1); - } - break; + /* do this in conn_opt_get? 
*/ case SOL_ROUTE: switch (name) { case RT_AWARE: mutex_enter(&connp->conn_lock); - *i1 = connp->conn_rtaware; + *(int *)ptr = connp->conn_rtaware; mutex_exit(&connp->conn_lock); - break; + return (0); } break; - default: - return (-1); } - return ((int)sizeof (int)); + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* ARGSUSED */ @@ -620,6 +571,12 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, int *i1 = (int *)invalp; rts_t *rts = connp->conn_rts; rts_stack_t *rtss = rts->rts_rtss; + int error; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; ASSERT(RW_WRITE_HELD(&rts->rts_rwlock)); @@ -638,38 +595,6 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, switch (level) { case SOL_SOCKET: switch (name) { - case SO_REUSEADDR: - if (!checkonly) { - rts->rts_reuseaddr = *i1 ? 1 : 0; - connp->conn_reuseaddr = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_DEBUG: - if (!checkonly) - rts->rts_debug = *i1 ? 1 : 0; - break; /* goto sizeof (int) option return */ - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - if (!checkonly) { - rts->rts_dontroute = *i1 ? 1 : 0; - connp->conn_dontroute = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - if (!checkonly) { - rts->rts_useloopback = *i1 ? 1 : 0; - connp->conn_loopback = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_BROADCAST: - if (!checkonly) { - rts->rts_broadcast = *i1 ? 1 : 0; - connp->conn_broadcast = *i1 ? 
1 : 0; - } - break; /* goto sizeof (int) option return */ case SO_PROTOTYPE: /* * Routing socket applications that call socket() with @@ -678,13 +603,15 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, * down the SO_PROTOTYPE and rts_queue_input() * implements the filtering. */ - if (*i1 != AF_INET && *i1 != AF_INET6) + if (*i1 != AF_INET && *i1 != AF_INET6) { + *outlenp = 0; return (EPROTONOSUPPORT); - if (!checkonly) { - rts->rts_proto = *i1; - connp->conn_proto = *i1; } - break; /* goto sizeof (int) option return */ + if (!checkonly) + connp->conn_proto = *i1; + *outlenp = inlen; + return (0); + /* * The following two items can be manipulated, * but changing them should do nothing. @@ -694,36 +621,13 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - rts->rts_xmit_hiwat = *i1; - if (!IPCL_IS_NONSTR(connp)) - connp->conn_wq->q_hiwat = *i1; - } break; /* goto sizeof (int) option return */ case SO_RCVBUF: if (*i1 > rtss->rtss_max_buf) { *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - rts->rts_recv_hiwat = *i1; - rw_exit(&rts->rts_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - *i1); - rw_enter(&rts->rts_rwlock, RW_WRITER); - } - break; /* goto sizeof (int) option return */ - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. 
- */ - return (0); - default: - *outlenp = 0; - return (EINVAL); } break; case SOL_ROUTE: @@ -734,15 +638,17 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, connp->conn_rtaware = *i1; mutex_exit(&connp->conn_lock); } - break; /* goto sizeof (int) option return */ - default: - *outlenp = 0; - return (EINVAL); + *outlenp = inlen; + return (0); } break; - default: + } + /* Serialized setsockopt since we are D_MTQPAIR */ + error = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (error != 0) { *outlenp = 0; - return (EINVAL); + return (error); } /* * Common case of return from an option that is sizeof (int) @@ -832,7 +738,7 @@ rts_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) int rts_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { conn_t *connp = Q_TO_CONN(q); int error; @@ -1009,10 +915,6 @@ err_ret: * consumes the message or passes it downstream; it never queues a * a message. The data messages that go down are wrapped in an IOCTL * message. - * - * FIXME? Should we call IP rts_request directly? Could punt on returning - * errno in the case when it defers processing due to - * IPIF_CHANGING/ILL_CHANGING??? 
*/ static void rts_wput(queue_t *q, mblk_t *mp) @@ -1057,7 +959,7 @@ rts_wput(queue_t *q, mblk_t *mp) } return; } - ip_output(connp, mp1, q, IP_WPUT); + ip_wput_nondata(q, mp1); } @@ -1120,11 +1022,9 @@ rts_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &rts_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &rts_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &rts_opt_obj, B_TRUE); + tpi_optcom_req(q, mp, cr, &rts_opt_obj); } return; case O_T_CONN_RES: @@ -1168,7 +1068,7 @@ rts_wput_other(queue_t *q, mblk_t *mp) default: break; } - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); } /* @@ -1177,7 +1077,6 @@ rts_wput_other(queue_t *q, mblk_t *mp) static void rts_wput_iocdata(queue_t *q, mblk_t *mp) { - conn_t *connp = Q_TO_CONN(q); struct sockaddr *rtsaddr; mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); @@ -1188,7 +1087,7 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } switch (mi_copy_state(q, mp, &mp1)) { @@ -1233,9 +1132,12 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp) mi_copyout(q, mp); } +/* + * IP passes up a NULL ira. + */ /*ARGSUSED2*/ static void -rts_input(void *arg1, mblk_t *mp, void *arg2) +rts_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg1; rts_t *rts = connp->conn_rts; @@ -1248,27 +1150,17 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) case M_IOCACK: case M_IOCNAK: iocp = (struct iocblk *)mp->b_rptr; - if (IPCL_IS_NONSTR(connp)) { - ASSERT(rts->rts_flag & (RTS_REQ_PENDING)); - mutex_enter(&rts->rts_send_mutex); - rts->rts_flag &= ~RTS_REQ_INPROG; + ASSERT(!IPCL_IS_NONSTR(connp)); + if (rts->rts_flag & (RTS_WPUT_PENDING)) { + rts->rts_flag &= ~RTS_WPUT_PENDING; rts->rts_error = iocp->ioc_error; - cv_signal(&rts->rts_io_cv); - mutex_exit(&rts->rts_send_mutex); + /* + * Tell rts_wvw/qwait that we are done. 
+ * Note: there is no qwait_wakeup() we can use. + */ + qenable(connp->conn_rq); freemsg(mp); return; - } else { - if (rts->rts_flag & (RTS_WPUT_PENDING)) { - rts->rts_flag &= ~RTS_WPUT_PENDING; - rts->rts_error = iocp->ioc_error; - /* - * Tell rts_wvw/qwait that we are done. - * Note: there is no qwait_wakeup() we can use. - */ - qenable(connp->conn_rq); - freemsg(mp); - return; - } } break; case M_DATA: @@ -1316,6 +1208,12 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) } } +/*ARGSUSED*/ +static void +rts_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + freemsg(mp); +} void rts_ddi_g_init(void) @@ -1427,11 +1325,6 @@ int rts_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlen, cred_t *cr) { - conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; - - ASSERT(rts != NULL); - bzero(addr, sizeof (struct sockaddr)); addr->sa_family = AF_ROUTE; *addrlen = sizeof (struct sockaddr); @@ -1444,7 +1337,11 @@ int rts_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlen, cred_t *cr) { - return (EOPNOTSUPP); + bzero(addr, sizeof (struct sockaddr)); + addr->sa_family = AF_ROUTE; + *addrlen = sizeof (struct sockaddr); + + return (0); } static int @@ -1461,7 +1358,6 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, rts_opt_obj.odb_opt_des_arr, rts_opt_obj.odb_opt_arr_cnt, - rts_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) @@ -1473,25 +1369,20 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, rw_enter(&rts->rts_rwlock, RW_READER); len = rts_opt_get(connp, level, option_name, optvalp_buf); rw_exit(&rts->rts_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ - error = ip_get_options(connp, level, option_name, - optvalp, optlen, cr); - } else { - /* - * update optlen and copy option value - */ - 
t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - error = 0; + if (len == -1) { + kmem_free(optvalp_buf, max_optbuf_len); + return (EINVAL); } + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); kmem_free(optvalp_buf, max_optbuf_len); - return (error); + return (0); } static int @@ -1505,7 +1396,6 @@ rts_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, rts_opt_obj.odb_opt_des_arr, rts_opt_obj.odb_opt_arr_cnt, - rts_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -1530,9 +1420,7 @@ static int rts_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { - mblk_t *mp1; conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; rt_msghdr_t *rtm; int error; @@ -1546,65 +1434,19 @@ rts_send(sock_lower_handle_t proto_handle, mblk_t *mp, */ if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) { if (!pullupmsg(mp, sizeof (rt_msghdr_t))) { - rts->rts_error = EINVAL; freemsg(mp); - return (rts->rts_error); + return (EINVAL); } } rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_pid = curproc->p_pid; - mp1 = rts_ioctl_alloc(mp); - if (mp1 == NULL) { - ASSERT(rts != NULL); - freemsg(mp); - return (ENOMEM); - } - /* - * Allow only one outstanding request(ioctl) at any given time + * We are not constrained by the ioctl interface and + * ip_rts_request_common processing requests synchronously hence + * we can send them down concurrently. 
*/ - mutex_enter(&rts->rts_send_mutex); - while (rts->rts_flag & RTS_REQ_PENDING) { - int ret; - - ret = cv_wait_sig(&rts->rts_send_cv, &rts->rts_send_mutex); - if (ret <= 0) { - mutex_exit(&rts->rts_send_mutex); - freemsg(mp); - return (EINTR); - } - } - - rts->rts_flag |= RTS_REQ_PENDING; - - rts->rts_flag |= RTS_REQ_INPROG; - - mutex_exit(&rts->rts_send_mutex); - - CONN_INC_REF(connp); - - error = ip_rts_request_common(rts->rts_connp->conn_wq, mp1, connp, cr); - - mutex_enter(&rts->rts_send_mutex); - if (error == EINPROGRESS) { - ASSERT(rts->rts_flag & RTS_REQ_INPROG); - if (rts->rts_flag & RTS_REQ_INPROG) { - /* - * Once the request has been issued we wait for - * completion - */ - cv_wait(&rts->rts_io_cv, &rts->rts_send_mutex); - error = rts->rts_error; - } - } - - ASSERT((error != 0) || !(rts->rts_flag & RTS_REQ_INPROG)); - ASSERT(MUTEX_HELD(&rts->rts_send_mutex)); - - rts->rts_flag &= ~(RTS_REQ_PENDING | RTS_REQ_INPROG); - cv_signal(&rts->rts_send_cv); - mutex_exit(&rts->rts_send_mutex); + error = ip_rts_request_common(mp, connp, cr); return (error); } @@ -1614,8 +1456,6 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) { conn_t *connp; - rts_t *rts; - rts_stack_t *rtss; if (family != AF_ROUTE || type != SOCK_RAW || (proto != 0 && proto != AF_INET && proto != AF_INET6)) { @@ -1627,25 +1467,7 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, ASSERT(connp != NULL); connp->conn_flags |= IPCL_NONSTR; - rts = connp->conn_rts; - rtss = rts->rts_rtss; - - rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; - rts->rts_xmit_lowat = rtss->rtss_xmit_lowat; - rts->rts_recv_hiwat = rtss->rtss_recv_hiwat; - rts->rts_recv_lowat = rts_mod_info.mi_lowat; - - ASSERT(rtss->rtss_ldi_ident != NULL); - - *errorp = ip_create_helper_stream(connp, rtss->rtss_ldi_ident); - if (*errorp != 0) { -#ifdef DEBUG - cmn_err(CE_CONT, "rts_create: create of IP helper stream" - " 
failed\n"); -#endif - (void) rts_close((sock_lower_handle_t)connp, 0, credp); - return (NULL); - } + connp->conn_proto = proto; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -1663,8 +1485,6 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; - rts_stack_t *rtss = rts->rts_rtss; struct sock_proto_props sopp; connp->conn_upcalls = sock_upcalls; @@ -1673,8 +1493,8 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; sopp.sopp_wroff = 0; - sopp.sopp_rxhiwat = rtss->rtss_recv_hiwat; - sopp.sopp_rxlowat = rts_mod_info.mi_lowat; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxblk = INFPSZ; sopp.sopp_maxpsz = rts_mod_info.mi_maxpsz; sopp.sopp_minpsz = (rts_mod_info.mi_minpsz == 1) ? 0 : @@ -1689,12 +1509,7 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, 0, NULL, -1); - /* - * Indicate the down IP module that this is a routing socket - * client by sending an RTS IOCTL without any user data. Although - * this is just a notification message (without any real routing - * request), we pass in any credential for correctness sake. - */ + /* Indicate to IP that this is a routing socket client */ ip_rts_register(connp); } @@ -1743,6 +1558,27 @@ rts_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, conn_t *connp = (conn_t *)proto_handle; int error; + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. 
+ */ + if (connp->conn_helper_info == NULL) { + rts_stack_t *rtss = connp->conn_rts->rts_rtss; + + ASSERT(rtss->rtss_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, rtss->rtss_ldi_ident); + if (error != 0) { + ip0dbg(("rts_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index 8a96edb668..1dd64a0317 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -40,6 +40,7 @@ #include <inet/optcom.h> #include <inet/rts_impl.h> +#include <inet/rts_impl.h> /* * Table of all known options handled on a RTS protocol stack. * @@ -49,21 +50,21 @@ */ opdes_t rts_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ 
SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; @@ -98,9 +99,8 @@ uint_t rts_max_optsize; /* initialized in _init() */ optdb_obj_t rts_opt_obj = { rts_opt_default, /* RTS default value function pointer */ - rts_tpi_opt_get, /* RTS get function pointer */ - rts_tpi_opt_set, /* RTS set function pointer */ - B_TRUE, /* RTS is tpi provider */ + rts_tpi_opt_get, /* RTS get function pointer */ + rts_tpi_opt_set, /* RTS set function pointer */ RTS_OPT_ARR_CNT, /* RTS option database count of entries */ rts_opt_arr, /* RTS option database */ RTS_VALID_LEVELS_CNT, /* RTS valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c index 784b3b08aa..5ae4f6da8e 100644 --- a/usr/src/uts/common/inet/ip/sadb.c +++ b/usr/src/uts/common/inet/ip/sadb.c @@ -59,7 +59,6 @@ #include <inet/ipsecesp.h> #include <sys/random.h> #include <sys/dlpi.h> -#include <sys/iphada.h> #include <sys/strsun.h> #include <sys/strsubr.h> #include <inet/ip_if.h> @@ -77,15 +76,13 @@ static mblk_t *sadb_extended_acquire(ipsec_selector_t *, ipsec_policy_t *, ipsec_action_t *, boolean_t, uint32_t, uint32_t, sadb_sens_t *, netstack_t *); -static void sadb_ill_df(ill_t *, mblk_t *, isaf_t *, int, boolean_t); -static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *, boolean_t, mblk_t **); -static 
void sadb_drain_torchq(queue_t *, mblk_t *); +static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *); static void sadb_destroy_acqlist(iacqf_t **, uint_t, boolean_t, netstack_t *); static void sadb_destroy(sadb_t *, netstack_t *); static mblk_t *sadb_sa2msg(ipsa_t *, sadb_msg_t *); -static cred_t *sadb_cred_from_sens(sadb_sens_t *, uint64_t *); -static sadb_sens_t *sadb_make_sens_ext(cred_t *cr, int *len); +static ts_label_t *sadb_label_from_sens(sadb_sens_t *, uint64_t *); +static sadb_sens_t *sadb_make_sens_ext(ts_label_t *tsl, int *len); static time_t sadb_add_time(time_t, uint64_t); static void lifetime_fuzz(ipsa_t *); @@ -96,12 +93,6 @@ static void destroy_ipsa_pair(ipsap_t *); static int update_pairing(ipsap_t *, ipsa_query_t *, keysock_in_t *, int *); static void ipsa_set_replay(ipsa_t *ipsa, uint32_t offset); -extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol, - uint8_t *ptr, size_t len, void *args); -extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol, - uint32_t spi, void *args); -extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol, - uint32_t spi, void *args); /* * ipsacq_maxpackets is defined here to make it tunable * from /etc/system. @@ -269,6 +260,7 @@ static void sadb_freeassoc(ipsa_t *ipsa) { ipsec_stack_t *ipss = ipsa->ipsa_netstack->netstack_ipsec; + mblk_t *asyncmp, *mp; ASSERT(ipss != NULL); ASSERT(MUTEX_NOT_HELD(&ipsa->ipsa_lock)); @@ -276,20 +268,24 @@ sadb_freeassoc(ipsa_t *ipsa) ASSERT(ipsa->ipsa_next == NULL); ASSERT(ipsa->ipsa_ptpn == NULL); + + asyncmp = sadb_clear_lpkt(ipsa); + if (asyncmp != NULL) { + mp = ip_recv_attr_free_mblk(asyncmp); + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &ipss->ipsec_sadb_dropper); + } mutex_enter(&ipsa->ipsa_lock); - /* Don't call sadb_clear_lpkt() since we hold the ipsa_lock anyway. 
*/ - ip_drop_packet(ipsa->ipsa_lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &ipss->ipsec_sadb_dropper); - if (ipsa->ipsa_cred != NULL) { - crfree(ipsa->ipsa_cred); - ipsa->ipsa_cred = NULL; + if (ipsa->ipsa_tsl != NULL) { + label_rele(ipsa->ipsa_tsl); + ipsa->ipsa_tsl = NULL; } - if (ipsa->ipsa_ocred != NULL) { - crfree(ipsa->ipsa_ocred); - ipsa->ipsa_ocred = NULL; + if (ipsa->ipsa_otsl != NULL) { + label_rele(ipsa->ipsa_otsl); + ipsa->ipsa_otsl = NULL; } ipsec_destroy_ctx_tmpl(ipsa, IPSEC_ALG_AUTH); @@ -712,336 +708,6 @@ sadb_walker(isaf_t *table, uint_t numentries, } /* - * From the given SA, construct a dl_ct_ipsec_key and - * a dl_ct_ipsec structures to be sent to the adapter as part - * of a DL_CONTROL_REQ. - * - * ct_sa must point to the storage allocated for the key - * structure and must be followed by storage allocated - * for the SA information that must be sent to the driver - * as part of the DL_CONTROL_REQ request. - * - * The is_inbound boolean indicates whether the specified - * SA is part of an inbound SA table. - * - * Returns B_TRUE if the corresponding SA must be passed to - * a provider, B_FALSE otherwise; frees *mp if it returns B_FALSE. - */ -static boolean_t -sadb_req_from_sa(ipsa_t *sa, mblk_t *mp, boolean_t is_inbound) -{ - dl_ct_ipsec_key_t *keyp; - dl_ct_ipsec_t *sap; - void *ct_sa = mp->b_wptr; - - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - - keyp = (dl_ct_ipsec_key_t *)(ct_sa); - sap = (dl_ct_ipsec_t *)(keyp + 1); - - IPSECHW_DEBUG(IPSECHW_CAPAB, ("sadb_req_from_sa: " - "is_inbound = %d\n", is_inbound)); - - /* initialize flag */ - sap->sadb_sa_flags = 0; - if (is_inbound) { - sap->sadb_sa_flags |= DL_CT_IPSEC_INBOUND; - /* - * If an inbound SA has a peer, then mark it has being - * an outbound SA as well. - */ - if (sa->ipsa_haspeer) - sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND; - } else { - /* - * If an outbound SA has a peer, then don't send it, - * since we will send the copy from the inbound table. 
- */ - if (sa->ipsa_haspeer) { - freemsg(mp); - return (B_FALSE); - } - sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND; - } - - keyp->dl_key_spi = sa->ipsa_spi; - bcopy(sa->ipsa_dstaddr, keyp->dl_key_dest_addr, - DL_CTL_IPSEC_ADDR_LEN); - keyp->dl_key_addr_family = sa->ipsa_addrfam; - - sap->sadb_sa_auth = sa->ipsa_auth_alg; - sap->sadb_sa_encrypt = sa->ipsa_encr_alg; - - sap->sadb_key_len_a = sa->ipsa_authkeylen; - sap->sadb_key_bits_a = sa->ipsa_authkeybits; - bcopy(sa->ipsa_authkey, - sap->sadb_key_data_a, sap->sadb_key_len_a); - - sap->sadb_key_len_e = sa->ipsa_encrkeylen; - sap->sadb_key_bits_e = sa->ipsa_encrkeybits; - bcopy(sa->ipsa_encrkey, - sap->sadb_key_data_e, sap->sadb_key_len_e); - - mp->b_wptr += sizeof (dl_ct_ipsec_t) + sizeof (dl_ct_ipsec_key_t); - return (B_TRUE); -} - -/* - * Called from AH or ESP to format a message which will be used to inform - * IPsec-acceleration-capable ills of a SADB change. - * (It is not possible to send the message to IP directly from this function - * since the SA, if any, is locked during the call). - * - * dl_operation: DL_CONTROL_REQ operation (add, delete, update, etc) - * sa_type: identifies whether the operation applies to AH or ESP - * (must be one of SADB_SATYPE_AH or SADB_SATYPE_ESP) - * sa: Pointer to an SA. Must be non-NULL and locked - * for ADD, DELETE, GET, and UPDATE operations. - * This function returns an mblk chain that must be passed to IP - * for forwarding to the IPsec capable providers. 
- */ -mblk_t * -sadb_fmt_sa_req(uint_t dl_operation, uint_t sa_type, ipsa_t *sa, - boolean_t is_inbound) -{ - mblk_t *mp; - dl_control_req_t *ctrl; - boolean_t need_key = B_FALSE; - mblk_t *ctl_mp = NULL; - ipsec_ctl_t *ctl; - - /* - * 1 allocate and initialize DL_CONTROL_REQ M_PROTO - * 2 if a key is needed for the operation - * 2.1 initialize key - * 2.2 if a full SA is needed for the operation - * 2.2.1 initialize full SA info - * 3 return message; caller will call ill_ipsec_capab_send_all() - * to send the resulting message to IPsec capable ills. - */ - - ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP); - - /* - * Allocate DL_CONTROL_REQ M_PROTO - * We allocate room for the SA even if it's not needed - * by some of the operations (for example flush) - */ - mp = allocb(sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI); - if (mp == NULL) - return (NULL); - mp->b_datap->db_type = M_PROTO; - - /* initialize dl_control_req_t */ - ctrl = (dl_control_req_t *)mp->b_wptr; - ctrl->dl_primitive = DL_CONTROL_REQ; - ctrl->dl_operation = dl_operation; - ctrl->dl_type = sa_type == SADB_SATYPE_AH ? DL_CT_IPSEC_AH : - DL_CT_IPSEC_ESP; - ctrl->dl_key_offset = sizeof (dl_control_req_t); - ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_offset = sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_length = sizeof (dl_ct_ipsec_t); - mp->b_wptr += sizeof (dl_control_req_t); - - if ((dl_operation == DL_CO_SET) || (dl_operation == DL_CO_DELETE)) { - ASSERT(sa != NULL); - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - - need_key = B_TRUE; - - /* - * Initialize key and SA data. Note that for some - * operations the SA data is ignored by the provider - * (delete, etc.) 
- */ - if (!sadb_req_from_sa(sa, mp, is_inbound)) - return (NULL); - } - - /* construct control message */ - ctl_mp = allocb(sizeof (ipsec_ctl_t), BPRI_HI); - if (ctl_mp == NULL) { - cmn_err(CE_WARN, "sadb_fmt_sa_req: allocb failed\n"); - freemsg(mp); - return (NULL); - } - - ctl_mp->b_datap->db_type = M_CTL; - ctl_mp->b_wptr += sizeof (ipsec_ctl_t); - ctl_mp->b_cont = mp; - - ctl = (ipsec_ctl_t *)ctl_mp->b_rptr; - ctl->ipsec_ctl_type = IPSEC_CTL; - ctl->ipsec_ctl_len = sizeof (ipsec_ctl_t); - ctl->ipsec_ctl_sa_type = sa_type; - - if (need_key) { - /* - * Keep an additional reference on SA, since it will be - * needed by IP to send control messages corresponding - * to that SA from its perimeter. IP will do a - * IPSA_REFRELE when done with the request. - */ - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - IPSA_REFHOLD(sa); - ctl->ipsec_ctl_sa = sa; - } else - ctl->ipsec_ctl_sa = NULL; - - return (ctl_mp); -} - - -/* - * Called by sadb_ill_download() to dump the entries for a specific - * fanout table. For each SA entry in the table passed as argument, - * use mp as a template and constructs a full DL_CONTROL message, and - * call ill_dlpi_send(), provided by IP, to send the resulting - * messages to the ill. - */ -static void -sadb_ill_df(ill_t *ill, mblk_t *mp, isaf_t *fanout, int num_entries, - boolean_t is_inbound) -{ - ipsa_t *walker; - mblk_t *nmp, *salist; - int i, error = 0; - ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - - IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_ill_df: fanout at 0x%p ne=%d\n", - (void *)fanout, num_entries)); - /* - * For each IPSA hash bucket do: - * - Hold the mutex - * - Walk each entry, sending a corresponding request to IP - * for it. 
- */ - ASSERT(mp->b_datap->db_type == M_PROTO); - - for (i = 0; i < num_entries; i++) { - mutex_enter(&fanout[i].isaf_lock); - salist = NULL; - - for (walker = fanout[i].isaf_ipsa; walker != NULL; - walker = walker->ipsa_next) { - IPSECHW_DEBUG(IPSECHW_SADB, - ("sadb_ill_df: sending SA to ill via IP \n")); - /* - * Duplicate the template mp passed and - * complete DL_CONTROL_REQ data. - * To be more memory efficient, we could use - * dupb() for the M_CTL and copyb() for the M_PROTO - * as the M_CTL, since the M_CTL is the same for - * every SA entry passed down to IP for the same ill. - * - * Note that copymsg/copyb ensure that the new mblk - * is at least as large as the source mblk even if it's - * not using all its storage -- therefore, nmp - * has trailing space for sadb_req_from_sa to add - * the SA-specific bits. - */ - mutex_enter(&walker->ipsa_lock); - if (ipsec_capab_match(ill, - ill->ill_phyint->phyint_ifindex, ill->ill_isv6, - walker, ns)) { - nmp = copymsg(mp); - if (nmp == NULL) { - IPSECHW_DEBUG(IPSECHW_SADB, - ("sadb_ill_df: alloc error\n")); - error = ENOMEM; - mutex_exit(&walker->ipsa_lock); - break; - } - if (sadb_req_from_sa(walker, nmp, is_inbound)) { - nmp->b_next = salist; - salist = nmp; - } - } - mutex_exit(&walker->ipsa_lock); - } - mutex_exit(&fanout[i].isaf_lock); - while (salist != NULL) { - nmp = salist; - salist = nmp->b_next; - nmp->b_next = NULL; - ill_dlpi_send(ill, nmp); - } - if (error != 0) - break; /* out of for loop. */ - } -} - -/* - * Called by ill_ipsec_capab_add(). Sends a copy of the SADB of - * the type specified by sa_type to the specified ill. - * - * We call for each fanout table defined by the SADB (one per - * protocol). sadb_ill_df() finally calls ill_dlpi_send() for - * each SADB entry in order to send a corresponding DL_CONTROL_REQ - * message to the ill. 
- */ -void -sadb_ill_download(ill_t *ill, uint_t sa_type) -{ - mblk_t *protomp; /* prototype message */ - dl_control_req_t *ctrl; - sadbp_t *spp; - sadb_t *sp; - int dlt; - ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - - ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP); - - /* - * Allocate and initialize prototype answer. A duplicate for - * each SA is sent down to the interface. - */ - - /* DL_CONTROL_REQ M_PROTO mblk_t */ - protomp = allocb(sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI); - if (protomp == NULL) - return; - protomp->b_datap->db_type = M_PROTO; - - dlt = (sa_type == SADB_SATYPE_AH) ? DL_CT_IPSEC_AH : DL_CT_IPSEC_ESP; - if (sa_type == SADB_SATYPE_ESP) { - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - - spp = &espstack->esp_sadb; - } else { - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - - spp = &ahstack->ah_sadb; - } - - ctrl = (dl_control_req_t *)protomp->b_wptr; - ctrl->dl_primitive = DL_CONTROL_REQ; - ctrl->dl_operation = DL_CO_SET; - ctrl->dl_type = dlt; - ctrl->dl_key_offset = sizeof (dl_control_req_t); - ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_offset = sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_length = sizeof (dl_ct_ipsec_t); - protomp->b_wptr += sizeof (dl_control_req_t); - - /* - * then for each SADB entry, we fill out the dl_ct_ipsec_key_t - * and dl_ct_ipsec_t - */ - sp = ill->ill_isv6 ? &(spp->s_v6) : &(spp->s_v4); - sadb_ill_df(ill, protomp, sp->sdb_of, sp->sdb_hashsize, B_FALSE); - sadb_ill_df(ill, protomp, sp->sdb_if, sp->sdb_hashsize, B_TRUE); - freemsg(protomp); -} - -/* * Call me to free up a security association fanout. Use the forever * variable to indicate freeing up the SAs (forever == B_FALSE, e.g. 
* an SADB_FLUSH message), or destroying everything (forever == B_TRUE, @@ -1119,30 +785,11 @@ sadb_destroy(sadb_t *sp, netstack_t *ns) ASSERT(sp->sdb_acq == NULL); } -static void -sadb_send_flush_req(sadbp_t *spp) -{ - mblk_t *ctl_mp; - - /* - * we've been unplumbed, or never were plumbed; don't go there. - */ - if (spp->s_ip_q == NULL) - return; - - /* have IP send a flush msg to the IPsec accelerators */ - ctl_mp = sadb_fmt_sa_req(DL_CO_FLUSH, spp->s_satype, NULL, B_TRUE); - if (ctl_mp != NULL) - putnext(spp->s_ip_q, ctl_mp); -} - void sadbp_flush(sadbp_t *spp, netstack_t *ns) { sadb_flush(&spp->s_v4, ns); sadb_flush(&spp->s_v6, ns); - - sadb_send_flush_req(spp); } void @@ -1151,7 +798,6 @@ sadbp_destroy(sadbp_t *spp, netstack_t *ns) sadb_destroy(&spp->s_v4, ns); sadb_destroy(&spp->s_v6, ns); - sadb_send_flush_req(spp); if (spp->s_satype == SADB_SATYPE_AH) { ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1259,11 +905,11 @@ sadb_cloneassoc(ipsa_t *ipsa) /* bzero and initialize locks, in case *_init() allocates... 
*/ mutex_init(&newbie->ipsa_lock, NULL, MUTEX_DEFAULT, NULL); - if (newbie->ipsa_cred != NULL) - crhold(newbie->ipsa_cred); + if (newbie->ipsa_tsl != NULL) + label_hold(newbie->ipsa_tsl); - if (newbie->ipsa_ocred != NULL) - crhold(newbie->ipsa_ocred); + if (newbie->ipsa_otsl != NULL) + label_hold(newbie->ipsa_otsl); /* * While somewhat dain-bramaged, the most graceful way to @@ -1554,14 +1200,14 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) encr = B_FALSE; } - if (ipsa->ipsa_cred != NULL) { - senslen = sadb_sens_len_from_cred(ipsa->ipsa_cred); + if (ipsa->ipsa_tsl != NULL) { + senslen = sadb_sens_len_from_label(ipsa->ipsa_tsl); alloclen += senslen; sensinteg = B_TRUE; } - if (ipsa->ipsa_ocred != NULL) { - osenslen = sadb_sens_len_from_cred(ipsa->ipsa_ocred); + if (ipsa->ipsa_otsl != NULL) { + osenslen = sadb_sens_len_from_label(ipsa->ipsa_otsl); alloclen += osenslen; osensinteg = B_TRUE; } @@ -1792,8 +1438,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) if (sensinteg) { sens = (sadb_sens_t *)walker; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - ipsa->ipsa_cred, senslen); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + ipsa->ipsa_tsl, senslen); walker = (sadb_ext_t *)((uint64_t *)walker + walker->sadb_ext_len); @@ -1802,8 +1448,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) if (osensinteg) { sens = (sadb_sens_t *)walker; - sadb_sens_from_cred(sens, SADB_X_EXT_OUTER_SENS, - ipsa->ipsa_ocred, osenslen); + sadb_sens_from_label(sens, SADB_X_EXT_OUTER_SENS, + ipsa->ipsa_otsl, osenslen); if (ipsa->ipsa_mac_exempt) sens->sadb_x_sens_flags = SADB_X_SENS_IMPLICIT; @@ -2123,7 +1769,6 @@ sadb_addrcheck(queue_t *pfkey_q, mblk_t *mp, sadb_ext_t *ext, uint_t serial, sadb_address_t *addr = (sadb_address_t *)ext; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; - ire_t *ire; int diagnostic, type; boolean_t normalized = B_FALSE; @@ -2249,18 +1894,12 @@ bail: /* * At this point, we're a unicast IPv6 address. * - * A ctable lookup for local is sufficient here. 
If we're - * local, return KS_IN_ADDR_ME, otherwise KS_IN_ADDR_NOTME. - * * XXX Zones alert -> me/notme decision needs to be tempered * by what zone we're in when we go to zone-aware IPsec. */ - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ns->netstack_ip); - if (ire != NULL) { + if (ip_type_v6(&sin6->sin6_addr, ns->netstack_ip) == + IRE_LOCAL) { /* Hey hey, it's local. */ - IRE_REFRELE(ire); return (KS_IN_ADDR_ME); } } else { @@ -2272,23 +1911,17 @@ bail: /* * At this point we're a unicast or broadcast IPv4 address. * - * Lookup on the ctable for IRE_BROADCAST or IRE_LOCAL. - * A NULL return value is NOTME, otherwise, look at the - * returned ire for broadcast or not and return accordingly. + * Check if the address is IRE_BROADCAST or IRE_LOCAL. * * XXX Zones alert -> me/notme decision needs to be tempered * by what zone we're in when we go to zone-aware IPsec. */ - ire = ire_ctable_lookup(sin->sin_addr.s_addr, 0, - IRE_LOCAL | IRE_BROADCAST, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, ns->netstack_ip); - if (ire != NULL) { - /* Check for local or broadcast */ - type = ire->ire_type; - IRE_REFRELE(ire); - ASSERT(type == IRE_LOCAL || type == IRE_BROADCAST); - return ((type == IRE_LOCAL) ? 
KS_IN_ADDR_ME : - KS_IN_ADDR_MBCAST); + type = ip_type_v4(sin->sin_addr.s_addr, ns->netstack_ip); + switch (type) { + case IRE_LOCAL: + return (KS_IN_ADDR_ME); + case IRE_BROADCAST: + return (KS_IN_ADDR_MBCAST); } } @@ -2763,7 +2396,6 @@ struct sadb_purge_state ipsa_query_t sq; boolean_t inbnd; uint8_t sadb_sa_state; - mblk_t *mq; }; static void @@ -2785,7 +2417,7 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie) sadb_delete_cluster(entry); } entry->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(head, entry, ps->inbnd, &ps->mq); + (void) sadb_torch_assoc(head, entry); } /* @@ -2794,15 +2426,13 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie) */ int sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp, - int *diagnostic, queue_t *pfkey_q, queue_t *ip_q) + int *diagnostic, queue_t *pfkey_q) { struct sadb_purge_state ps; int error = sadb_form_query(ksi, 0, IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC, &ps.sq, diagnostic); - ps.mq = NULL; - if (error != 0) return (error); @@ -2819,9 +2449,6 @@ sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp, ps.inbnd = B_FALSE; sadb_walker(sp->sdb_of, sp->sdb_hashsize, sadb_purge_cb, &ps); - if (ps.mq != NULL) - sadb_drain_torchq(ip_q, ps.mq); - ASSERT(mp->b_cont != NULL); sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL); @@ -2870,12 +2497,11 @@ sadb_delpair_state_one(isaf_t *head, ipsa_t *entry, void *cookie) } entry->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(head, entry, B_FALSE, &ps->mq); + (void) sadb_torch_assoc(head, entry); if (peer_assoc != NULL) { mutex_enter(&peer_assoc->ipsa_lock); peer_assoc->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(inbound_bucket, peer_assoc, - B_FALSE, &ps->mq); + (void) sadb_torch_assoc(inbound_bucket, peer_assoc); } mutex_exit(&inbound_bucket->isaf_lock); } @@ -2889,7 +2515,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, int error; ps.sq.spp = spp; /* XXX param */ - ps.mq = 
NULL; error = sadb_form_query(ksi, IPSA_Q_DST|IPSA_Q_SRC, IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC, @@ -2902,9 +2527,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, sadb_walker(ps.sq.sp->sdb_of, ps.sq.sp->sdb_hashsize, sadb_delpair_state_one, &ps); - if (ps.mq != NULL) - sadb_drain_torchq(pfkey_q, ps.mq); - ASSERT(mp->b_cont != NULL); sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL); @@ -2921,7 +2543,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, ipsa_query_t sq; ipsa_t *echo_target = NULL; ipsap_t ipsapp; - mblk_t *torchq = NULL; uint_t error = 0; if (sadb_msg_type == SADB_X_DELPAIR_STATE) @@ -2965,7 +2586,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, } ipsapp.ipsap_sa_ptr->ipsa_state = IPSA_STATE_DEAD; (void) sadb_torch_assoc(ipsapp.ipsap_bucket, - ipsapp.ipsap_sa_ptr, B_FALSE, &torchq); + ipsapp.ipsap_sa_ptr); /* * sadb_torch_assoc() releases the ipsa_lock * and calls sadb_unlinkassoc() which does a @@ -2984,7 +2605,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, ipsapp.ipsap_psa_ptr->ipsa_state = IPSA_STATE_DEAD; (void) sadb_torch_assoc(ipsapp.ipsap_pbucket, - ipsapp.ipsap_psa_ptr, B_FALSE, &torchq); + ipsapp.ipsap_psa_ptr); } else { /* * Only half of the "pair" has been deleted. @@ -3004,9 +2625,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, mutex_exit(&ipsapp.ipsap_pbucket->isaf_lock); } - if (torchq != NULL) - sadb_drain_torchq(spp->s_ip_q, torchq); - ASSERT(mp->b_cont != NULL); if (error == 0) @@ -3269,7 +2887,7 @@ sadb_nat_calculations(ipsa_t *newbie, sadb_address_t *natt_loc_ext, * case here. 
*/ int -sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, +sadb_common_add(queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, isaf_t *primary, isaf_t *secondary, ipsa_t *newbie, boolean_t clone, boolean_t is_inbound, int *diagnostic, netstack_t *ns, sadbp_t *spp) @@ -3313,11 +2931,11 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, int error = 0; boolean_t isupdate = (newbie != NULL); uint32_t *src_addr_ptr, *dst_addr_ptr, *isrc_addr_ptr, *idst_addr_ptr; - mblk_t *ctl_mp = NULL; ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; ipsec_alginfo_t *alg; int rcode; + boolean_t async = B_FALSE; init_ipsa_pair(&ipsapp); @@ -3549,7 +3167,14 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, newbie->ipsa_authtmpl = NULL; newbie->ipsa_encrtmpl = NULL; +#ifdef IPSEC_LATENCY_TEST + if (akey != NULL && newbie->ipsa_auth_alg != SADB_AALG_NONE) { +#else if (akey != NULL) { +#endif + async = (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == + IPSEC_ALGS_EXEC_ASYNC); + newbie->ipsa_authkeybits = akey->sadb_key_bits; newbie->ipsa_authkeylen = SADB_1TO8(akey->sadb_key_bits); /* In case we have to round up to the next byte... */ @@ -3604,6 +3229,8 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, if (ekey != NULL) { mutex_enter(&ipss->ipsec_alg_lock); + async = async || (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == + IPSEC_ALGS_EXEC_ASYNC); alg = ipss->ipsec_alglists[IPSEC_ALG_ENCR] [newbie->ipsa_encr_alg]; @@ -3757,6 +3384,9 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, } } + if (async) + newbie->ipsa_flags |= IPSA_F_ASYNC; + /* * Ptrs to processing functions. 
*/ @@ -3812,7 +3442,7 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, if (sens != NULL) { uint64_t *bitmap = (uint64_t *)(sens + 1); - newbie->ipsa_cred = sadb_cred_from_sens(sens, bitmap); + newbie->ipsa_tsl = sadb_label_from_sens(sens, bitmap); } /* @@ -3820,41 +3450,55 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, */ if (osens != NULL) { uint64_t *bitmap = (uint64_t *)(osens + 1); - cred_t *cred, *effective_cred; + ts_label_t *tsl, *effective_tsl; uint32_t *peer_addr_ptr; + zoneid_t zoneid = GLOBAL_ZONEID; + zone_t *zone; peer_addr_ptr = is_inbound ? src_addr_ptr : dst_addr_ptr; - cred = sadb_cred_from_sens(osens, bitmap); + tsl = sadb_label_from_sens(osens, bitmap); newbie->ipsa_mac_exempt = CONN_MAC_DEFAULT; if (osens->sadb_x_sens_flags & SADB_X_SENS_IMPLICIT) { newbie->ipsa_mac_exempt = CONN_MAC_IMPLICIT; } - error = tsol_check_dest(cred, peer_addr_ptr, + error = tsol_check_dest(tsl, peer_addr_ptr, (af == AF_INET6)?IPV6_VERSION:IPV4_VERSION, - newbie->ipsa_mac_exempt, &effective_cred); + newbie->ipsa_mac_exempt, B_TRUE, &effective_tsl); if (error != 0) { - crfree(cred); + label_rele(tsl); mutex_exit(&newbie->ipsa_lock); goto error; } - if (effective_cred != NULL) { - crfree(cred); - cred = effective_cred; + if (effective_tsl != NULL) { + label_rele(tsl); + tsl = effective_tsl; } - newbie->ipsa_ocred = cred; + newbie->ipsa_otsl = tsl; + + zone = zone_find_by_label(tsl); + if (zone != NULL) { + zoneid = zone->zone_id; + zone_rele(zone); + } + /* + * For exclusive stacks we set the zoneid to zero to operate + * as if in the global zone for tsol_compute_label_v4/v6 + */ + if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; if (af == AF_INET6) { - error = tsol_compute_label_v6(cred, + error = tsol_compute_label_v6(tsl, zoneid, (in6_addr_t *)peer_addr_ptr, newbie->ipsa_opt_storage, ipst); } else { - error = tsol_compute_label(cred, *peer_addr_ptr, - 
newbie->ipsa_opt_storage, ipst); + error = tsol_compute_label_v4(tsl, zoneid, + *peer_addr_ptr, newbie->ipsa_opt_storage, ipst); } if (error != 0) { mutex_exit(&newbie->ipsa_lock); @@ -3916,9 +3560,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, mutex_enter(&primary->isaf_lock); } - IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_common_add: spi = 0x%x\n", - newbie->ipsa_spi)); - /* * sadb_insertassoc() doesn't increment the reference * count. We therefore have to increment the @@ -3938,10 +3579,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, mutex_enter(&newbie->ipsa_lock); error = sadb_insertassoc(newbie, primary); - if (error == 0) { - ctl_mp = sadb_fmt_sa_req(DL_CO_SET, newbie->ipsa_type, newbie, - is_inbound); - } mutex_exit(&newbie->ipsa_lock); if (error != 0) { @@ -3982,13 +3619,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, ASSERT(MUTEX_NOT_HELD(&newbie->ipsa_lock)); ASSERT(newbie_clone == NULL || (MUTEX_NOT_HELD(&newbie_clone->ipsa_lock))); - /* - * If hardware acceleration could happen, send it. - */ - if (ctl_mp != NULL) { - putnext(ip_q, ctl_mp); - ctl_mp = NULL; - } error_unlock: @@ -4037,8 +3667,6 @@ error: if (newbie_clone != NULL) { IPSA_REFRELE(newbie_clone); } - if (ctl_mp != NULL) - freemsg(ctl_mp); if (error == 0) { /* @@ -4315,37 +3943,12 @@ sadb_age_bytes(queue_t *pfkey_q, ipsa_t *assoc, uint64_t bytes, } /* - * Push one or more DL_CO_DELETE messages queued up by - * sadb_torch_assoc down to the underlying driver now that it's a - * convenient time for it (i.e., ipsa bucket locks not held). - */ -static void -sadb_drain_torchq(queue_t *q, mblk_t *mp) -{ - while (mp != NULL) { - mblk_t *next = mp->b_next; - mp->b_next = NULL; - if (q != NULL) - putnext(q, mp); - else - freemsg(mp); - mp = next; - } -} - -/* * "Torch" an individual SA. Returns NULL, so it can be tail-called from * sadb_age_assoc(). 
- * - * If SA is hardware-accelerated, and we can't allocate the mblk - * containing the DL_CO_DELETE, just return; it will remain in the - * table and be swept up by sadb_ager() in a subsequent pass. */ static ipsa_t * -sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq) +sadb_torch_assoc(isaf_t *head, ipsa_t *sa) { - mblk_t *mp; - ASSERT(MUTEX_HELD(&head->isaf_lock)); ASSERT(MUTEX_HELD(&sa->ipsa_lock)); ASSERT(sa->ipsa_state == IPSA_STATE_DEAD); @@ -4355,15 +3958,6 @@ sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq) */ head->isaf_gen++; - if (sa->ipsa_flags & IPSA_F_HW) { - mp = sadb_fmt_sa_req(DL_CO_DELETE, sa->ipsa_type, sa, inbnd); - if (mp == NULL) { - mutex_exit(&sa->ipsa_lock); - return (NULL); - } - mp->b_next = *mq; - *mq = mp; - } mutex_exit(&sa->ipsa_lock); sadb_unlinkassoc(sa); @@ -4404,7 +3998,7 @@ sadb_idle_activities(ipsa_t *assoc, time_t delta, boolean_t inbound) */ static ipsa_t * sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, - time_t current, int reap_delay, boolean_t inbound, mblk_t **mq) + time_t current, int reap_delay, boolean_t inbound) { ipsa_t *retval = NULL; boolean_t dropped_mutex = B_FALSE; @@ -4419,7 +4013,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, (assoc->ipsa_hardexpiretime != 0))) && (assoc->ipsa_hardexpiretime <= current)) { assoc->ipsa_state = IPSA_STATE_DEAD; - return (sadb_torch_assoc(head, assoc, inbound, mq)); + return (sadb_torch_assoc(head, assoc)); } /* @@ -4433,7 +4027,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, if (assoc->ipsa_hardexpiretime != 0 && assoc->ipsa_hardexpiretime <= current) { if (assoc->ipsa_state == IPSA_STATE_DEAD) - return (sadb_torch_assoc(head, assoc, inbound, mq)); + return (sadb_torch_assoc(head, assoc)); if (inbound) { sadb_delete_cluster(assoc); @@ -4516,8 +4110,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, * the second time sadb_ager() runs. 
*/ void -sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, - netstack_t *ns) +sadb_ager(sadb_t *sp, queue_t *pfkey_q, int reap_delay, netstack_t *ns) { int i; isaf_t *bucket; @@ -4527,7 +4120,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, templist_t *haspeerlist, *newbie; /* Snapshot current time now. */ time_t current = gethrestime_sec(); - mblk_t *mq = NULL; haspeerlist = NULL; /* @@ -4559,7 +4151,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, assoc = spare) { spare = assoc->ipsa_next; if (sadb_age_assoc(bucket, pfkey_q, assoc, current, - reap_delay, B_TRUE, &mq) != NULL) { + reap_delay, B_TRUE) != NULL) { /* * Put SA's which have a peer or SA's which * are paired on a list for processing after @@ -4585,10 +4177,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, mutex_exit(&bucket->isaf_lock); } - if (mq != NULL) { - sadb_drain_torchq(ip_q, mq); - mq = NULL; - } age_pair_peer_list(haspeerlist, sp, B_FALSE); haspeerlist = NULL; @@ -4600,7 +4188,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, assoc = spare) { spare = assoc->ipsa_next; if (sadb_age_assoc(bucket, pfkey_q, assoc, current, - reap_delay, B_FALSE, &mq) != NULL) { + reap_delay, B_FALSE) != NULL) { /* * sadb_age_assoc() increments the refcnt, * effectively doing an IPSA_REFHOLD(). 
@@ -4621,10 +4209,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, } mutex_exit(&bucket->isaf_lock); } - if (mq != NULL) { - sadb_drain_torchq(ip_q, mq); - mq = NULL; - } age_pair_peer_list(haspeerlist, sp, B_TRUE); @@ -5227,7 +4811,7 @@ update_pairing(ipsap_t *ipsapp, ipsa_query_t *sq, keysock_in_t *ksi, static ipsacq_t * sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, uint32_t *src, uint32_t *dst, uint32_t *isrc, uint32_t *idst, - uint64_t unique_id, cred_t *cr) + uint64_t unique_id, ts_label_t *tsl) { ipsacq_t *walker; sa_family_t fam; @@ -5257,7 +4841,7 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, (pp == walker->ipsacq_policy) && /* XXX do deep compares of ap/pp? */ (unique_id == walker->ipsacq_unique_id) && - (ipsec_label_match(cr, walker->ipsacq_cred))) + (ipsec_label_match(tsl, walker->ipsacq_tsl))) break; /* everything matched */ mutex_exit(&walker->ipsacq_lock); } @@ -5272,31 +4856,32 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, * send the acquire up.. * * In cases where we need both AH and ESP, add the SA to the ESP ACQUIRE - * list. The ah_add_sa_finish() routines can look at the packet's ipsec_out_t - * and handle this case specially. + * list. The ah_add_sa_finish() routines can look at the packet's attached + * attributes and handle this case specially. 
*/ void -sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) +sadb_acquire(mblk_t *datamp, ip_xmit_attr_t *ixa, boolean_t need_ah, + boolean_t need_esp) { + mblk_t *asyncmp; sadbp_t *spp; sadb_t *sp; ipsacq_t *newbie; iacqf_t *bucket; - mblk_t *datamp = mp->b_cont; mblk_t *extended; ipha_t *ipha = (ipha_t *)datamp->b_rptr; ip6_t *ip6h = (ip6_t *)datamp->b_rptr; uint32_t *src, *dst, *isrc, *idst; - ipsec_policy_t *pp = io->ipsec_out_policy; - ipsec_action_t *ap = io->ipsec_out_act; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; + ipsec_action_t *ap = ixa->ixa_ipsec_action; sa_family_t af; int hashoffset; uint32_t seq; uint64_t unique_id = 0; ipsec_selector_t sel; - boolean_t tunnel_mode = io->ipsec_out_tunnel; - cred_t *cr = NULL; - netstack_t *ns = io->ipsec_out_ns; + boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL) != 0; + ts_label_t *tsl = NULL; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; sadb_sens_t *sens = NULL; int sens_len; @@ -5315,12 +4900,10 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) spp = &ahstack->ah_sadb; } - sp = io->ipsec_out_v4 ? &spp->s_v4 : &spp->s_v6; - - ASSERT(mp->b_cont != NULL); + sp = (ixa->ixa_flags & IXAF_IS_IPV4) ? &spp->s_v4 : &spp->s_v6; if (is_system_labeled()) - cr = msg_getcred(mp->b_cont, NULL); + tsl = ixa->ixa_tsl; if (ap == NULL) ap = pp->ipsp_act; @@ -5328,7 +4911,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) ASSERT(ap != NULL); if (ap->ipa_act.ipa_apply.ipp_use_unique || tunnel_mode) - unique_id = SA_FORM_UNIQUE_ID(io); + unique_id = SA_FORM_UNIQUE_ID(ixa); /* * Set up an ACQUIRE record. 
@@ -5345,14 +4928,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) dst = (uint32_t *)&ipha->ipha_dst; af = AF_INET; hashoffset = OUTBOUND_HASH_V4(sp, ipha->ipha_dst); - ASSERT(io->ipsec_out_v4 == B_TRUE); + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); } else { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); src = (uint32_t *)&ip6h->ip6_src; dst = (uint32_t *)&ip6h->ip6_dst; af = AF_INET6; hashoffset = OUTBOUND_HASH_V6(sp, ip6h->ip6_dst); - ASSERT(io->ipsec_out_v4 == B_FALSE); + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); } if (tunnel_mode) { @@ -5363,14 +4946,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) * with self-encapsulated protection. Until we better * support this, drop the packet. */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(datamp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_got_selfencap), &ipss->ipsec_spd_dropper); return; } /* Snag inner addresses. */ - isrc = io->ipsec_out_insrc; - idst = io->ipsec_out_indst; + isrc = ixa->ixa_ipsec_insrc; + idst = ixa->ixa_ipsec_indst; } else { isrc = idst = NULL; } @@ -5382,7 +4965,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) bucket = &(sp->sdb_acq[hashoffset]); mutex_enter(&bucket->iacqf_lock); newbie = sadb_checkacquire(bucket, ap, pp, src, dst, isrc, idst, - unique_id, cr); + unique_id, tsl); if (newbie == NULL) { /* @@ -5391,7 +4974,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) newbie = kmem_zalloc(sizeof (*newbie), KM_NOSLEEP); if (newbie == NULL) { mutex_exit(&bucket->iacqf_lock); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(datamp, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_nomem), &ipss->ipsec_sadb_dropper); return; @@ -5433,11 +5016,30 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) */ ASSERT(MUTEX_HELD(&newbie->ipsacq_lock)); - mp->b_next = NULL; + /* + * Make the ip_xmit_attr_t into something 
we can queue. + * If no memory it frees datamp. + */ + asyncmp = ip_xmit_attr_to_mblk(ixa); + if (asyncmp != NULL) + linkb(asyncmp, datamp); + /* Queue up packet. Use b_next. */ - if (newbie->ipsacq_numpackets == 0) { + + if (asyncmp == NULL) { + /* Statistics for allocation failure */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + BUMP_MIB(&ixa->ixa_ipst->ips_ip_mib, + ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ixa->ixa_ipst->ips_ip6_mib, + ipIfStatsOutDiscards); + } + ip_drop_output("No memory for asyncmp", datamp, NULL); + freemsg(datamp); + } else if (newbie->ipsacq_numpackets == 0) { /* First one. */ - newbie->ipsacq_mp = mp; + newbie->ipsacq_mp = asyncmp; newbie->ipsacq_numpackets = 1; newbie->ipsacq_expire = gethrestime_sec(); /* @@ -5448,28 +5050,28 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) newbie->ipsacq_seq = seq; newbie->ipsacq_addrfam = af; - newbie->ipsacq_srcport = io->ipsec_out_src_port; - newbie->ipsacq_dstport = io->ipsec_out_dst_port; - newbie->ipsacq_icmp_type = io->ipsec_out_icmp_type; - newbie->ipsacq_icmp_code = io->ipsec_out_icmp_code; + newbie->ipsacq_srcport = ixa->ixa_ipsec_src_port; + newbie->ipsacq_dstport = ixa->ixa_ipsec_dst_port; + newbie->ipsacq_icmp_type = ixa->ixa_ipsec_icmp_type; + newbie->ipsacq_icmp_code = ixa->ixa_ipsec_icmp_code; if (tunnel_mode) { - newbie->ipsacq_inneraddrfam = io->ipsec_out_inaf; - newbie->ipsacq_proto = io->ipsec_out_inaf == AF_INET6 ? + newbie->ipsacq_inneraddrfam = ixa->ixa_ipsec_inaf; + newbie->ipsacq_proto = ixa->ixa_ipsec_inaf == AF_INET6 ? 
IPPROTO_IPV6 : IPPROTO_ENCAP; - newbie->ipsacq_innersrcpfx = io->ipsec_out_insrcpfx; - newbie->ipsacq_innerdstpfx = io->ipsec_out_indstpfx; + newbie->ipsacq_innersrcpfx = ixa->ixa_ipsec_insrcpfx; + newbie->ipsacq_innerdstpfx = ixa->ixa_ipsec_indstpfx; IPSA_COPY_ADDR(newbie->ipsacq_innersrc, - io->ipsec_out_insrc, io->ipsec_out_inaf); + ixa->ixa_ipsec_insrc, ixa->ixa_ipsec_inaf); IPSA_COPY_ADDR(newbie->ipsacq_innerdst, - io->ipsec_out_indst, io->ipsec_out_inaf); + ixa->ixa_ipsec_indst, ixa->ixa_ipsec_inaf); } else { - newbie->ipsacq_proto = io->ipsec_out_proto; + newbie->ipsacq_proto = ixa->ixa_ipsec_proto; } newbie->ipsacq_unique_id = unique_id; - if (cr != NULL) { - crhold(cr); - newbie->ipsacq_cred = cr; + if (ixa->ixa_tsl != NULL) { + label_hold(ixa->ixa_tsl); + newbie->ipsacq_tsl = ixa->ixa_tsl; } } else { /* Scan to the end of the list & insert. */ @@ -5477,13 +5079,16 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) while (lastone->b_next != NULL) lastone = lastone->b_next; - lastone->b_next = mp; + lastone->b_next = asyncmp; if (newbie->ipsacq_numpackets++ == ipsacq_maxpackets) { newbie->ipsacq_numpackets = ipsacq_maxpackets; lastone = newbie->ipsacq_mp; newbie->ipsacq_mp = lastone->b_next; lastone->b_next = NULL; - ip_drop_packet(lastone, B_FALSE, NULL, NULL, + + /* Freeing the async message */ + lastone = ip_xmit_attr_free_mblk(lastone); + ip_drop_packet(lastone, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_toofull), &ipss->ipsec_sadb_dropper); } else { @@ -5518,17 +5123,17 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) * opportunities here in failure cases. */ (void) memset(&sel, 0, sizeof (sel)); - sel.ips_isv4 = io->ipsec_out_v4; + sel.ips_isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0; if (tunnel_mode) { - sel.ips_protocol = (io->ipsec_out_inaf == AF_INET) ? + sel.ips_protocol = (ixa->ixa_ipsec_inaf == AF_INET) ? 
IPPROTO_ENCAP : IPPROTO_IPV6; } else { - sel.ips_protocol = io->ipsec_out_proto; - sel.ips_local_port = io->ipsec_out_src_port; - sel.ips_remote_port = io->ipsec_out_dst_port; + sel.ips_protocol = ixa->ixa_ipsec_proto; + sel.ips_local_port = ixa->ixa_ipsec_src_port; + sel.ips_remote_port = ixa->ixa_ipsec_dst_port; } - sel.ips_icmp_type = io->ipsec_out_icmp_type; - sel.ips_icmp_code = io->ipsec_out_icmp_code; + sel.ips_icmp_type = ixa->ixa_ipsec_icmp_type; + sel.ips_icmp_code = ixa->ixa_ipsec_icmp_code; sel.ips_is_icmp_inv_acq = 0; if (af == AF_INET) { sel.ips_local_addr_v4 = ipha->ipha_src; @@ -5542,13 +5147,13 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) if (extended == NULL) goto punt_extended; - if (cr != NULL) { + if (ixa->ixa_tsl != NULL) { /* * XXX MLS correct condition here? * XXX MLS other credential attributes in acquire? * XXX malloc failure? don't fall back to original? */ - sens = sadb_make_sens_ext(cr, &sens_len); + sens = sadb_make_sens_ext(ixa->ixa_tsl, &sens_len); if (sens == NULL) { freeb(extended); @@ -5585,13 +5190,13 @@ punt_extended: void sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) { - mblk_t *mp; + mblk_t *mp; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT(MUTEX_HELD(acqrec->ipsacq_linklock)); if (acqrec->ipsacq_policy != NULL) { - IPPOL_REFRELE(acqrec->ipsacq_policy, ns); + IPPOL_REFRELE(acqrec->ipsacq_policy); } if (acqrec->ipsacq_act != NULL) { IPACT_REFRELE(acqrec->ipsacq_act); @@ -5602,9 +5207,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) if (acqrec->ipsacq_next != NULL) acqrec->ipsacq_next->ipsacq_ptpn = acqrec->ipsacq_ptpn; - if (acqrec->ipsacq_cred) { - crfree(acqrec->ipsacq_cred); - acqrec->ipsacq_cred = NULL; + if (acqrec->ipsacq_tsl != NULL) { + label_rele(acqrec->ipsacq_tsl); + acqrec->ipsacq_tsl = NULL; } /* @@ -5618,7 +5223,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) mp = acqrec->ipsacq_mp; acqrec->ipsacq_mp = mp->b_next; mp->b_next = NULL; - 
ip_drop_packet(mp, B_FALSE, NULL, NULL, + /* Freeing the async message */ + mp = ip_xmit_attr_free_mblk(mp); + ip_drop_packet(mp, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_timeout), &ipss->ipsec_sadb_dropper); } @@ -5795,24 +5402,23 @@ sadb_action_to_ecomb(uint8_t *start, uint8_t *limit, ipsec_action_t *act, /* ARGSUSED */ int -sadb_sens_len_from_cred(cred_t *cr) +sadb_sens_len_from_label(ts_label_t *tsl) { int baselen = sizeof (sadb_sens_t) + _C_LEN * 4; return (roundup(baselen, sizeof (uint64_t))); } void -sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen) +sadb_sens_from_label(sadb_sens_t *sens, int exttype, ts_label_t *tsl, + int senslen) { uint8_t *bitmap; bslabel_t *sl; - ts_label_t *tsl; /* LINTED */ ASSERT((_C_LEN & 1) == 0); ASSERT((senslen & 7) == 0); - tsl = crgetlabel(cr); sl = label2bslabel(tsl); sens->sadb_sens_exttype = exttype; @@ -5830,14 +5436,14 @@ sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen) } static sadb_sens_t * -sadb_make_sens_ext(cred_t *cr, int *len) +sadb_make_sens_ext(ts_label_t *tsl, int *len) { /* XXX allocation failure? */ - int sens_len = sadb_sens_len_from_cred(cr); + int sens_len = sadb_sens_len_from_label(tsl); sadb_sens_t *sens = kmem_alloc(sens_len, KM_SLEEP); - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, tsl, sens_len); *len = sens_len; @@ -5849,12 +5455,12 @@ sadb_make_sens_ext(cred_t *cr, int *len) * With a special designated "not a label" cred_t ? 
*/ /* ARGSUSED */ -cred_t * -sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap) +ts_label_t * +sadb_label_from_sens(sadb_sens_t *sens, uint64_t *bitmap) { int bitmap_len = SADB_64TO8(sens->sadb_sens_sens_len); bslabel_t sl; - cred_t *cr; + ts_label_t *tsl; if (sens->sadb_sens_integ_level != 0) return (NULL); @@ -5868,13 +5474,13 @@ sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap) bcopy(bitmap, &((_bslabel_impl_t *)&sl)->compartments, bitmap_len); - cr = newcred_from_bslabel(&sl, sens->sadb_sens_dpd, KM_NOSLEEP); - if (cr == NULL) - return (cr); + tsl = labelalloc(&sl, sens->sadb_sens_dpd, KM_NOSLEEP); + if (tsl == NULL) + return (NULL); if (sens->sadb_x_sens_flags & SADB_X_SENS_UNLABELED) - crgetlabel(cr)->tsl_flags |= TSLF_UNLABELED; - return (cr); + tsl->tsl_flags |= TSLF_UNLABELED; + return (tsl); } /* End XXX label-library-leakage */ @@ -6359,12 +5965,13 @@ sadb_getspi(keysock_in_t *ksi, uint32_t master_spi, int *diagnostic, * * Caller frees the message, so we don't have to here. * - * NOTE: The ip_q parameter may be used in the future for ACQUIRE + * NOTE: The pfkey_q parameter may be used in the future for ACQUIRE * failures. */ /* ARGSUSED */ void -sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *ip_q, netstack_t *ns) +sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *pfkey_q, + netstack_t *ns) { int i; ipsacq_t *acqrec; @@ -6624,36 +6231,6 @@ sadb_replay_delete(ipsa_t *assoc) } /* - * Given a queue that presumably points to IP, send a T_BIND_REQ for _proto_ - * down. The caller will handle the T_BIND_ACK locally. 
- */ -boolean_t -sadb_t_bind_req(queue_t *q, int proto) -{ - struct T_bind_req *tbr; - mblk_t *mp; - - mp = allocb_cred(sizeof (struct T_bind_req) + 1, kcred, NOPID); - if (mp == NULL) { - /* cmn_err(CE_WARN, */ - /* "sadb_t_bind_req(%d): couldn't allocate mblk\n", proto); */ - return (B_FALSE); - } - mp->b_datap->db_type = M_PCPROTO; - tbr = (struct T_bind_req *)mp->b_rptr; - mp->b_wptr += sizeof (struct T_bind_req); - tbr->PRIM_type = T_BIND_REQ; - tbr->ADDR_length = 0; - tbr->ADDR_offset = 0; - tbr->CONIND_number = 0; - *mp->b_wptr = (uint8_t)proto; - mp->b_wptr++; - - putnext(q, mp); - return (B_TRUE); -} - -/* * Special front-end to ipsec_rl_strlog() dealing with SA failure. * this is designed to take only a format string with "* %x * %s *", so * that "spi" is printed first, then "addr" is converted using inet_pton(). @@ -6676,7 +6253,6 @@ ipsec_assocfailure(short mid, short sid, char level, ushort_t sl, char *fmt, /* * Fills in a reference to the policy, if any, from the conn, in *ppp - * Releases a reference to the passed conn_t. 
*/ static void ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp) @@ -6684,15 +6260,14 @@ ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp) ipsec_policy_t *pp; ipsec_latch_t *ipl = connp->conn_latch; - if ((ipl != NULL) && (ipl->ipl_out_policy != NULL)) { - pp = ipl->ipl_out_policy; + if ((ipl != NULL) && (connp->conn_ixa->ixa_ipsec_policy != NULL)) { + pp = connp->conn_ixa->ixa_ipsec_policy; IPPOL_REFHOLD(pp); } else { - pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel, + pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, sel, connp->conn_netstack); } *ppp = pp; - CONN_DEC_REF(connp); } /* @@ -6753,6 +6328,7 @@ ipsec_udp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) mutex_exit(&connfp->connf_lock); ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } static conn_t * @@ -6866,6 +6442,7 @@ ipsec_tcp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) } ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } static void @@ -6895,21 +6472,27 @@ ipsec_sctp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, pptr[0] = sel->ips_remote_port; pptr[1] = sel->ips_local_port; + /* + * For labeled systems, there's no need to check the + * label here. It's known to be good as we checked + * before allowing the connection to become bound. 
+ */ if (sel->ips_isv4) { in6_addr_t src, dst; IN6_IPADDR_TO_V4MAPPED(sel->ips_remote_addr_v4, &dst); IN6_IPADDR_TO_V4MAPPED(sel->ips_local_addr_v4, &src); connp = sctp_find_conn(&dst, &src, ports, ALL_ZONES, - ipst->ips_netstack->netstack_sctp); + 0, ipst->ips_netstack->netstack_sctp); } else { connp = sctp_find_conn(&sel->ips_remote_addr_v6, &sel->ips_local_addr_v6, ports, ALL_ZONES, - ipst->ips_netstack->netstack_sctp); + 0, ipst->ips_netstack->netstack_sctp); } if (connp == NULL) return; ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } /* @@ -6985,7 +6568,7 @@ ipsec_get_inverse_acquire_sel(ipsec_selector_t *sel, sadb_address_t *srcext, static int ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, sadb_address_t *innsrcext, sadb_address_t *inndstext, ipsec_tun_pol_t *itp, - int *diagnostic, netstack_t *ns) + int *diagnostic) { int err; ipsec_policy_head_t *polhead; @@ -7045,8 +6628,7 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, polhead = itp->itp_policy; ASSERT(polhead != NULL); rw_enter(&polhead->iph_lock, RW_READER); - *ppp = ipsec_find_policy_head(NULL, polhead, - IPSEC_TYPE_INBOUND, sel, ns); + *ppp = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND, sel); rw_exit(&polhead->iph_lock); /* @@ -7059,6 +6641,10 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, return (0); } +/* + * For sctp conn_faddr is the primary address, hence this is of limited + * use for sctp. 
+ */ static void ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) @@ -7068,7 +6654,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, conn_t *connp; if (isv4) { - connfp = &ipst->ips_ipcl_proto_fanout[sel->ips_protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[sel->ips_protocol]; } else { connfp = &ipst->ips_ipcl_proto_fanout_v6[sel->ips_protocol]; } @@ -7076,17 +6662,20 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (!((isv4 && !((connp->conn_src == 0 || - connp->conn_src == sel->ips_local_addr_v4) && - (connp->conn_rem == 0 || - connp->conn_rem == sel->ips_remote_addr_v4))) || - (!isv4 && !((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, - &sel->ips_local_addr_v6)) && - (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || - IN6_ARE_ADDR_EQUAL(&connp->conn_remv6, - &sel->ips_remote_addr_v6)))))) { - break; + if (isv4) { + if ((connp->conn_laddr_v4 == INADDR_ANY || + connp->conn_laddr_v4 == sel->ips_local_addr_v4) && + (connp->conn_faddr_v4 == INADDR_ANY || + connp->conn_faddr_v4 == sel->ips_remote_addr_v4)) + break; + } else { + if ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &sel->ips_local_addr_v6)) && + (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || + IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &sel->ips_remote_addr_v6))) + break; } } if (connp == NULL) { @@ -7098,6 +6687,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, mutex_exit(&connfp->connf_lock); ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } /* @@ -7245,7 +6835,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], isel.ips_isv4 = (sel.ips_protocol == IPPROTO_ENCAP); } /* Else isel is initialized by ipsec_tun_pol(). 
*/ err = ipsec_tun_pol(&isel, &pp, innsrcext, inndstext, itp, - &diagnostic, ns); + &diagnostic); /* * NOTE: isel isn't used for now, but in RFC 430x IPsec, it * may be. @@ -7263,8 +6853,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], * look in the global policy. */ if (pp == NULL) { - pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, NULL, &sel, - ns); + pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, &sel, ns); if (pp == NULL) { /* There's no global policy. */ err = ENOENT; @@ -7282,7 +6871,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], (itp != NULL && (itp->itp_flags & ITPF_P_TUNNEL)), samsg->sadb_msg_seq, samsg->sadb_msg_pid, sens, ns); if (pp != NULL) { - IPPOL_REFRELE(pp, ns); + IPPOL_REFRELE(pp); } ASSERT(err == 0 && diagnostic == 0); if (retmp == NULL) @@ -7306,37 +6895,49 @@ bail: /* * sadb_set_lpkt: Return TRUE if we can swap in a value to ipsa->ipsa_lpkt and * freemsg the previous value. Return FALSE if we lost the race and the SA is - * in a non-LARVAL state. free clue: ip_drop_packet(NULL) is safe. + * in a non-LARVAL state. We also return FALSE if we can't allocate the attrmp. */ boolean_t -sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, netstack_t *ns) +sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, ip_recv_attr_t *ira) { - mblk_t *opkt; + mblk_t *opkt; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; boolean_t is_larval; - /* - * Check the packet's netstack id in case we go asynch with a - * taskq_dispatch. 
- */ - ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_type == IPSEC_IN); - ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_stackid == - ns->netstack_stackid); - mutex_enter(&ipsa->ipsa_lock); is_larval = (ipsa->ipsa_state == IPSA_STATE_LARVAL); if (is_larval) { - opkt = ipsa->ipsa_lpkt; - ipsa->ipsa_lpkt = npkt; + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + ill_t *ill = ira->ira_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", npkt, ill); + freemsg(npkt); + opkt = NULL; + is_larval = B_FALSE; + } else { + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = npkt; + npkt = attrmp; + opkt = ipsa->ipsa_lpkt; + ipsa->ipsa_lpkt = npkt; + } } else { /* We lost the race. */ opkt = NULL; } mutex_exit(&ipsa->ipsa_lock); - ip_drop_packet(opkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_replace), - &ipss->ipsec_sadb_dropper); + if (opkt != NULL) { + opkt = ip_recv_attr_free_mblk(opkt); + ip_drop_packet(opkt, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_sadb_inlarval_replace), + &ipss->ipsec_sadb_dropper); + } return (is_larval); } @@ -7353,7 +6954,6 @@ sadb_clear_lpkt(ipsa_t *ipsa) opkt = ipsa->ipsa_lpkt; ipsa->ipsa_lpkt = NULL; mutex_exit(&ipsa->ipsa_lock); - return (opkt); } @@ -7361,18 +6961,18 @@ sadb_clear_lpkt(ipsa_t *ipsa) * Buffer a packet that's in IDLE state as set by Solaris Clustering. 
*/ void -sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) +sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, ip_recv_attr_t *ira) { + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; - extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, - sa_family_t, in6_addr_t, in6_addr_t, void *); in6_addr_t *srcaddr = (in6_addr_t *)(&ipsa->ipsa_srcaddr); in6_addr_t *dstaddr = (in6_addr_t *)(&ipsa->ipsa_dstaddr); + mblk_t *mp; ASSERT(ipsa->ipsa_state == IPSA_STATE_IDLE); if (cl_inet_idlesa == NULL) { - ip_drop_packet(bpkt, B_TRUE, NULL, NULL, + ip_drop_packet(bpkt, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_sadb_inidle_overflow), &ipss->ipsec_sadb_dropper); return; @@ -7382,13 +6982,14 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) (ipsa->ipsa_type == SADB_SATYPE_AH) ? IPPROTO_AH : IPPROTO_ESP, ipsa->ipsa_spi, ipsa->ipsa_addrfam, *srcaddr, *dstaddr, NULL); - /* - * Check the packet's netstack id in case we go asynch with a - * taskq_dispatch. 
- */ - ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_type == IPSEC_IN); - ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_stackid == - ns->netstack_stackid); + mp = ip_recv_attr_to_mblk(ira); + if (mp == NULL) { + ip_drop_packet(bpkt, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_sadb_inidle_overflow), + &ipss->ipsec_sadb_dropper); + return; + } + linkb(mp, bpkt); mutex_enter(&ipsa->ipsa_lock); ipsa->ipsa_mblkcnt++; @@ -7399,16 +7000,17 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) ipsa->ipsa_bpkt_tail = bpkt; if (ipsa->ipsa_mblkcnt > SADB_MAX_IDLEPKTS) { mblk_t *tmp; + tmp = ipsa->ipsa_bpkt_head; ipsa->ipsa_bpkt_head = ipsa->ipsa_bpkt_head->b_next; - ip_drop_packet(tmp, B_TRUE, NULL, NULL, + tmp = ip_recv_attr_free_mblk(tmp); + ip_drop_packet(tmp, B_TRUE, NULL, DROPPER(ipss, ipds_sadb_inidle_overflow), &ipss->ipsec_sadb_dropper); ipsa->ipsa_mblkcnt --; } } mutex_exit(&ipsa->ipsa_lock); - } /* @@ -7419,30 +7021,28 @@ void sadb_clear_buf_pkt(void *ipkt) { mblk_t *tmp, *buf_pkt; - netstack_t *ns; - ipsec_in_t *ii; + ip_recv_attr_t iras; buf_pkt = (mblk_t *)ipkt; - ii = (ipsec_in_t *)buf_pkt->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns != NULL && ns != ii->ipsec_in_ns) { - netstack_rele(ns); - ns = NULL; /* For while-loop below. */ - } - while (buf_pkt != NULL) { + mblk_t *data_mp; + tmp = buf_pkt->b_next; buf_pkt->b_next = NULL; - if (ns != NULL) - ip_fanout_proto_again(buf_pkt, NULL, NULL, NULL); - else - freemsg(buf_pkt); + + data_mp = buf_pkt->b_cont; + buf_pkt->b_cont = NULL; + if (!ip_recv_attr_from_mblk(buf_pkt, &iras)) { + /* The ill or ip_stack_t disappeared on us. 
*/ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + } else { + ip_input_post_ipsec(data_mp, &iras); + } + ira_cleanup(&iras, B_TRUE); buf_pkt = tmp; } - if (ns != NULL) - netstack_rele(ns); } /* * Walker callback used by sadb_alg_update() to free/create crypto @@ -7454,6 +7054,8 @@ struct sadb_update_alg_state { ipsec_algtype_t alg_type; uint8_t alg_id; boolean_t is_added; + boolean_t async_auth; + boolean_t async_encr; }; static void @@ -7470,6 +7072,15 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie) mutex_enter(&entry->ipsa_lock); + if ((entry->ipsa_encr_alg != SADB_EALG_NONE && entry->ipsa_encr_alg != + SADB_EALG_NULL && update_state->async_encr) || + (entry->ipsa_auth_alg != SADB_AALG_NONE && + update_state->async_auth)) { + entry->ipsa_flags |= IPSA_F_ASYNC; + } else { + entry->ipsa_flags &= ~IPSA_F_ASYNC; + } + switch (update_state->alg_type) { case IPSEC_ALG_AUTH: if (entry->ipsa_auth_alg == update_state->alg_id) @@ -7511,8 +7122,11 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie) } /* - * Invoked by IP when an software crypto provider has been updated. - * The type and id of the corresponding algorithm is passed as argument. + * Invoked by IP when an software crypto provider has been updated, or if + * the crypto synchrony changes. The type and id of the corresponding + * algorithm is passed as argument. The type is set to ALL in the case of + * a synchrony change. + * * is_added is B_TRUE if the provider was added, B_FALSE if it was * removed. The function updates the SADB and free/creates the * context templates associated with SAs if needed. 
@@ -7529,12 +7143,17 @@ sadb_alg_update(ipsec_algtype_t alg_type, uint8_t alg_id, boolean_t is_added, struct sadb_update_alg_state update_state; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; update_state.alg_type = alg_type; update_state.alg_id = alg_id; update_state.is_added = is_added; + update_state.async_auth = ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == + IPSEC_ALGS_EXEC_ASYNC; + update_state.async_encr = ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == + IPSEC_ALGS_EXEC_ASYNC; - if (alg_type == IPSEC_ALG_AUTH) { + if (alg_type == IPSEC_ALG_AUTH || alg_type == IPSEC_ALG_ALL) { /* walk the AH tables only for auth. algorithm changes */ SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_of); SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_if); @@ -7693,15 +7312,15 @@ ipsec_check_key(crypto_mech_type_t mech_type, sadb_key_t *sadb_key, * * This is inelegant and really could use refactoring. */ -int -sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) +mblk_t * +sadb_whack_label_v4(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter, + ipdropper_t *dropper) { int delta; int plen; dblk_t *db; int hlen; uint8_t *opt_storage = assoc->ipsa_opt_storage; - mblk_t *mp = *mpp; ipha_t *ipha = (ipha_t *)mp->b_rptr; plen = ntohs(ipha->ipha_length); @@ -7731,8 +7350,10 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) - return (ENOMEM); + if (new_mp == NULL) { + ip_drop_packet(mp, B_FALSE, NULL, counter, dropper); + return (NULL); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -7743,7 +7364,7 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) new_mp->b_cont = mp->b_cont; freeb(mp); } - *mpp = mp = new_mp; + mp = new_mp; ipha = (ipha_t *)mp->b_rptr; } @@ -7768,11 +7389,12 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) ipha->ipha_length = htons(plen); - return 
(0); + return (mp); } -int -sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) +mblk_t * +sadb_whack_label_v6(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter, + ipdropper_t *dropper) { int delta; int plen; @@ -7780,7 +7402,6 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) int hlen; uint8_t *opt_storage = assoc->ipsa_opt_storage; uint_t sec_opt_len; /* label option length not including type, len */ - mblk_t *mp = *mpp; ip6_t *ip6h = (ip6_t *)mp->b_rptr; plen = ntohs(ip6h->ip6_plen); @@ -7818,8 +7439,10 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) copylen = hdr_len; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) - return (ENOMEM); + if (new_mp == NULL) { + ip_drop_packet(mp, B_FALSE, NULL, counter, dropper); + return (NULL); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -7830,7 +7453,7 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) new_mp->b_cont = mp->b_cont; freeb(mp); } - *mpp = mp = new_mp; + mp = new_mp; ip6h = (ip6_t *)mp->b_rptr; } @@ -7856,10 +7479,46 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) ip6h->ip6_plen = htons(plen); - return (0); + return (mp); } +/* Whack the labels and update ip_xmit_attr_t as needed */ +mblk_t * +sadb_whack_label(mblk_t *mp, ipsa_t *assoc, ip_xmit_attr_t *ixa, + kstat_named_t *counter, ipdropper_t *dropper) +{ + int adjust; + int iplen; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + iplen = ntohs(ipha->ipha_length); + mp = sadb_whack_label_v4(mp, assoc, counter, dropper); + if (mp == NULL) + return (NULL); + + ipha = (ipha_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + adjust = (int)ntohs(ipha->ipha_length) - iplen; + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + iplen = ntohs(ip6h->ip6_plen); + mp = sadb_whack_label_v6(mp, assoc, counter, dropper); + if (mp == NULL) 
+ return (NULL); + + ip6h = (ip6_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + adjust = (int)ntohs(ip6h->ip6_plen) - iplen; + } + ixa->ixa_pktlen += adjust; + ixa->ixa_ip_hdr_length += adjust; + return (mp); +} /* * If this is an outgoing SA then add some fuzz to the @@ -7969,7 +7628,7 @@ age_pair_peer_list(templist_t *haspeerlist, sadb_t *sp, boolean_t outbound) *((ipaddr_t *)&dying-> ipsa_srcaddr)); } - bucket = &(sp->sdb_of[outhash]); + bucket = &(sp->sdb_of[outhash]); } mutex_enter(&bucket->isaf_lock); diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index 37a9f47432..e6903cefc2 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -37,6 +37,7 @@ #include <sys/strsubr.h> #include <sys/strsun.h> #include <sys/strlog.h> +#include <sys/strsun.h> #include <sys/cmn_err.h> #include <sys/zone.h> @@ -59,7 +60,6 @@ #include <net/pfkeyv2.h> #include <net/pfpolicy.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> @@ -75,16 +75,8 @@ static void ipsec_update_present_flags(ipsec_stack_t *); static ipsec_act_t *ipsec_act_wildcard_expand(ipsec_act_t *, uint_t *, netstack_t *); -static void ipsec_out_free(void *); -static void ipsec_in_free(void *); -static mblk_t *ipsec_attach_global_policy(mblk_t **, conn_t *, - ipsec_selector_t *, netstack_t *); -static mblk_t *ipsec_apply_global_policy(mblk_t *, conn_t *, - ipsec_selector_t *, netstack_t *); static mblk_t *ipsec_check_ipsecin_policy(mblk_t *, ipsec_policy_t *, - ipha_t *, ip6_t *, uint64_t, netstack_t *); -static void ipsec_in_release_refs(ipsec_in_t *); -static void ipsec_out_release_refs(ipsec_out_t *); + ipha_t *, ip6_t *, uint64_t, ip_recv_attr_t *, netstack_t *); static void ipsec_action_free_table(ipsec_action_t *); static void ipsec_action_reclaim(void *); static void ipsec_action_reclaim_stack(netstack_t *); @@ -105,9 +97,9 @@ typedef enum { SELRET_NOMEM, SELRET_BADPKT, SELRET_SUCCESS, 
SELRET_TUNFRAG} static selret_t ipsec_init_inbound_sel(ipsec_selector_t *, mblk_t *, ipha_t *, ip6_t *, uint8_t); -static boolean_t ipsec_check_ipsecin_action(struct ipsec_in_s *, mblk_t *, +static boolean_t ipsec_check_ipsecin_action(ip_recv_attr_t *, mblk_t *, struct ipsec_action_s *, ipha_t *ipha, ip6_t *ip6h, const char **, - kstat_named_t **); + kstat_named_t **, netstack_t *); static void ipsec_unregister_prov_update(void); static void ipsec_prov_update_callback_stack(uint32_t, void *, netstack_t *); static boolean_t ipsec_compare_action(ipsec_policy_t *, ipsec_policy_t *); @@ -117,15 +109,13 @@ static void ipsec_kstat_destroy(ipsec_stack_t *); static int ipsec_free_tables(ipsec_stack_t *); static int tunnel_compare(const void *, const void *); static void ipsec_freemsg_chain(mblk_t *); -static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *, +static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, struct kstat_named *, ipdropper_t *); static boolean_t ipsec_kstat_init(ipsec_stack_t *); static void ipsec_kstat_destroy(ipsec_stack_t *); static int ipsec_free_tables(ipsec_stack_t *); static int tunnel_compare(const void *, const void *); static void ipsec_freemsg_chain(mblk_t *); -static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *, - struct kstat_named *, ipdropper_t *); /* * Selector hash table is statically sized at module load time. 
@@ -150,16 +140,15 @@ static crypto_notify_handle_t prov_update_handle = NULL; static kmem_cache_t *ipsec_action_cache; static kmem_cache_t *ipsec_sel_cache; static kmem_cache_t *ipsec_pol_cache; -static kmem_cache_t *ipsec_info_cache; /* Frag cache prototypes */ -static void ipsec_fragcache_clean(ipsec_fragcache_t *); +static void ipsec_fragcache_clean(ipsec_fragcache_t *, ipsec_stack_t *); static ipsec_fragcache_entry_t *fragcache_delentry(int, - ipsec_fragcache_entry_t *, ipsec_fragcache_t *); + ipsec_fragcache_entry_t *, ipsec_fragcache_t *, ipsec_stack_t *); boolean_t ipsec_fragcache_init(ipsec_fragcache_t *); -void ipsec_fragcache_uninit(ipsec_fragcache_t *); -mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *, int, - ipsec_stack_t *); +void ipsec_fragcache_uninit(ipsec_fragcache_t *, ipsec_stack_t *ipss); +mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *, + int, ipsec_stack_t *); int ipsec_hdr_pullup_needed = 0; int ipsec_weird_null_inbound_policy = 0; @@ -240,23 +229,28 @@ ipsec_freemsg_chain(mblk_t *mp) ASSERT(mp->b_prev == NULL); mpnext = mp->b_next; mp->b_next = NULL; - freemsg(mp); /* Always works, even if NULL */ + freemsg(mp); mp = mpnext; } } -/* ip_drop all messages in an mblk chain */ +/* + * ip_drop all messages in an mblk chain + * Can handle a b_next chain of ip_recv_attr_t mblks, or just a b_next chain + * of data. 
+ */ static void -ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *arriving, - ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called) +ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *ill, + struct kstat_named *counter, ipdropper_t *who_called) { mblk_t *mpnext; while (mp != NULL) { ASSERT(mp->b_prev == NULL); mpnext = mp->b_next; mp->b_next = NULL; - ip_drop_packet(mp, inbound, arriving, outbound_ire, counter, - who_called); + if (ip_recv_attr_is_mblk(mp)) + mp = ip_recv_attr_free_mblk(mp); + ip_drop_packet(mp, inbound, ill, counter, who_called); mp = mpnext; } } @@ -287,7 +281,7 @@ ipsec_policy_cmpbyid(const void *a, const void *b) * ipsl_sel (selector set), so an entry with a NULL ipsp_sel is not * actually in-tree but rather a template node being used in * an avl_find query; see ipsec_policy_delete(). This gives us - * a placeholder in the ordering just before the the first entry with + * a placeholder in the ordering just before the first entry with * a key >= the one we're looking for, so we can walk forward from * that point to get the remaining entries with the same id. 
*/ @@ -443,7 +437,6 @@ ipsec_policy_g_destroy(void) kmem_cache_destroy(ipsec_action_cache); kmem_cache_destroy(ipsec_sel_cache); kmem_cache_destroy(ipsec_pol_cache); - kmem_cache_destroy(ipsec_info_cache); ipsec_unregister_prov_update(); @@ -693,9 +686,6 @@ ipsec_policy_g_init(void) ipsec_pol_cache = kmem_cache_create("ipsec_policy", sizeof (ipsec_policy_t), _POINTER_ALIGNMENT, NULL, NULL, NULL, NULL, NULL, 0); - ipsec_info_cache = kmem_cache_create("ipsec_info", - sizeof (ipsec_info_t), _POINTER_ALIGNMENT, NULL, NULL, - NULL, NULL, NULL, 0); /* * We want to be informed each time a stack is created or @@ -920,6 +910,7 @@ ipsec_copy_policy(const ipsec_policy_t *src) src->ipsp_sel->ipsl_refs++; HASH_NULL(dst, ipsp_hash); + dst->ipsp_netstack = src->ipsp_netstack; dst->ipsp_refs = 1; dst->ipsp_sel = src->ipsp_sel; dst->ipsp_act = src->ipsp_act; @@ -1469,7 +1460,7 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af) bzero(req, sizeof (*req)); - mutex_enter(&connp->conn_lock); + ASSERT(MUTEX_HELD(&connp->conn_lock)); ipl = connp->conn_latch; /* @@ -1478,20 +1469,20 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af) * look at configured policy. */ if (ipl != NULL) { - if (ipl->ipl_in_action != NULL) { - rv = ipsec_req_from_act(ipl->ipl_in_action, req); + if (connp->conn_latch_in_action != NULL) { + rv = ipsec_req_from_act(connp->conn_latch_in_action, + req); goto done; } - if (ipl->ipl_in_policy != NULL) { - rv = ipsec_req_from_act(ipl->ipl_in_policy->ipsp_act, - req); + if (connp->conn_latch_in_policy != NULL) { + rv = ipsec_req_from_act( + connp->conn_latch_in_policy->ipsp_act, req); goto done; } } if (connp->conn_policy != NULL) rv = ipsec_req_from_head(connp->conn_policy, req, af); done: - mutex_exit(&connp->conn_lock); return (rv); } @@ -1502,66 +1493,18 @@ ipsec_actvec_free(ipsec_act_t *act, uint_t nact) } /* - * When outbound policy is not cached, look it up the hard way and attach - * an ipsec_out_t to the packet.. 
- */ -static mblk_t * -ipsec_attach_global_policy(mblk_t **mp, conn_t *connp, ipsec_selector_t *sel, - netstack_t *ns) -{ - ipsec_policy_t *p; - - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel, ns); - - if (p == NULL) - return (NULL); - return (ipsec_attach_ipsec_out(mp, connp, p, sel->ips_protocol, ns)); -} - -/* - * We have an ipsec_out already, but don't have cached policy; fill it in - * with the right actions. - */ -static mblk_t * -ipsec_apply_global_policy(mblk_t *ipsec_mp, conn_t *connp, - ipsec_selector_t *sel, netstack_t *ns) -{ - ipsec_out_t *io; - ipsec_policy_t *p; - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ASSERT(ipsec_mp->b_cont->b_datap->db_type == M_DATA); - - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - if (io->ipsec_out_policy == NULL) { - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, io, sel, ns); - io->ipsec_out_policy = p; - } - return (ipsec_mp); -} - - -/* * Consumes a reference to ipsp. */ static mblk_t * -ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, +ipsec_check_loopback_policy(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_policy_t *ipsp) { - mblk_t *ipsec_mp; - ipsec_in_t *ii; - netstack_t *ns; - - if (!mctl_present) - return (first_mp); + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) + return (data_mp); - ipsec_mp = first_mp; + ASSERT(ira->ira_flags & IRAF_LOOPBACK); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ns = ii->ipsec_in_ns; - ASSERT(ii->ipsec_in_loopback); - IPPOL_REFRELE(ipsp, ns); + IPPOL_REFRELE(ipsp); /* * We should do an actual policy check here. Revisit this @@ -1569,7 +1512,7 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, * get there.) */ - return (first_mp); + return (data_mp); } /* @@ -1577,20 +1520,19 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, * expected by the SAs it traversed on the way in. 
*/ static boolean_t -ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason, - kstat_named_t **counter, uint64_t pkt_unique) +ipsec_check_ipsecin_unique(ip_recv_attr_t *ira, const char **reason, + kstat_named_t **counter, uint64_t pkt_unique, netstack_t *ns) { uint64_t ah_mask, esp_mask; ipsa_t *ah_assoc; ipsa_t *esp_assoc; - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; - ASSERT(ii->ipsec_in_secure); - ASSERT(!ii->ipsec_in_loopback); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(!(ira->ira_flags & IRAF_LOOPBACK)); - ah_assoc = ii->ipsec_in_ah_sa; - esp_assoc = ii->ipsec_in_esp_sa; + ah_assoc = ira->ira_ipsec_ah_sa; + esp_assoc = ira->ira_ipsec_esp_sa; ASSERT((ah_assoc != NULL) || (esp_assoc != NULL)); ah_mask = (ah_assoc != NULL) ? ah_assoc->ipsa_unique_mask : 0; @@ -1621,30 +1563,30 @@ ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason, } static boolean_t -ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, - ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter) +ipsec_check_ipsecin_action(ip_recv_attr_t *ira, mblk_t *mp, ipsec_action_t *ap, + ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter, + netstack_t *ns) { boolean_t ret = B_TRUE; ipsec_prot_t *ipp; ipsa_t *ah_assoc; ipsa_t *esp_assoc; boolean_t decaps; - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT((ipha == NULL && ip6h != NULL) || (ip6h == NULL && ipha != NULL)); - if (ii->ipsec_in_loopback) { + if (ira->ira_flags & IRAF_LOOPBACK) { /* * Besides accepting pointer-equivalent actions, we also * accept any ICMP errors we generated for ourselves, * regardless of policy. If we do not wish to make this * assumption in the future, check here, and where - * icmp_loopback is initialized in ip.c and ip6.c. (Look for - * ipsec_out_icmp_loopback.) + * IXAF_TRUSTED_ICMP is initialized in ip.c and ip6.c. 
*/ - if (ap == ii->ipsec_in_action || ii->ipsec_in_icmp_loopback) + if (ap == ira->ira_ipsec_action || + (ira->ira_flags & IRAF_TRUSTED_ICMP)) return (B_TRUE); /* Deep compare necessary here?? */ @@ -1652,12 +1594,13 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, *reason = "loopback policy mismatch"; return (B_FALSE); } - ASSERT(!ii->ipsec_in_icmp_loopback); + ASSERT(!(ira->ira_flags & IRAF_TRUSTED_ICMP)); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - ah_assoc = ii->ipsec_in_ah_sa; - esp_assoc = ii->ipsec_in_esp_sa; + ah_assoc = ira->ira_ipsec_ah_sa; + esp_assoc = ira->ira_ipsec_esp_sa; - decaps = ii->ipsec_in_decaps; + decaps = (ira->ira_flags & IRAF_IPSEC_DECAPS); switch (ap->ipa_act.ipa_type) { case IPSEC_ACT_DISCARD: @@ -1744,10 +1687,10 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, } } } else if (esp_assoc != NULL) { - /* - * Don't allow this. Check IPSEC NOTE above - * ip_fanout_proto(). - */ + /* + * Don't allow this. Check IPSEC NOTE above + * ip_fanout_proto(). + */ *counter = DROPPER(ipss, ipds_spd_got_esp); *reason = "unexpected ESP"; ret = B_FALSE; @@ -1777,17 +1720,18 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, ret = B_FALSE; break; } - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* * This can happen if we do a double policy-check on * a packet * XXX XXX should fix this case! 
*/ - IPACT_REFRELE(ii->ipsec_in_action); + IPACT_REFRELE(ira->ira_ipsec_action); } - ASSERT(ii->ipsec_in_action == NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_action == NULL); IPACT_REFHOLD(ap); - ii->ipsec_in_action = ap; + ira->ira_ipsec_action = ap; break; /* from switch */ } return (ret); @@ -1818,9 +1762,9 @@ static uint64_t conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h) { ipsec_selector_t sel; - uint8_t ulp = connp->conn_ulp; + uint8_t ulp = connp->conn_proto; - ASSERT(connp->conn_latch->ipl_in_policy != NULL); + ASSERT(connp->conn_latch_in_policy != NULL); if ((ulp == IPPROTO_TCP || ulp == IPPROTO_UDP || ulp == IPPROTO_SCTP) && (connp->conn_fport == 0 || connp->conn_lport == 0)) { @@ -1839,46 +1783,51 @@ conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h) SELRET_SUCCESS) { ASSERT(sel.ips_local_port == connp->conn_lport); ASSERT(sel.ips_remote_port == connp->conn_fport); - ASSERT(sel.ips_protocol == connp->conn_ulp); + ASSERT(sel.ips_protocol == connp->conn_proto); } - ASSERT(connp->conn_ulp != 0); + ASSERT(connp->conn_proto != 0); #endif return (SA_UNIQUE_ID(connp->conn_fport, connp->conn_lport, ulp, 0)); } /* - * Called to check policy on a latched connection, both from this file - * and from tcp.c + * Called to check policy on a latched connection. + * Note that we don't dereference conn_latch or conn_ihere since the conn might + * be closing. The caller passes a held ipsec_latch_t instead. 
*/ -boolean_t -ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, - ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter, - conn_t *connp) +static boolean_t +ipsec_check_ipsecin_latch(ip_recv_attr_t *ira, mblk_t *mp, ipsec_latch_t *ipl, + ipsec_action_t *ap, ipha_t *ipha, ip6_t *ip6h, const char **reason, + kstat_named_t **counter, conn_t *connp, netstack_t *ns) { - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT(ipl->ipl_ids_latched == B_TRUE); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - if (!ii->ipsec_in_loopback) { + if (!(ira->ira_flags & IRAF_LOOPBACK)) { /* * Over loopback, there aren't real security associations, * so there are neither identities nor "unique" values * for us to check the packet against. */ - if ((ii->ipsec_in_ah_sa != NULL) && - (!spd_match_inbound_ids(ipl, ii->ipsec_in_ah_sa))) { - *counter = DROPPER(ipss, ipds_spd_ah_badid); - *reason = "AH identity mismatch"; - return (B_FALSE); + if (ira->ira_ipsec_ah_sa != NULL) { + if (!spd_match_inbound_ids(ipl, + ira->ira_ipsec_ah_sa)) { + *counter = DROPPER(ipss, ipds_spd_ah_badid); + *reason = "AH identity mismatch"; + return (B_FALSE); + } } - if ((ii->ipsec_in_esp_sa != NULL) && - (!spd_match_inbound_ids(ipl, ii->ipsec_in_esp_sa))) { - *counter = DROPPER(ipss, ipds_spd_esp_badid); - *reason = "ESP identity mismatch"; - return (B_FALSE); + if (ira->ira_ipsec_esp_sa != NULL) { + if (!spd_match_inbound_ids(ipl, + ira->ira_ipsec_esp_sa)) { + *counter = DROPPER(ipss, ipds_spd_esp_badid); + *reason = "ESP identity mismatch"; + return (B_FALSE); + } } /* @@ -1886,14 +1835,13 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, * In DEBUG kernels (see conn_to_unique()'s implementation), * verify this even if it REALLY slows things down. 
*/ - if (!ipsec_check_ipsecin_unique(ii, reason, counter, - conn_to_unique(connp, mp, ipha, ip6h))) { + if (!ipsec_check_ipsecin_unique(ira, reason, counter, + conn_to_unique(connp, mp, ipha, ip6h), ns)) { return (B_FALSE); } } - - return (ipsec_check_ipsecin_action(ii, mp, ipl->ipl_in_action, - ipha, ip6h, reason, counter)); + return (ipsec_check_ipsecin_action(ira, mp, ap, ipha, ip6h, reason, + counter, ns)); } /* @@ -1903,52 +1851,48 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, * Called from ipsec_check_global_policy, and ipsec_check_inbound_policy. * * Consumes a reference to ipsp. + * Returns the mblk if ok. */ static mblk_t * -ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp, - ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, netstack_t *ns) +ipsec_check_ipsecin_policy(mblk_t *data_mp, ipsec_policy_t *ipsp, + ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, ip_recv_attr_t *ira, + netstack_t *ns) { - ipsec_in_t *ii; ipsec_action_t *ap; const char *reason = "no policy actions found"; - mblk_t *data_mp, *ipsec_mp; - ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; + ipsec_stack_t *ipss = ns->netstack_ipsec; kstat_named_t *counter; counter = DROPPER(ipss, ipds_spd_got_secure); - data_mp = first_mp->b_cont; - ipsec_mp = first_mp; - ASSERT(ipsp != NULL); ASSERT((ipha == NULL && ip6h != NULL) || (ip6h == NULL && ipha != NULL)); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; + if (ira->ira_flags & IRAF_LOOPBACK) + return (ipsec_check_loopback_policy(data_mp, ira, ipsp)); - if (ii->ipsec_in_loopback) - return (ipsec_check_loopback_policy(first_mp, B_TRUE, ipsp)); - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ASSERT(ii->ipsec_in_secure); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* * this can happen if we do a double policy-check on a packet * Would be nice to be able to delete this test.. 
*/ - IPACT_REFRELE(ii->ipsec_in_action); + IPACT_REFRELE(ira->ira_ipsec_action); } - ASSERT(ii->ipsec_in_action == NULL); + ASSERT(ira->ira_ipsec_action == NULL); - if (!SA_IDS_MATCH(ii->ipsec_in_ah_sa, ii->ipsec_in_esp_sa)) { + if (!SA_IDS_MATCH(ira->ira_ipsec_ah_sa, ira->ira_ipsec_esp_sa)) { reason = "inbound AH and ESP identities differ"; counter = DROPPER(ipss, ipds_spd_ahesp_diffid); goto drop; } - if (!ipsec_check_ipsecin_unique(ii, &reason, &counter, pkt_unique)) + if (!ipsec_check_ipsecin_unique(ira, &reason, &counter, pkt_unique, + ns)) goto drop; /* @@ -1957,21 +1901,21 @@ ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp, */ for (ap = ipsp->ipsp_act; ap != NULL; ap = ap->ipa_next) { - if (ipsec_check_ipsecin_action(ii, data_mp, ap, - ipha, ip6h, &reason, &counter)) { + if (ipsec_check_ipsecin_action(ira, data_mp, ap, + ipha, ip6h, &reason, &counter, ns)) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPPOL_REFRELE(ipsp, ns); - return (first_mp); + IPPOL_REFRELE(ipsp); + return (data_mp); } } drop: ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "ipsec inbound policy mismatch: %s, packet dropped\n", reason); - IPPOL_REFRELE(ipsp, ns); - ASSERT(ii->ipsec_in_action == NULL); + IPPOL_REFRELE(ipsp); + ASSERT(ira->ira_ipsec_action == NULL); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); return (NULL); } @@ -2075,7 +2019,7 @@ ipsec_find_policy_chain(ipsec_policy_t *best, ipsec_policy_t *chain, */ ipsec_policy_t * ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, - int direction, ipsec_selector_t *sel, netstack_t *ns) + int direction, ipsec_selector_t *sel) { ipsec_policy_t *curbest; ipsec_policy_root_t *root; @@ -2121,7 +2065,7 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, IPPOL_REFHOLD(curbest); if (best != NULL) { - IPPOL_REFRELE(best, ns); + 
IPPOL_REFRELE(best); } } @@ -2139,20 +2083,17 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, * reference when done. */ ipsec_policy_t * -ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io, - ipsec_selector_t *sel, netstack_t *ns) +ipsec_find_policy(int direction, const conn_t *connp, ipsec_selector_t *sel, + netstack_t *ns) { ipsec_policy_t *p; ipsec_stack_t *ipss = ns->netstack_ipsec; p = ipsec_find_policy_head(NULL, &ipss->ipsec_system_policy, - direction, sel, ns); + direction, sel); if ((connp != NULL) && (connp->conn_policy != NULL)) { p = ipsec_find_policy_head(p, connp->conn_policy, - direction, sel, ns); - } else if ((io != NULL) && (io->ipsec_out_polhead != NULL)) { - p = ipsec_find_policy_head(p, io->ipsec_out_polhead, - direction, sel, ns); + direction, sel); } return (p); @@ -2172,21 +2113,16 @@ ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io, * floor. */ mblk_t * -ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, - ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present, netstack_t *ns) +ipsec_check_global_policy(mblk_t *data_mp, conn_t *connp, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, netstack_t *ns) { ipsec_policy_t *p; ipsec_selector_t sel; - mblk_t *data_mp, *ipsec_mp; boolean_t policy_present; kstat_named_t *counter; - ipsec_in_t *ii = NULL; uint64_t pkt_unique; - ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; - - data_mp = mctl_present ? first_mp->b_cont : first_mp; - ipsec_mp = mctl_present ? 
first_mp : NULL; + ipsec_stack_t *ipss = ns->netstack_ipsec; sel.ips_is_icmp_inv_acq = 0; @@ -2203,13 +2139,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * No global policy and no per-socket policy; * just pass it back (but we shouldn't get here in that case) */ - return (first_mp); - } - - if (ipsec_mp != NULL) { - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ii = (ipsec_in_t *)(ipsec_mp->b_rptr); - ASSERT(ii->ipsec_in_type == IPSEC_IN); + return (data_mp); } /* @@ -2217,32 +2147,11 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * Otherwise consult system policy. */ if ((connp != NULL) && (connp->conn_latch != NULL)) { - p = connp->conn_latch->ipl_in_policy; + p = connp->conn_latch_in_policy; if (p != NULL) { IPPOL_REFHOLD(p); } /* - * The caller may have mistakenly assigned an ip6i_t as the - * ip6h for this packet, so take that corner-case into - * account. - */ - if (ip6h != NULL && ip6h->ip6_nxt == IPPROTO_RAW) { - ip6h++; - /* First check for bizarro split-mblk headers. */ - if ((uintptr_t)ip6h > (uintptr_t)data_mp->b_wptr || - ((uintptr_t)ip6h) + sizeof (ip6_t) > - (uintptr_t)data_mp->b_wptr) { - ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "ipsec_check_global_policy", ipha, ip6h, - B_TRUE, ns); - counter = DROPPER(ipss, ipds_spd_nomem); - goto fail; - } - /* Next, see if ip6i is at the end of an mblk. */ - if (ip6h == (ip6_t *)data_mp->b_wptr) - ip6h = (ip6_t *)data_mp->b_cont->b_rptr; - } - /* * Fudge sel for UNIQUE_ID setting below. */ pkt_unique = conn_to_unique(connp, data_mp, ipha, ip6h); @@ -2271,20 +2180,19 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * local policy alone. 
*/ - p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel, - ns); + p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns); pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port, sel.ips_local_port, sel.ips_protocol, 0); } if (p == NULL) { - if (ipsec_mp == NULL) { + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * We have no policy; default to succeeding. * XXX paranoid system design doesn't do this. */ BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - return (first_mp); + return (data_mp); } else { counter = DROPPER(ipss, ipds_spd_got_secure); ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, @@ -2293,16 +2201,16 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, goto fail; } } - if ((ii != NULL) && (ii->ipsec_in_secure)) { - return (ipsec_check_ipsecin_policy(ipsec_mp, p, ipha, ip6h, - pkt_unique, ns)); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + return (ipsec_check_ipsecin_policy(data_mp, p, ipha, ip6h, + pkt_unique, ira, ns)); } if (p->ipsp_act->ipa_allow_clear) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPPOL_REFRELE(p, ns); - return (first_mp); + IPPOL_REFRELE(p); + return (data_mp); } - IPPOL_REFRELE(p, ns); + IPPOL_REFRELE(p); /* * If we reach here, we will drop the packet because it failed the * global policy check because the packet was cleartext, and it @@ -2313,7 +2221,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, counter = DROPPER(ipss, ipds_spd_got_clear); fail: - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); @@ -2435,7 +2343,7 @@ ipsec_inbound_accept_clear(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h) case ICMP_FRAGMENTATION_NEEDED: /* * Be in sync with icmp_inbound, where we have - * already set ire_max_frag. 
+ * already set dce_pmtu */ #ifdef FRAGCACHE_DEBUG cmn_err(CE_WARN, "ICMP frag needed\n"); @@ -2496,27 +2404,44 @@ ipsec_latch_ids(ipsec_latch_t *ipl, ipsid_t *local, ipsid_t *remote) } void -ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii) +ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira) { ipsa_t *sa; + ipsec_latch_t *ipl = connp->conn_latch; if (!ipl->ipl_ids_latched) { ipsid_t *local = NULL; ipsid_t *remote = NULL; - if (!ii->ipsec_in_loopback) { - if (ii->ipsec_in_esp_sa != NULL) - sa = ii->ipsec_in_esp_sa; + if (!(ira->ira_flags & IRAF_LOOPBACK)) { + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + if (ira->ira_ipsec_esp_sa != NULL) + sa = ira->ira_ipsec_esp_sa; else - sa = ii->ipsec_in_ah_sa; + sa = ira->ira_ipsec_ah_sa; ASSERT(sa != NULL); local = sa->ipsa_dst_cid; remote = sa->ipsa_src_cid; } ipsec_latch_ids(ipl, local, remote); } - ipl->ipl_in_action = ii->ipsec_in_action; - IPACT_REFHOLD(ipl->ipl_in_action); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (connp->conn_latch_in_action != NULL) { + /* + * Previously cached action. This is probably + * harmless, but in DEBUG kernels, check for + * action equality. + * + * Preserve the existing action to preserve latch + * invariance. + */ + ASSERT(connp->conn_latch_in_action == + ira->ira_ipsec_action); + return; + } + connp->conn_latch_in_action = ira->ira_ipsec_action; + IPACT_REFHOLD(connp->conn_latch_in_action); + } } /* @@ -2527,27 +2452,25 @@ ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii) * see also ipsec_check_ipsecin_latch() and ipsec_check_global_policy() */ mblk_t * -ipsec_check_inbound_policy(mblk_t *first_mp, conn_t *connp, - ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present) +ipsec_check_inbound_policy(mblk_t *mp, conn_t *connp, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - boolean_t ret; - mblk_t *mp = mctl_present ? first_mp->b_cont : first_mp; - mblk_t *ipsec_mp = mctl_present ? 
first_mp : NULL; - ipsec_latch_t *ipl; - uint64_t unique_id; + boolean_t ret; + ipsec_latch_t *ipl; + ipsec_action_t *ap; + uint64_t unique_id; ipsec_stack_t *ipss; ip_stack_t *ipst; netstack_t *ns; ipsec_policy_head_t *policy_head; + ipsec_policy_t *p = NULL; ASSERT(connp != NULL); ns = connp->conn_netstack; ipss = ns->netstack_ipsec; ipst = ns->netstack_ip; - if (ipsec_mp == NULL) { -clear: + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * This is the case where the incoming datagram is * cleartext and we need to see whether this client @@ -2559,49 +2482,49 @@ clear: mutex_enter(&connp->conn_lock); if (connp->conn_state_flags & CONN_CONDEMNED) { mutex_exit(&connp->conn_lock); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_spd_got_clear), + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); } - if ((ipl = connp->conn_latch) != NULL) { + if (connp->conn_latch != NULL) { /* Hold a reference in case the conn is closing */ - IPLATCH_REFHOLD(ipl); + p = connp->conn_latch_in_policy; + if (p != NULL) + IPPOL_REFHOLD(p); mutex_exit(&connp->conn_lock); /* * Policy is cached in the conn. 
*/ - if ((ipl->ipl_in_policy != NULL) && - (!ipl->ipl_in_policy->ipsp_act->ipa_allow_clear)) { + if (p != NULL && !p->ipsp_act->ipa_allow_clear) { ret = ipsec_inbound_accept_clear(mp, ipha, ip6h); if (ret) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPPOL_REFRELE(p); + return (mp); } else { ipsec_log_policy_failure( IPSEC_POLICY_MISMATCH, "ipsec_check_inbound_policy", ipha, ip6h, B_FALSE, ns); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, + ip_drop_packet(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - IPLATCH_REFRELE(ipl, ns); + IPPOL_REFRELE(p); return (NULL); } } else { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + if (p != NULL) + IPPOL_REFRELE(p); + return (mp); } } else { - uchar_t db_type; policy_head = connp->conn_policy; /* Hold a reference in case the conn is closing */ @@ -2611,50 +2534,22 @@ clear: /* * As this is a non-hardbound connection we need * to look at both per-socket policy and global - * policy. As this is cleartext, mark the mp as - * M_DATA in case if it is an ICMP error being - * reported before calling ipsec_check_global_policy - * so that it does not mistake it for IPSEC_IN. + * policy. */ - db_type = mp->b_datap->db_type; - mp->b_datap->db_type = M_DATA; - first_mp = ipsec_check_global_policy(first_mp, connp, - ipha, ip6h, mctl_present, ns); + mp = ipsec_check_global_policy(mp, connp, + ipha, ip6h, ira, ns); if (policy_head != NULL) IPPH_REFRELE(policy_head, ns); - if (first_mp != NULL) - mp->b_datap->db_type = db_type; - return (first_mp); + return (mp); } } - /* - * If it is inbound check whether the attached message - * is secure or not. We have a special case for ICMP, - * where we have a IPSEC_IN message and the attached - * message is not secure. See icmp_inbound_error_fanout - * for details. 
- */ - ASSERT(ipsec_mp != NULL); - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - if (!ii->ipsec_in_secure) - goto clear; - - /* - * mp->b_cont could be either a M_CTL message - * for icmp errors being sent up or a M_DATA message. - */ - ASSERT(mp->b_datap->db_type == M_CTL || mp->b_datap->db_type == M_DATA); - - ASSERT(ii->ipsec_in_type == IPSEC_IN); mutex_enter(&connp->conn_lock); /* Connection is closing */ if (connp->conn_state_flags & CONN_CONDEMNED) { mutex_exit(&connp->conn_lock); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_spd_got_clear), + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); @@ -2679,58 +2574,64 @@ clear: * policy. It will check against conn or global * depending on whichever is stronger. */ - retmp = ipsec_check_global_policy(first_mp, connp, - ipha, ip6h, mctl_present, ns); + retmp = ipsec_check_global_policy(mp, connp, + ipha, ip6h, ira, ns); if (policy_head != NULL) IPPH_REFRELE(policy_head, ns); return (retmp); } IPLATCH_REFHOLD(ipl); + /* Hold reference on conn_latch_in_action in case conn is closing */ + ap = connp->conn_latch_in_action; + if (ap != NULL) + IPACT_REFHOLD(ap); mutex_exit(&connp->conn_lock); - if (ipl->ipl_in_action != NULL) { + if (ap != NULL) { /* Policy is cached & latched; fast(er) path */ const char *reason; kstat_named_t *counter; - if (ipsec_check_ipsecin_latch(ii, mp, ipl, - ipha, ip6h, &reason, &counter, connp)) { + if (ipsec_check_ipsecin_latch(ira, mp, ipl, ap, + ipha, ip6h, &reason, &counter, connp, ns)) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPLATCH_REFRELE(ipl); + IPACT_REFRELE(ap); + return (mp); } ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "ipsec inbound policy mismatch: %s, packet dropped\n", reason); - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 
counter, + ip_drop_packet(mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - IPLATCH_REFRELE(ipl, ns); + IPLATCH_REFRELE(ipl); + IPACT_REFRELE(ap); return (NULL); - } else if (ipl->ipl_in_policy == NULL) { + } + if ((p = connp->conn_latch_in_policy) == NULL) { ipsec_weird_null_inbound_policy++; - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPLATCH_REFRELE(ipl); + return (mp); } unique_id = conn_to_unique(connp, mp, ipha, ip6h); - IPPOL_REFHOLD(ipl->ipl_in_policy); - first_mp = ipsec_check_ipsecin_policy(first_mp, ipl->ipl_in_policy, - ipha, ip6h, unique_id, ns); + IPPOL_REFHOLD(p); + mp = ipsec_check_ipsecin_policy(mp, p, ipha, ip6h, unique_id, ira, ns); /* * NOTE: ipsecIn{Failed,Succeeeded} bumped by * ipsec_check_ipsecin_policy(). */ - if (first_mp != NULL) - ipsec_latch_inbound(ipl, ii); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + if (mp != NULL) + ipsec_latch_inbound(connp, ira); + IPLATCH_REFRELE(ipl); + return (mp); } /* - * Handle all sorts of cases like tunnel-mode, ICMP, and ip6i prepending. + * Handle all sorts of cases like tunnel-mode and ICMP. */ static int prepended_length(mblk_t *mp, uintptr_t hptr) @@ -2779,19 +2680,24 @@ prepended_length(mblk_t *mp, uintptr_t hptr) * should put this packet in a fragment-gathering queue. * Only returned if SEL_TUNNEL_MODE and SEL_PORT_POLICY * is set. + * + * Note that ipha/ip6h can be in a different mblk (mp->b_cont) in the case + * of tunneled packets. + * Also, mp->b_rptr can be an ICMP error where ipha/ip6h is the packet in + * error past the ICMP error. */ static selret_t ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint8_t sel_flags) { uint16_t *ports; - int outer_hdr_len = 0; /* For ICMP, tunnel-mode, or ip6i cases... */ + int outer_hdr_len = 0; /* For ICMP or tunnel-mode cases... 
*/ ushort_t hdr_len; mblk_t *spare_mp = NULL; uint8_t *nexthdrp, *transportp; uint8_t nexthdr; uint8_t icmp_proto; - ip6_pkt_t ipp; + ip_pkt_t ipp; boolean_t port_policy_present = (sel_flags & SEL_PORT_POLICY); boolean_t is_icmp = (sel_flags & SEL_IS_ICMP); boolean_t tunnel_mode = (sel_flags & SEL_TUNNEL_MODE); @@ -2802,44 +2708,14 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, if (ip6h != NULL) { outer_hdr_len = prepended_length(mp, (uintptr_t)ip6h); - nexthdr = ip6h->ip6_nxt; - - /* - * The caller may have mistakenly assigned an ip6i_t as the - * ip6h for this packet, so take that corner-case into - * account. - */ - if (nexthdr == IPPROTO_RAW) { - ip6h++; - /* First check for bizarro split-mblk headers. */ - if ((uintptr_t)ip6h > (uintptr_t)mp->b_wptr || - ((uintptr_t)ip6h) + sizeof (ip6_t) > - (uintptr_t)mp->b_wptr) { - return (SELRET_BADPKT); - } - /* Next, see if ip6i is at the end of an mblk. */ - if (ip6h == (ip6_t *)mp->b_wptr) - ip6h = (ip6_t *)mp->b_cont->b_rptr; - - nexthdr = ip6h->ip6_nxt; - - /* - * Finally, if we haven't adjusted for ip6i, do so - * now. ip6i_t structs are prepended, so an ICMP - * or tunnel packet would just be overwritten. 
- */ - if (outer_hdr_len == 0) - outer_hdr_len = sizeof (ip6i_t); - } - icmp_proto = IPPROTO_ICMPV6; sel->ips_isv4 = B_FALSE; sel->ips_local_addr_v6 = ip6h->ip6_dst; sel->ips_remote_addr_v6 = ip6h->ip6_src; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL); switch (nexthdr) { case IPPROTO_HOPOPTS: @@ -2852,7 +2728,6 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, */ if ((spare_mp = msgpullup(mp, -1)) == NULL) return (SELRET_NOMEM); - if (!ip_hdr_length_nexthdr_v6(spare_mp, (ip6_t *)(spare_mp->b_rptr + outer_hdr_len), &hdr_len, &nexthdrp)) { @@ -2930,6 +2805,10 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, return (SELRET_SUCCESS); } +/* + * This is called with a b_next chain of messages from the fragcache code, + * hence it needs to discard a chain on error. + */ static boolean_t ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, int outer_hdr_len, ipsec_stack_t *ipss) @@ -2967,7 +2846,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, &hdr_len, &nexthdrp)) { /* Always works, even if NULL. */ ipsec_freemsg_chain(spare_mp); - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (B_FALSE); @@ -3005,7 +2884,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, */ if (spare_mp == NULL && (spare_mp = msgpullup(mp, -1)) == NULL) { - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (B_FALSE); @@ -3029,13 +2908,68 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, } /* + * Prepend an mblk with a ipsec_crypto_t to the message chain. + * Frees the argument and returns NULL should the allocation fail. 
+ * Returns the pointer to the crypto data part. + */ +mblk_t * +ipsec_add_crypto_data(mblk_t *data_mp, ipsec_crypto_t **icp) +{ + mblk_t *mp; + + mp = allocb(sizeof (ipsec_crypto_t), BPRI_MED); + if (mp == NULL) { + freemsg(data_mp); + return (NULL); + } + bzero(mp->b_rptr, sizeof (ipsec_crypto_t)); + mp->b_wptr += sizeof (ipsec_crypto_t); + mp->b_cont = data_mp; + mp->b_datap->db_type = M_EVENT; /* For ASSERT */ + *icp = (ipsec_crypto_t *)mp->b_rptr; + return (mp); +} + +/* + * Remove what was prepended above. Return b_cont and a pointer to the + * crypto data. + * The caller must call ipsec_free_crypto_data for mblk once it is done + * with the crypto data. + */ +mblk_t * +ipsec_remove_crypto_data(mblk_t *crypto_mp, ipsec_crypto_t **icp) +{ + ASSERT(crypto_mp->b_datap->db_type == M_EVENT); + ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t)); + + *icp = (ipsec_crypto_t *)crypto_mp->b_rptr; + return (crypto_mp->b_cont); +} + +/* + * Free what was prepended above. Return b_cont. + */ +mblk_t * +ipsec_free_crypto_data(mblk_t *crypto_mp) +{ + mblk_t *mp; + + ASSERT(crypto_mp->b_datap->db_type == M_EVENT); + ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t)); + + mp = crypto_mp->b_cont; + freeb(crypto_mp); + return (mp); +} + +/* * Create an ipsec_action_t based on the way an inbound packet was protected. * Used to reflect traffic back to a sender. * * We don't bother interning the action into the hash table. 
*/ ipsec_action_t * -ipsec_in_to_out_action(ipsec_in_t *ii) +ipsec_in_to_out_action(ip_recv_attr_t *ira) { ipsa_t *ah_assoc, *esp_assoc; uint_t auth_alg = 0, encr_alg = 0, espa_alg = 0; @@ -3057,10 +2991,12 @@ ipsec_in_to_out_action(ipsec_in_t *ii) */ ap->ipa_act.ipa_type = IPSEC_ACT_APPLY; ap->ipa_act.ipa_log = 0; - ah_assoc = ii->ipsec_in_ah_sa; + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + + ah_assoc = ira->ira_ipsec_ah_sa; ap->ipa_act.ipa_apply.ipp_use_ah = (ah_assoc != NULL); - esp_assoc = ii->ipsec_in_esp_sa; + esp_assoc = ira->ira_ipsec_esp_sa; ap->ipa_act.ipa_apply.ipp_use_esp = (esp_assoc != NULL); if (esp_assoc != NULL) { @@ -3074,7 +3010,8 @@ ipsec_in_to_out_action(ipsec_in_t *ii) ap->ipa_act.ipa_apply.ipp_encr_alg = (uint8_t)encr_alg; ap->ipa_act.ipa_apply.ipp_auth_alg = (uint8_t)auth_alg; ap->ipa_act.ipa_apply.ipp_esp_auth_alg = (uint8_t)espa_alg; - ap->ipa_act.ipa_apply.ipp_use_se = ii->ipsec_in_decaps; + ap->ipa_act.ipa_apply.ipp_use_se = + !!(ira->ira_flags & IRAF_IPSEC_DECAPS); unique = B_FALSE; if (esp_assoc != NULL) { @@ -3104,7 +3041,7 @@ ipsec_in_to_out_action(ipsec_in_t *ii) ap->ipa_act.ipa_apply.ipp_use_unique = unique; ap->ipa_want_unique = unique; ap->ipa_allow_clear = B_FALSE; - ap->ipa_want_se = ii->ipsec_in_decaps; + ap->ipa_want_se = !!(ira->ira_flags & IRAF_IPSEC_DECAPS); ap->ipa_want_ah = (ah_assoc != NULL); ap->ipa_want_esp = (esp_assoc != NULL); @@ -3500,13 +3437,14 @@ ipsec_sel_rel(ipsec_sel_t **spp, netstack_t *ns) * Free a policy rule which we know is no longer being referenced. 
*/ void -ipsec_policy_free(ipsec_policy_t *ipp, netstack_t *ns) +ipsec_policy_free(ipsec_policy_t *ipp) { ASSERT(ipp->ipsp_refs == 0); ASSERT(ipp->ipsp_sel != NULL); ASSERT(ipp->ipsp_act != NULL); + ASSERT(ipp->ipsp_netstack != NULL); - ipsec_sel_rel(&ipp->ipsp_sel, ns); + ipsec_sel_rel(&ipp->ipsp_sel, ipp->ipsp_netstack); IPACT_REFRELE(ipp->ipsp_act); kmem_cache_free(ipsec_pol_cache, ipp); } @@ -3544,6 +3482,7 @@ ipsec_policy_create(ipsec_selkey_t *keys, const ipsec_act_t *a, HASH_NULL(ipp, ipsp_hash); + ipp->ipsp_netstack = ns; /* Needed for ipsec_policy_free */ ipp->ipsp_refs = 1; /* caller's reference */ ipp->ipsp_sel = sp; ipp->ipsp_act = ap; @@ -3613,7 +3552,7 @@ ipsec_policy_delete(ipsec_policy_head_t *php, ipsec_selkey_t *keys, int dir, continue; } - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); php->iph_gen++; ipsec_update_present_flags(ns->netstack_ipsec); @@ -3664,7 +3603,7 @@ ipsec_policy_delete_index(ipsec_policy_head_t *php, uint64_t policy_index, break; } - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); found = B_TRUE; } @@ -3897,8 +3836,7 @@ ipsec_enter_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction, } static void -ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, - netstack_t *ns) +ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr) { ipsec_policy_t *ip, *nip; int af, chain, nchain; @@ -3906,7 +3844,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, for (af = 0; af < IPSEC_NAF; af++) { for (ip = ipr->ipr_nonhash[af]; ip != NULL; ip = nip) { nip = ip->ipsp_hash.hash_next; - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); } ipr->ipr_nonhash[af] = NULL; } @@ -3916,7 +3854,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, for (ip = ipr->ipr_hash[chain].hash_head; ip != NULL; ip = nip) { nip = ip->ipsp_hash.hash_next; - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); } ipr->ipr_hash[chain].hash_head = NULL; } @@ -3954,8 +3892,9 
@@ ipsec_polhead_flush(ipsec_policy_head_t *php, netstack_t *ns) ASSERT(RW_WRITE_HELD(&php->iph_lock)); for (dir = 0; dir < IPSEC_NTYPES; dir++) - ipsec_ipr_flush(php, &php->iph_root[dir], ns); + ipsec_ipr_flush(php, &php->iph_root[dir]); + php->iph_gen++; ipsec_update_present_flags(ns->netstack_ipsec); } @@ -4066,727 +4005,219 @@ ipsec_polhead_split(ipsec_policy_head_t *php, netstack_t *ns) * * NOTE2: This function is called by cleartext cases, so it needs to be * in IP proper. + * + * Note: the caller has moved other parts of ira into ixa already. */ boolean_t -ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, zoneid_t zoneid) -{ - ipsec_in_t *ii; - ipsec_out_t *io; - boolean_t v4; - mblk_t *mp; - boolean_t secure; - uint_t ifindex; +ipsec_in_to_out(ip_recv_attr_t *ira, ip_xmit_attr_t *ixa, mblk_t *data_mp, + ipha_t *ipha, ip6_t *ip6h) +{ ipsec_selector_t sel; - ipsec_action_t *reflect_action = NULL; - netstack_t *ns; - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); + ipsec_action_t *reflect_action = NULL; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; bzero((void*)&sel, sizeof (sel)); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - mp = ipsec_mp->b_cont; - ASSERT(mp != NULL); - - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* transfer reference.. */ - reflect_action = ii->ipsec_in_action; - ii->ipsec_in_action = NULL; - } else if (!ii->ipsec_in_loopback) - reflect_action = ipsec_in_to_out_action(ii); - secure = ii->ipsec_in_secure; - ifindex = ii->ipsec_in_ill_index; - ns = ii->ipsec_in_ns; - v4 = ii->ipsec_in_v4; - - ipsec_in_release_refs(ii); /* No netstack_rele/hold needed */ - - /* - * Use the global zone's id if we don't have a specific zone - * identified. This is likely to happen when the received packet's - * destination is a Trusted Extensions all-zones address. We did - * not copy the zoneid from ii->ipsec_in_zone id because that - * information represents the zoneid we started input processing - * with. 
The caller should have a better idea of which zone the - * received packet was destined for. - */ - - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; + reflect_action = ira->ira_ipsec_action; + ira->ira_ipsec_action = NULL; + } else if (!(ira->ira_flags & IRAF_LOOPBACK)) + reflect_action = ipsec_in_to_out_action(ira); /* * The caller is going to send the datagram out which might - * go on the wire or delivered locally through ip_wput_local. + * go on the wire or delivered locally through ire_send_local. * * 1) If it goes out on the wire, new associations will be * obtained. - * 2) If it is delivered locally, ip_wput_local will convert - * this IPSEC_OUT to a IPSEC_IN looking at the requests. + * 2) If it is delivered locally, ire_send_local will convert + * this ip_xmit_attr_t back to a ip_recv_attr_t looking at the + * requests. */ + ixa->ixa_ipsec_action = reflect_action; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - bzero(io, sizeof (ipsec_out_t)); - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_frtn.free_func = ipsec_out_free; - io->ipsec_out_frtn.free_arg = (char *)io; - io->ipsec_out_act = reflect_action; - - if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, - ns->netstack_ipsec)) + if (!ipsec_init_outbound_ports(&sel, data_mp, ipha, ip6h, 0, + ns->netstack_ipsec)) { + /* Note: data_mp already consumed and ip_drop_packet done */ return (B_FALSE); - - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_proto = sel.ips_protocol; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; + } + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_proto = sel.ips_protocol; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; /* * Don't use global policy for this, as we want * to use the same protection that was applied to 
the inbound packet. + * Thus we set IXAF_NO_IPSEC is it arrived in the clear to make + * it be sent in the clear. */ - io->ipsec_out_use_global_policy = B_FALSE; - io->ipsec_out_proc_begin = B_FALSE; - io->ipsec_out_secure = secure; - io->ipsec_out_v4 = v4; - io->ipsec_out_ill_index = ifindex; - io->ipsec_out_zoneid = zoneid; - io->ipsec_out_ns = ns; /* No netstack_hold */ + if (ira->ira_flags & IRAF_IPSEC_SECURE) + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + else + ixa->ixa_flags |= IXAF_NO_IPSEC; return (B_TRUE); } -mblk_t * -ipsec_in_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns) -{ - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - ipsec_in_t *nii; - mblk_t *nmp; - frtn_t nfrtn; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ASSERT(ii->ipsec_in_len == sizeof (ipsec_in_t)); - - nmp = ipsec_in_alloc(ii->ipsec_in_v4, ns); - if (nmp == NULL) { - ip_drop_packet_chain(cont, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - - ASSERT(nmp->b_datap->db_type == M_CTL); - ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t))); - - /* - * Bump refcounts. - */ - if (ii->ipsec_in_ah_sa != NULL) - IPSA_REFHOLD(ii->ipsec_in_ah_sa); - if (ii->ipsec_in_esp_sa != NULL) - IPSA_REFHOLD(ii->ipsec_in_esp_sa); - if (ii->ipsec_in_policy != NULL) - IPPH_REFHOLD(ii->ipsec_in_policy); - - /* - * Copy everything, but preserve the free routine provided by - * ipsec_in_alloc(). 
- */ - nii = (ipsec_in_t *)nmp->b_rptr; - nfrtn = nii->ipsec_in_frtn; - bcopy(ii, nii, sizeof (*ii)); - nii->ipsec_in_frtn = nfrtn; - - nmp->b_cont = cont; - - return (nmp); -} - -mblk_t * -ipsec_out_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns) -{ - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - ipsec_out_t *nio; - mblk_t *nmp; - frtn_t nfrtn; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - - nmp = ipsec_alloc_ipsec_out(ns); - if (nmp == NULL) { - ip_drop_packet_chain(cont, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ASSERT(nmp->b_datap->db_type == M_CTL); - ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t))); - - /* - * Bump refcounts. - */ - if (io->ipsec_out_ah_sa != NULL) - IPSA_REFHOLD(io->ipsec_out_ah_sa); - if (io->ipsec_out_esp_sa != NULL) - IPSA_REFHOLD(io->ipsec_out_esp_sa); - if (io->ipsec_out_polhead != NULL) - IPPH_REFHOLD(io->ipsec_out_polhead); - if (io->ipsec_out_policy != NULL) - IPPOL_REFHOLD(io->ipsec_out_policy); - if (io->ipsec_out_act != NULL) - IPACT_REFHOLD(io->ipsec_out_act); - if (io->ipsec_out_latch != NULL) - IPLATCH_REFHOLD(io->ipsec_out_latch); - if (io->ipsec_out_cred != NULL) - crhold(io->ipsec_out_cred); - - /* - * Copy everything, but preserve the free routine provided by - * ipsec_alloc_ipsec_out(). 
- */ - nio = (ipsec_out_t *)nmp->b_rptr; - nfrtn = nio->ipsec_out_frtn; - bcopy(io, nio, sizeof (*io)); - nio->ipsec_out_frtn = nfrtn; - - nmp->b_cont = cont; - - return (nmp); -} - -static void -ipsec_out_release_refs(ipsec_out_t *io) +void +ipsec_out_release_refs(ip_xmit_attr_t *ixa) { - netstack_t *ns = io->ipsec_out_ns; - - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - ASSERT(io->ipsec_out_ns != NULL); + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) + return; - /* Note: IPSA_REFRELE is multi-line macro */ - if (io->ipsec_out_ah_sa != NULL) - IPSA_REFRELE(io->ipsec_out_ah_sa); - if (io->ipsec_out_esp_sa != NULL) - IPSA_REFRELE(io->ipsec_out_esp_sa); - if (io->ipsec_out_polhead != NULL) - IPPH_REFRELE(io->ipsec_out_polhead, ns); - if (io->ipsec_out_policy != NULL) - IPPOL_REFRELE(io->ipsec_out_policy, ns); - if (io->ipsec_out_act != NULL) - IPACT_REFRELE(io->ipsec_out_act); - if (io->ipsec_out_cred != NULL) { - crfree(io->ipsec_out_cred); - io->ipsec_out_cred = NULL; + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - if (io->ipsec_out_latch) { - IPLATCH_REFRELE(io->ipsec_out_latch, ns); - io->ipsec_out_latch = NULL; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; } -} - -static void -ipsec_out_free(void *arg) -{ - ipsec_out_t *io = (ipsec_out_t *)arg; - ipsec_out_release_refs(io); - kmem_cache_free(ipsec_info_cache, arg); -} - -static void -ipsec_in_release_refs(ipsec_in_t *ii) -{ - netstack_t *ns = ii->ipsec_in_ns; - - ASSERT(ii->ipsec_in_ns != NULL); - - /* Note: IPSA_REFRELE is multi-line macro */ - if (ii->ipsec_in_ah_sa != NULL) - IPSA_REFRELE(ii->ipsec_in_ah_sa); - if (ii->ipsec_in_esp_sa != NULL) - IPSA_REFRELE(ii->ipsec_in_esp_sa); - if (ii->ipsec_in_policy != NULL) - IPPH_REFRELE(ii->ipsec_in_policy, ns); - if (ii->ipsec_in_da != NULL) { - freeb(ii->ipsec_in_da); - ii->ipsec_in_da = NULL; 
+ if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; } -} - -static void -ipsec_in_free(void *arg) -{ - ipsec_in_t *ii = (ipsec_in_t *)arg; - ipsec_in_release_refs(ii); - kmem_cache_free(ipsec_info_cache, arg); -} - -/* - * This is called only for outbound datagrams if the datagram needs to - * go out secure. A NULL mp can be passed to get an ipsec_out. This - * facility is used by ip_unbind. - * - * NOTE : o As the data part could be modified by ipsec_out_process etc. - * we can't make it fast by calling a dup. - */ -mblk_t * -ipsec_alloc_ipsec_out(netstack_t *ns) -{ - mblk_t *ipsec_mp; - ipsec_out_t *io = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP); - - if (io == NULL) - return (NULL); - - bzero(io, sizeof (ipsec_out_t)); - - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_frtn.free_func = ipsec_out_free; - io->ipsec_out_frtn.free_arg = (char *)io; - - /* - * Set the zoneid to ALL_ZONES which is used as an invalid value. Code - * using ipsec_out_zoneid should assert that the zoneid has been set to - * a sane value. - */ - io->ipsec_out_zoneid = ALL_ZONES; - io->ipsec_out_ns = ns; /* No netstack_hold */ - - ipsec_mp = desballoc((uint8_t *)io, sizeof (ipsec_info_t), BPRI_HI, - &io->ipsec_out_frtn); - if (ipsec_mp == NULL) { - ipsec_out_free(io); - - return (NULL); + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; } - ipsec_mp->b_datap->db_type = M_CTL; - ipsec_mp->b_wptr = ipsec_mp->b_rptr + sizeof (ipsec_info_t); - - return (ipsec_mp); -} - -/* - * Attach an IPSEC_OUT; use pol for policy if it is non-null. - * Otherwise initialize using conn. - * - * If pol is non-null, we consume a reference to it. 
- */ -mblk_t * -ipsec_attach_ipsec_out(mblk_t **mp, conn_t *connp, ipsec_policy_t *pol, - uint8_t proto, netstack_t *ns) -{ - mblk_t *ipsec_mp; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT((pol != NULL) || (connp != NULL)); - - ipsec_mp = ipsec_alloc_ipsec_out(ns); - if (ipsec_mp == NULL) { - ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_NOTE, - "ipsec_attach_ipsec_out: Allocation failure\n"); - ip_drop_packet(*mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - *mp = NULL; - return (NULL); + if (ixa->ixa_ipsec_latch) { + IPLATCH_REFRELE(ixa->ixa_ipsec_latch); + ixa->ixa_ipsec_latch = NULL; } - ipsec_mp->b_cont = *mp; - /* - * If *mp is NULL, ipsec_init_ipsec_out() won't/should not be using it. - */ - return (ipsec_init_ipsec_out(ipsec_mp, mp, connp, pol, proto, ns)); + /* Clear the soft references to the SAs */ + ixa->ixa_ipsec_ref[0].ipsr_sa = NULL; + ixa->ixa_ipsec_ref[0].ipsr_bucket = NULL; + ixa->ixa_ipsec_ref[0].ipsr_gen = 0; + ixa->ixa_ipsec_ref[1].ipsr_sa = NULL; + ixa->ixa_ipsec_ref[1].ipsr_bucket = NULL; + ixa->ixa_ipsec_ref[1].ipsr_gen = 0; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } -/* - * Initialize the IPSEC_OUT (ipsec_mp) using pol if it is non-null. - * Otherwise initialize using conn. - * - * If pol is non-null, we consume a reference to it. 
- */ -mblk_t * -ipsec_init_ipsec_out(mblk_t *ipsec_mp, mblk_t **mp, conn_t *connp, - ipsec_policy_t *pol, uint8_t proto, netstack_t *ns) +void +ipsec_in_release_refs(ip_recv_attr_t *ira) { - ipsec_out_t *io; - ipsec_policy_t *p; - ipha_t *ipha; - ip6_t *ip6h; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(ipsec_mp->b_cont == *mp); - - ASSERT((pol != NULL) || (connp != NULL)); - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ASSERT(ipsec_mp->b_wptr == (ipsec_mp->b_rptr + sizeof (ipsec_info_t))); - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - io->ipsec_out_latch = NULL; - /* - * Set the zoneid when we have the connp. - * Otherwise, we're called from ip_wput_attach_policy() who will take - * care of setting the zoneid. - */ - if (connp != NULL) - io->ipsec_out_zoneid = connp->conn_zoneid; - - io->ipsec_out_ns = ns; /* No netstack_hold */ - - if (*mp != NULL) { - ipha = (ipha_t *)(*mp)->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - io->ipsec_out_v4 = B_TRUE; - ip6h = NULL; - } else { - io->ipsec_out_v4 = B_FALSE; - ip6h = (ip6_t *)ipha; - ipha = NULL; - } - } else { - ASSERT(connp != NULL && connp->conn_policy_cached); - ip6h = NULL; - ipha = NULL; - io->ipsec_out_v4 = !connp->conn_pkt_isv6; - } - - p = NULL; - - /* - * Take latched policies over global policy. Check here again for - * this, in case we had conn_latch set while the packet was flying - * around in IP. 
- */ - if (connp != NULL && connp->conn_latch != NULL) { - ASSERT(ns == connp->conn_netstack); - p = connp->conn_latch->ipl_out_policy; - io->ipsec_out_latch = connp->conn_latch; - IPLATCH_REFHOLD(connp->conn_latch); - if (p != NULL) { - IPPOL_REFHOLD(p); - } - io->ipsec_out_src_port = connp->conn_lport; - io->ipsec_out_dst_port = connp->conn_fport; - io->ipsec_out_icmp_type = io->ipsec_out_icmp_code = 0; - if (pol != NULL) - IPPOL_REFRELE(pol, ns); - } else if (pol != NULL) { - ipsec_selector_t sel; - - bzero((void*)&sel, sizeof (sel)); - - p = pol; - /* - * conn does not have the port information. Get - * it from the packet. - */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) + return; - if (!ipsec_init_outbound_ports(&sel, *mp, ipha, ip6h, 0, - ns->netstack_ipsec)) { - /* Callee did ip_drop_packet() on *mp. */ - *mp = NULL; - freeb(ipsec_mp); - return (NULL); - } - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; + if (ira->ira_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_ah_sa); + ira->ira_ipsec_ah_sa = NULL; } - - io->ipsec_out_proto = proto; - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_secure = (p != NULL); - io->ipsec_out_policy = p; - - if (p == NULL) { - if (connp->conn_policy != NULL) { - io->ipsec_out_secure = B_TRUE; - ASSERT(io->ipsec_out_latch == NULL); - ASSERT(io->ipsec_out_use_global_policy == B_TRUE); - io->ipsec_out_need_policy = B_TRUE; - ASSERT(io->ipsec_out_polhead == NULL); - IPPH_REFHOLD(connp->conn_policy); - io->ipsec_out_polhead = connp->conn_policy; - } - } else { - /* Handle explicit drop action. 
*/ - if (p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_DISCARD || - p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_REJECT) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_explicit), - &ipss->ipsec_spd_dropper); - *mp = NULL; - ipsec_mp = NULL; - } + if (ira->ira_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_esp_sa); + ira->ira_ipsec_esp_sa = NULL; } - - return (ipsec_mp); + ira->ira_flags &= ~IRAF_IPSEC_SECURE; } /* - * Allocate an IPSEC_IN mblk. This will be prepended to an inbound datagram - * and keep track of what-if-any IPsec processing will be applied to the - * datagram. - */ -mblk_t * -ipsec_in_alloc(boolean_t isv4, netstack_t *ns) -{ - mblk_t *ipsec_in; - ipsec_in_t *ii = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP); - - if (ii == NULL) - return (NULL); - - bzero(ii, sizeof (ipsec_info_t)); - ii->ipsec_in_type = IPSEC_IN; - ii->ipsec_in_len = sizeof (ipsec_in_t); - - ii->ipsec_in_v4 = isv4; - ii->ipsec_in_secure = B_TRUE; - ii->ipsec_in_ns = ns; /* No netstack_hold */ - ii->ipsec_in_stackid = ns->netstack_stackid; - - ii->ipsec_in_frtn.free_func = ipsec_in_free; - ii->ipsec_in_frtn.free_arg = (char *)ii; - - ii->ipsec_in_zoneid = ALL_ZONES; /* default for received packets */ - - ipsec_in = desballoc((uint8_t *)ii, sizeof (ipsec_info_t), BPRI_HI, - &ii->ipsec_in_frtn); - if (ipsec_in == NULL) { - ip1dbg(("ipsec_in_alloc: IPSEC_IN allocation failure.\n")); - ipsec_in_free(ii); - return (NULL); - } - - ipsec_in->b_datap->db_type = M_CTL; - ipsec_in->b_wptr += sizeof (ipsec_info_t); - - return (ipsec_in); -} - -/* - * This is called from ip_wput_local when a packet which needs - * security is looped back, to convert the IPSEC_OUT to a IPSEC_IN - * before fanout, where the policy check happens. In most of the - * cases, IPSEC processing has *never* been done. There is one case - * (ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed) where - * the packet is destined for localhost, IPSEC processing has already - * been done. 
+ * This is called from ire_send_local when a packet + * is looped back. We setup the ip_recv_attr_t "borrowing" the references + * held by the callers. + * Note that we don't do any IPsec but we carry the actions and IPSEC flags + * across so that the fanout policy checks see that IPsec was applied. * - * Future: This could happen after SA selection has occurred for - * outbound.. which will tell us who the src and dst identities are.. - * Then it's just a matter of splicing the ah/esp SA pointers from the - * ipsec_out_t to the ipsec_in_t. + * The caller should do ipsec_in_release_refs() on the ira by calling + * ira_cleanup(). */ void -ipsec_out_to_in(mblk_t *ipsec_mp) +ipsec_out_to_in(ip_xmit_attr_t *ixa, ill_t *ill, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - ipsec_out_t *io; ipsec_policy_t *pol; ipsec_action_t *act; - boolean_t v4, icmp_loopback; - zoneid_t zoneid; - netstack_t *ns; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); + /* Non-IPsec operations */ + ira->ira_free_flags = 0; + ira->ira_zoneid = ixa->ixa_zoneid; + ira->ira_cred = ixa->ixa_cred; + ira->ira_cpid = ixa->ixa_cpid; + ira->ira_tsl = ixa->ixa_tsl; + ira->ira_ill = ira->ira_rill = ill; + ira->ira_flags = ixa->ixa_flags & IAF_MASK; + ira->ira_no_loop_zoneid = ixa->ixa_no_loop_zoneid; + ira->ira_pktlen = ixa->ixa_pktlen; + ira->ira_ip_hdr_length = ixa->ixa_ip_hdr_length; + ira->ira_protocol = ixa->ixa_protocol; + ira->ira_mhip = NULL; + + ira->ira_flags |= IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; + + ira->ira_sqp = ixa->ixa_sqp; + ira->ira_ring = NULL; + + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + ira->ira_rifindex = ira->ira_ruifindex; + + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) + return; - io = (ipsec_out_t *)ipsec_mp->b_rptr; + ira->ira_flags |= IRAF_IPSEC_SECURE; - v4 = io->ipsec_out_v4; - zoneid = io->ipsec_out_zoneid; - icmp_loopback = io->ipsec_out_icmp_loopback; - ns = io->ipsec_out_ns; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; - act = io->ipsec_out_act; 
+ act = ixa->ixa_ipsec_action; if (act == NULL) { - pol = io->ipsec_out_policy; + pol = ixa->ixa_ipsec_policy; if (pol != NULL) { act = pol->ipsp_act; IPACT_REFHOLD(act); } } - io->ipsec_out_act = NULL; - - ipsec_out_release_refs(io); /* No netstack_rele/hold needed */ - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - bzero(ii, sizeof (ipsec_in_t)); - ii->ipsec_in_type = IPSEC_IN; - ii->ipsec_in_len = sizeof (ipsec_in_t); - ii->ipsec_in_loopback = B_TRUE; - ii->ipsec_in_ns = ns; /* No netstack_hold */ - - ii->ipsec_in_frtn.free_func = ipsec_in_free; - ii->ipsec_in_frtn.free_arg = (char *)ii; - ii->ipsec_in_action = act; - ii->ipsec_in_zoneid = zoneid; - - /* - * In most of the cases, we can't look at the ipsec_out_XXX_sa - * because this never went through IPSEC processing. So, look at - * the requests and infer whether it would have gone through - * IPSEC processing or not. Initialize the "done" fields with - * the requests. The possible values for "done" fields are : - * - * 1) zero, indicates that a particular preference was never - * requested. - * 2) non-zero, indicates that it could be IPSEC_PREF_REQUIRED/ - * IPSEC_PREF_NEVER. If IPSEC_REQ_DONE is set, it means that - * IPSEC processing has been completed. - */ - ii->ipsec_in_secure = B_TRUE; - ii->ipsec_in_v4 = v4; - ii->ipsec_in_icmp_loopback = icmp_loopback; + ixa->ixa_ipsec_action = NULL; + ira->ira_ipsec_action = act; } /* - * Consults global policy to see whether this datagram should - * go out secure. If so it attaches a ipsec_mp in front and - * returns. + * Consults global policy and per-socket policy to see whether this datagram + * should go out secure. If so it updates the ip_xmit_attr_t + * Should not be used when connecting, since then we want to latch the policy. + * + * If connp is NULL we just look at the global policy. + * + * Returns NULL if the packet was dropped, in which case the MIB has + * been incremented and ip_drop_packet done. 
*/ mblk_t * -ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, - conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) +ip_output_attach_policy(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + const conn_t *connp, ip_xmit_attr_t *ixa) { - mblk_t *mp; - ipsec_out_t *io = NULL; ipsec_selector_t sel; - uint_t ill_index; - boolean_t conn_dontroutex; - boolean_t conn_multicast_loopx; - boolean_t policy_present; - ip_stack_t *ipst = ire->ire_ipst; + boolean_t policy_present; + ip_stack_t *ipst = ixa->ixa_ipst; netstack_t *ns = ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsec_policy_t *p; + ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen; ASSERT((ipha != NULL && ip6h == NULL) || (ip6h != NULL && ipha == NULL)); - bzero((void*)&sel, sizeof (sel)); - if (ipha != NULL) policy_present = ipss->ipsec_outbound_v4_policy_present; else policy_present = ipss->ipsec_outbound_v6_policy_present; - /* - * Fast Path to see if there is any policy. - */ - if (!policy_present) { - if (ipsec_mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * If there is no global policy and ip_wput - * or ip_wput_multicast has attached this mp - * for multicast case, free the ipsec_mp and - * return the original mp. - */ - mp = ipsec_mp->b_cont; - freeb(ipsec_mp); - ipsec_mp = mp; - io = NULL; - } - ASSERT(io == NULL || !io->ipsec_out_tunnel); - } - if (((io == NULL) || (io->ipsec_out_polhead == NULL)) && - ((connp == NULL) || (connp->conn_policy == NULL))) - return (ipsec_mp); - } - ill_index = 0; - conn_multicast_loopx = conn_dontroutex = B_FALSE; - mp = ipsec_mp; - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - /* - * This is a connection where we have some per-socket - * policy or ip_wput has attached an ipsec_mp for - * the multicast datagram. 
- */ - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * This ipsec_mp was allocated in ip_wput or - * ip_wput_multicast so that we will know the - * value of ill_index, conn_dontroute, - * conn_multicast_loop in the multicast case if - * we inherit global policy here. - */ - ill_index = io->ipsec_out_ill_index; - conn_dontroutex = io->ipsec_out_dontroute; - conn_multicast_loopx = io->ipsec_out_multicast_loop; - freeb(ipsec_mp); - ipsec_mp = mp; - io = NULL; - } - ASSERT(io == NULL || !io->ipsec_out_tunnel); - } + if (!policy_present && (connp == NULL || connp->conn_policy == NULL)) + return (mp); + + bzero((void*)&sel, sizeof (sel)); if (ipha != NULL) { - sel.ips_local_addr_v4 = (ipha->ipha_src != 0 ? - ipha->ipha_src : ire->ire_src_addr); + sel.ips_local_addr_v4 = ipha->ipha_src; sel.ips_remote_addr_v4 = ip_get_dst(ipha); - sel.ips_protocol = (uint8_t)ipha->ipha_protocol; sel.ips_isv4 = B_TRUE; } else { - ushort_t hdr_len; - uint8_t *nexthdrp; - boolean_t is_fragment; - sel.ips_isv4 = B_FALSE; - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - if (!unspec_src) - sel.ips_local_addr_v6 = ire->ire_src_addr_v6; - } else { - sel.ips_local_addr_v6 = ip6h->ip6_src; - } - - sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, &is_fragment); - if (is_fragment) { - /* - * It's a packet fragment for a packet that - * we have already processed (since IPsec processing - * is done before fragmentation), so we don't - * have to do policy checks again. Fragments can - * come back to us for processing if they have - * been queued up due to flow control. - */ - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - freeb(ipsec_mp); - ipsec_mp = mp; - } - return (ipsec_mp); - } - - /* IPv6 common-case. 
*/ - sel.ips_protocol = ip6h->ip6_nxt; - switch (ip6h->ip6_nxt) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ICMPV6: - break; - default: - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_len, &nexthdrp)) { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutDiscards); - freemsg(ipsec_mp); /* Not IPsec-related drop. */ - return (NULL); - } - sel.ips_protocol = *nexthdrp; - break; - } + sel.ips_local_addr_v6 = ip6h->ip6_src; + sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, NULL); } + sel.ips_protocol = ixa->ixa_protocol; if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, ipss)) { if (ipha != NULL) { @@ -4794,65 +4225,36 @@ ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, } else { BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); } - - /* Callee dropped the packet. */ + /* Note: mp already consumed and ip_drop_packet done */ return (NULL); } - if (io != NULL) { - /* - * We seem to have some local policy (we already have - * an ipsec_out). Look at global policy and see - * whether we have to inherit or not. - */ - io->ipsec_out_need_policy = B_FALSE; - ipsec_mp = ipsec_apply_global_policy(ipsec_mp, connp, - &sel, ns); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - ASSERT(io->ipsec_out_need_policy == B_FALSE); - return (ipsec_mp); + ASSERT(ixa->ixa_ipsec_policy == NULL); + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + ixa->ixa_ipsec_policy = p; + if (p != NULL) { + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp == NULL || connp->conn_policy == NULL) + ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } - /* - * We pass in a pointer to a pointer because mp can become - * NULL due to allocation failures or explicit drops. Callers - * of this function should assume a NULL mp means the packet - * was dropped. 
- */ - ipsec_mp = ipsec_attach_global_policy(&mp, connp, &sel, ns); - if (ipsec_mp == NULL) - return (mp); /* * Copy the right port information. */ - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; - /* - * Set ill_index, conn_dontroute and conn_multicast_loop - * for multicast datagrams. - */ - io->ipsec_out_ill_index = ill_index; - io->ipsec_out_dontroute = conn_dontroutex; - io->ipsec_out_multicast_loop = conn_multicast_loopx; - - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - return (ipsec_mp); + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + ixa->ixa_ipsec_proto = sel.ips_protocol; + return (mp); } /* * When appropriate, this function caches inbound and outbound policy - * for this connection. + * for this connection. The outbound policy is stored in conn_ixa. + * Note that it can not be used for SCTP since conn_faddr isn't set for SCTP. * * XXX need to work out more details about per-interface policy and * caching here! @@ -4866,20 +4268,38 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) netstack_t *ns = connp->conn_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; + connp->conn_ixa->ixa_ipsec_policy_gen = + ipss->ipsec_system_policy.iph_gen; /* * There is no policy latching for ICMP sockets because we can't * decide on which policy to use until we see the packet and get * type/code selectors. 
*/ - if (connp->conn_ulp == IPPROTO_ICMP || - connp->conn_ulp == IPPROTO_ICMPV6) { + if (connp->conn_proto == IPPROTO_ICMP || + connp->conn_proto == IPPROTO_ICMPV6) { connp->conn_in_enforce_policy = connp->conn_out_enforce_policy = B_TRUE; if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, ns); + IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; } - connp->conn_flags |= IPCL_CHECK_POLICY; + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy); + connp->conn_ixa->ixa_ipsec_policy = NULL; + } + if (connp->conn_ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action); + connp->conn_ixa->ixa_ipsec_action = NULL; + } + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; return (0); } @@ -4898,38 +4318,57 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) return (ENOMEM); } - sel.ips_protocol = connp->conn_ulp; + bzero((void*)&sel, sizeof (sel)); + + sel.ips_protocol = connp->conn_proto; sel.ips_local_port = connp->conn_lport; sel.ips_remote_port = connp->conn_fport; sel.ips_is_icmp_inv_acq = 0; sel.ips_isv4 = isv4; if (isv4) { - sel.ips_local_addr_v4 = connp->conn_src; - sel.ips_remote_addr_v4 = connp->conn_rem; + sel.ips_local_addr_v4 = connp->conn_laddr_v4; + sel.ips_remote_addr_v4 = connp->conn_faddr_v4; } else { - sel.ips_local_addr_v6 = connp->conn_srcv6; - sel.ips_remote_addr_v6 = connp->conn_remv6; + sel.ips_local_addr_v6 = connp->conn_laddr_v6; + sel.ips_remote_addr_v6 = connp->conn_faddr_v6; } - p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel, - ns); - if (connp->conn_latch->ipl_in_policy != NULL) - IPPOL_REFRELE(connp->conn_latch->ipl_in_policy, ns); - 
connp->conn_latch->ipl_in_policy = p; + p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns); + if (connp->conn_latch_in_policy != NULL) + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = p; connp->conn_in_enforce_policy = (p != NULL); - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, &sel, - ns); - if (connp->conn_latch->ipl_out_policy != NULL) - IPPOL_REFRELE(connp->conn_latch->ipl_out_policy, ns); - connp->conn_latch->ipl_out_policy = p; + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + if (connp->conn_ixa->ixa_ipsec_policy != NULL) + IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy); + connp->conn_ixa->ixa_ipsec_policy = p; connp->conn_out_enforce_policy = (p != NULL); - + if (p != NULL) { + connp->conn_ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp->conn_policy == NULL) { + connp->conn_ixa->ixa_flags |= + IXAF_IPSEC_GLOBAL_POLICY; + } + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } /* Clear the latched actions too, in case we're recaching. 
*/ - if (connp->conn_latch->ipl_out_action != NULL) - IPACT_REFRELE(connp->conn_latch->ipl_out_action); - if (connp->conn_latch->ipl_in_action != NULL) - IPACT_REFRELE(connp->conn_latch->ipl_in_action); + if (connp->conn_ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action); + connp->conn_ixa->ixa_ipsec_action = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + connp->conn_ixa->ixa_ipsec_src_port = sel.ips_local_port; + connp->conn_ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + connp->conn_ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + connp->conn_ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + connp->conn_ixa->ixa_ipsec_proto = sel.ips_protocol; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } /* @@ -4945,28 +4384,125 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) * global policy (because conn_policy_cached is already set). */ connp->conn_policy_cached = B_TRUE; - if (connp->conn_in_enforce_policy) - connp->conn_flags |= IPCL_CHECK_POLICY; return (0); } +/* + * When appropriate, this function caches outbound policy for faddr/fport. + * It is used when we are not connected i.e., when we can not latch the + * policy. 
+ */ void -iplatch_free(ipsec_latch_t *ipl, netstack_t *ns) -{ - if (ipl->ipl_out_policy != NULL) - IPPOL_REFRELE(ipl->ipl_out_policy, ns); - if (ipl->ipl_in_policy != NULL) - IPPOL_REFRELE(ipl->ipl_in_policy, ns); - if (ipl->ipl_in_action != NULL) - IPACT_REFRELE(ipl->ipl_in_action); - if (ipl->ipl_out_action != NULL) - IPACT_REFRELE(ipl->ipl_out_action); +ipsec_cache_outbound_policy(const conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, in_port_t dstport, ip_xmit_attr_t *ixa) +{ + boolean_t isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0; + boolean_t global_policy_present; + netstack_t *ns = connp->conn_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen; + + /* + * There is no policy caching for ICMP sockets because we can't + * decide on which policy to use until we see the packet and get + * type/code selectors. + */ + if (connp->conn_proto == IPPROTO_ICMP || + connp->conn_proto == IPPROTO_ICMPV6) { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + } + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + return; + } + + global_policy_present = isv4 ? 
+ (ipss->ipsec_outbound_v4_policy_present || + ipss->ipsec_inbound_v4_policy_present) : + (ipss->ipsec_outbound_v6_policy_present || + ipss->ipsec_inbound_v6_policy_present); + + if ((connp->conn_policy != NULL) || global_policy_present) { + ipsec_selector_t sel; + ipsec_policy_t *p; + + bzero((void*)&sel, sizeof (sel)); + + sel.ips_protocol = connp->conn_proto; + sel.ips_local_port = connp->conn_lport; + sel.ips_remote_port = dstport; + sel.ips_is_icmp_inv_acq = 0; + sel.ips_isv4 = isv4; + if (isv4) { + IN6_V4MAPPED_TO_IPADDR(v6src, sel.ips_local_addr_v4); + IN6_V4MAPPED_TO_IPADDR(v6dst, sel.ips_remote_addr_v4); + } else { + sel.ips_local_addr_v6 = *v6src; + sel.ips_remote_addr_v6 = *v6dst; + } + + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + if (ixa->ixa_ipsec_policy != NULL) + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = p; + if (p != NULL) { + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp->conn_policy == NULL) + ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } + /* Clear the latched actions too, in case we're recaching. */ + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + ixa->ixa_ipsec_proto = sel.ips_protocol; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + } + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + } +} + +/* + * Returns B_FALSE if the policy has gone stale. 
+ */ +boolean_t +ipsec_outbound_policy_current(ip_xmit_attr_t *ixa) +{ + ipsec_stack_t *ipss = ixa->ixa_ipst->ips_netstack->netstack_ipsec; + + if (!(ixa->ixa_flags & IXAF_IPSEC_GLOBAL_POLICY)) + return (B_TRUE); + + return (ixa->ixa_ipsec_policy_gen == ipss->ipsec_system_policy.iph_gen); +} + +void +iplatch_free(ipsec_latch_t *ipl) +{ if (ipl->ipl_local_cid != NULL) IPSID_REFRELE(ipl->ipl_local_cid); if (ipl->ipl_remote_cid != NULL) IPSID_REFRELE(ipl->ipl_remote_cid); - if (ipl->ipl_local_id != NULL) - crfree(ipl->ipl_local_id); mutex_destroy(&ipl->ipl_lock); kmem_free(ipl, sizeof (*ipl)); } @@ -5622,18 +5158,19 @@ ipsec_unregister_prov_update(void) * SAs are available. If there's no per-tunnel policy, or a match comes back * with no match, then still return the packet and have global policy take * a crack at it in IP. + * This updates the ip_xmit_attr with the IPsec policy. * * Remember -> we can be forwarding packets. Keep that in mind w.r.t. * inner-packet contents. */ mblk_t * ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, - ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len) + ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len, + ip_xmit_attr_t *ixa) { ipsec_policy_head_t *polhead; ipsec_selector_t sel; - mblk_t *ipsec_mp, *ipsec_mp_head, *nmp; - ipsec_out_t *io; + mblk_t *nmp; boolean_t is_fragment; ipsec_policy_t *pol; ipsec_tun_pol_t *itp = iptun->iptun_itp; @@ -5644,6 +5181,15 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, outer_ipv4 != NULL && outer_ipv6 == NULL); /* We take care of inners in a bit. */ + /* Are the IPsec fields initialized at all? 
*/ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + ASSERT(ixa->ixa_ipsec_latch == NULL); + ASSERT(ixa->ixa_ipsec_action == NULL); + ASSERT(ixa->ixa_ipsec_ah_sa == NULL); + ASSERT(ixa->ixa_ipsec_esp_sa == NULL); + } + ASSERT(itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)); polhead = itp->itp_policy; @@ -5675,7 +5221,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, if (mp->b_cont != NULL) { nmp = msgpullup(mp, -1); if (nmp == NULL) { - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (NULL); @@ -5734,8 +5280,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, ip6h = (ip6_t *)mp->b_rptr; if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip6_hdr_length, &v6_proto_p)) { - ip_drop_packet_chain(mp, B_FALSE, - NULL, NULL, DROPPER(ipss, + ip_drop_packet_chain(mp, B_FALSE, NULL, + DROPPER(ipss, ipds_spd_malformed_packet), &ipss->ipsec_spd_dropper); return (NULL); @@ -5761,8 +5307,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst; if (!ip_hdr_length_nexthdr_v6(mp, inner_ipv6, &ip6_hdr_length, &v6_proto_p)) { - ip_drop_packet_chain(mp, B_FALSE, - NULL, NULL, DROPPER(ipss, + ip_drop_packet_chain(mp, B_FALSE, NULL, + DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); return (NULL); @@ -5802,8 +5348,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, /* Success so far! 
*/ } rw_enter(&polhead->iph_lock, RW_READER); - pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, - &sel, ns); + pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, &sel); rw_exit(&polhead->iph_lock); if (pol == NULL) { /* @@ -5825,7 +5370,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, cmn_err(CE_WARN, "ipsec_tun_outbound(): No matching tunnel " "per-port policy\n"); #endif - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); return (NULL); @@ -5835,101 +5380,65 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, cmn_err(CE_WARN, "Having matching tunnel per-port policy\n"); #endif - /* Construct an IPSEC_OUT message. */ - ipsec_mp = ipsec_mp_head = ipsec_alloc_ipsec_out(ns); - if (ipsec_mp == NULL) { - IPPOL_REFRELE(pol, ns); - ip_drop_packet(mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ipsec_mp->b_cont = mp; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - IPPH_REFHOLD(polhead); /* - * NOTE: free() function of ipsec_out mblk will release polhead and - * pol references. + * NOTE: ixa_cleanup() function will release pol references. */ - io->ipsec_out_polhead = polhead; - io->ipsec_out_policy = pol; + ixa->ixa_ipsec_policy = pol; /* * NOTE: There is a subtle difference between iptun_zoneid and * iptun_connp->conn_zoneid explained in iptun_conn_create(). When * interacting with the ip module, we must use conn_zoneid. */ - io->ipsec_out_zoneid = iptun->iptun_connp->conn_zoneid; - io->ipsec_out_v4 = (outer_ipv4 != NULL); - io->ipsec_out_secure = B_TRUE; + ixa->ixa_zoneid = iptun->iptun_connp->conn_zoneid; + + ASSERT((outer_ipv4 != NULL) ? 
(ixa->ixa_flags & IXAF_IS_IPV4) : + !(ixa->ixa_flags & IXAF_IS_IPV4)); + ASSERT(ixa->ixa_ipsec_policy != NULL); + ixa->ixa_flags |= IXAF_IPSEC_SECURE; if (!(itp->itp_flags & ITPF_P_TUNNEL)) { /* Set up transport mode for tunnelled packets. */ - io->ipsec_out_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP : + ixa->ixa_ipsec_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP : IPPROTO_IPV6; - return (ipsec_mp); + return (mp); } /* Fill in tunnel-mode goodies here. */ - io->ipsec_out_tunnel = B_TRUE; + ixa->ixa_flags |= IXAF_IPSEC_TUNNEL; /* XXX Do I need to fill in all of the goodies here? */ if (inner_ipv4) { - io->ipsec_out_inaf = AF_INET; - io->ipsec_out_insrc[0] = + ixa->ixa_ipsec_inaf = AF_INET; + ixa->ixa_ipsec_insrc[0] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v4; - io->ipsec_out_indst[0] = + ixa->ixa_ipsec_indst[0] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v4; } else { - io->ipsec_out_inaf = AF_INET6; - io->ipsec_out_insrc[0] = + ixa->ixa_ipsec_inaf = AF_INET6; + ixa->ixa_ipsec_insrc[0] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[0]; - io->ipsec_out_insrc[1] = + ixa->ixa_ipsec_insrc[1] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[1]; - io->ipsec_out_insrc[2] = + ixa->ixa_ipsec_insrc[2] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[2]; - io->ipsec_out_insrc[3] = + ixa->ixa_ipsec_insrc[3] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[3]; - io->ipsec_out_indst[0] = + ixa->ixa_ipsec_indst[0] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[0]; - io->ipsec_out_indst[1] = + ixa->ixa_ipsec_indst[1] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[1]; - io->ipsec_out_indst[2] = + ixa->ixa_ipsec_indst[2] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[2]; - io->ipsec_out_indst[3] = + ixa->ixa_ipsec_indst[3] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[3]; } - io->ipsec_out_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen; - io->ipsec_out_indstpfx = 
pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen; + ixa->ixa_ipsec_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen; + ixa->ixa_ipsec_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen; /* NOTE: These are used for transport mode too. */ - io->ipsec_out_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport; - io->ipsec_out_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport; - io->ipsec_out_proto = pol->ipsp_sel->ipsl_key.ipsl_proto; + ixa->ixa_ipsec_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport; + ixa->ixa_ipsec_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport; + ixa->ixa_ipsec_proto = pol->ipsp_sel->ipsl_key.ipsl_proto; - /* - * The mp pointer still valid - * Add ipsec_out to each fragment. - * The fragment head already has one - */ - nmp = mp->b_next; - mp->b_next = NULL; - mp = nmp; - ASSERT(ipsec_mp != NULL); - while (mp != NULL) { - nmp = mp->b_next; - ipsec_mp->b_next = ipsec_out_tag(ipsec_mp_head, mp, ns); - if (ipsec_mp->b_next == NULL) { - ip_drop_packet_chain(ipsec_mp_head, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ipsec_mp = ipsec_mp->b_next; - mp->b_next = NULL; - mp = nmp; - } - return (ipsec_mp_head); + return (mp); } /* @@ -5937,16 +5446,28 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, * calls ip_drop_packet() for me on NULL returns. */ mblk_t * -ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol, +ipsec_check_ipsecin_policy_reasm(mblk_t *attr_mp, ipsec_policy_t *pol, ipha_t *inner_ipv4, ip6_t *inner_ipv6, uint64_t pkt_unique, netstack_t *ns) { - /* Assume ipsec_mp is a chain of b_next-linked IPSEC_IN M_CTLs. */ + /* Assume attr_mp is a chain of b_next-linked ip_recv_attr mblk. 
*/ mblk_t *data_chain = NULL, *data_tail = NULL; - mblk_t *ii_next; - - while (ipsec_mp != NULL) { - ii_next = ipsec_mp->b_next; - ipsec_mp->b_next = NULL; /* No tripping asserts. */ + mblk_t *next; + mblk_t *data_mp; + ip_recv_attr_t iras; + + while (attr_mp != NULL) { + ASSERT(ip_recv_attr_is_mblk(attr_mp)); + next = attr_mp->b_next; + attr_mp->b_next = NULL; /* No tripping asserts. */ + + data_mp = attr_mp->b_cont; + attr_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(attr_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + freemsg(data_mp); /* ip_drop_packet?? */ + ira_cleanup(&iras, B_TRUE); + goto fail; + } /* * Need IPPOL_REFHOLD(pol) for extras because @@ -5954,67 +5475,67 @@ ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol, */ IPPOL_REFHOLD(pol); - if (ipsec_check_ipsecin_policy(ipsec_mp, pol, inner_ipv4, - inner_ipv6, pkt_unique, ns) != NULL) { - if (data_tail == NULL) { - /* First one */ - data_chain = data_tail = ipsec_mp->b_cont; - } else { - data_tail->b_next = ipsec_mp->b_cont; - data_tail = data_tail->b_next; - } - freeb(ipsec_mp); + data_mp = ipsec_check_ipsecin_policy(data_mp, pol, inner_ipv4, + inner_ipv6, pkt_unique, &iras, ns); + ira_cleanup(&iras, B_TRUE); + + if (data_mp == NULL) + goto fail; + + if (data_tail == NULL) { + /* First one */ + data_chain = data_tail = data_mp; } else { - /* - * ipsec_check_ipsecin_policy() freed ipsec_mp - * already. Need to get rid of any extra pol - * references, and any remaining bits as well. - */ - IPPOL_REFRELE(pol, ns); - ipsec_freemsg_chain(data_chain); - ipsec_freemsg_chain(ii_next); /* ipdrop stats? */ - return (NULL); + data_tail->b_next = data_mp; + data_tail = data_mp; } - ipsec_mp = ii_next; + attr_mp = next; } /* * One last release because either the loop bumped it up, or we never * called ipsec_check_ipsecin_policy(). */ - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); /* data_chain is ready for return to tun module. 
*/ return (data_chain); -} +fail: + /* + * Need to get rid of any extra pol + * references, and any remaining bits as well. + */ + IPPOL_REFRELE(pol); + ipsec_freemsg_chain(data_chain); + ipsec_freemsg_chain(next); /* ipdrop stats? */ + return (NULL); +} /* - * Returns B_TRUE if the inbound packet passed an IPsec policy check. Returns - * B_FALSE if it failed or if it is a fragment needing its friends before a + * Return a message if the inbound packet passed an IPsec policy check. Returns + * NULL if it failed or if it is a fragment needing its friends before a * policy check can be performed. * - * Expects a non-NULL *data_mp, an optional ipsec_mp, and a non-NULL polhead. - * data_mp may be reassigned with a b_next chain of packets if fragments + * Expects a non-NULL data_mp, and a non-NULL polhead. + * The returned mblk may be a b_next chain of packets if fragments * neeeded to be collected for a proper policy check. * - * Always frees ipsec_mp, but only frees data_mp if returns B_FALSE. This - * function calls ip_drop_packet() on data_mp if need be. + * This function calls ip_drop_packet() on data_mp if need be. * * NOTE: outer_hdr_len is signed. If it's a negative value, the caller * is inspecting an ICMP packet. */ -boolean_t -ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, +mblk_t * +ipsec_tun_inbound(ip_recv_attr_t *ira, mblk_t *data_mp, ipsec_tun_pol_t *itp, ipha_t *inner_ipv4, ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len, netstack_t *ns) { ipsec_policy_head_t *polhead; ipsec_selector_t sel; - mblk_t *message = (ipsec_mp == NULL) ? 
*data_mp : ipsec_mp; ipsec_policy_t *pol; uint16_t tmpport; selret_t rc; - boolean_t retval, port_policy_present, is_icmp, global_present; + boolean_t port_policy_present, is_icmp, global_present; in6_addr_t tmpaddr; ipaddr_t tmp4; uint8_t flags, *inner_hdr; @@ -6032,7 +5553,6 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, ASSERT(inner_ipv4 != NULL && inner_ipv6 == NULL || inner_ipv4 == NULL && inner_ipv6 != NULL); - ASSERT(message == *data_mp || message->b_cont == *data_mp); if (outer_hdr_len < 0) { outer_hdr_len = (-outer_hdr_len); @@ -6042,6 +5562,8 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) { + mblk_t *mp = data_mp; + polhead = itp->itp_policy; /* * We need to perform full Tunnel-Mode enforcement, @@ -6061,53 +5583,66 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, flags = ((port_policy_present ? SEL_PORT_POLICY : SEL_NONE) | (is_icmp ? SEL_IS_ICMP : SEL_NONE) | SEL_TUNNEL_MODE); - rc = ipsec_init_inbound_sel(&sel, *data_mp, inner_ipv4, + rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4, inner_ipv6, flags); switch (rc) { case SELRET_NOMEM: - ip_drop_packet(message, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_TUNFRAG: /* * At this point, if we're cleartext, we don't want * to go there. */ - if (ipsec_mp == NULL) { - ip_drop_packet(*data_mp, B_TRUE, NULL, NULL, + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); - *data_mp = NULL; - return (B_FALSE); + return (NULL); + } + /* + * If we need to queue the packet. First we + * get an mblk with the attributes. 
ipsec_fragcache_add + * will prepend that to the queued data and return + * a list of b_next messages each of which starts with + * the attribute mblk. + */ + mp = ip_recv_attr_to_mblk(ira); + if (mp == NULL) { + ip_drop_packet(data_mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_nomem), + &ipss->ipsec_spd_dropper); + return (NULL); } - ASSERT(((ipsec_in_t *)ipsec_mp->b_rptr)-> - ipsec_in_secure); - message = ipsec_fragcache_add(&itp->itp_fragcache, - ipsec_mp, *data_mp, outer_hdr_len, ipss); + mp = ipsec_fragcache_add(&itp->itp_fragcache, + mp, data_mp, outer_hdr_len, ipss); - if (message == NULL) { + if (mp == NULL) { /* * Data is cached, fragment chain is not - * complete. I consume ipsec_mp and data_mp + * complete. */ - return (B_FALSE); + return (NULL); } /* * If we get here, we have a full fragment chain. * Reacquire headers and selectors from first fragment. */ - inner_hdr = message->b_cont->b_rptr; + ASSERT(ip_recv_attr_is_mblk(mp)); + data_mp = mp->b_cont; + inner_hdr = data_mp->b_rptr; if (outer_ipv4 != NULL) { inner_hdr += IPH_HDR_LENGTH( - (ipha_t *)message->b_cont->b_rptr); + (ipha_t *)data_mp->b_rptr); } else { - inner_hdr += ip_hdr_length_v6(message->b_cont, - (ip6_t *)message->b_cont->b_rptr); + inner_hdr += ip_hdr_length_v6(data_mp, + (ip6_t *)data_mp->b_rptr); } - ASSERT(inner_hdr <= message->b_cont->b_wptr); + ASSERT(inner_hdr <= data_mp->b_wptr); if (inner_ipv4 != NULL) { inner_ipv4 = (ipha_t *)inner_hdr; @@ -6121,7 +5656,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * Use SEL_TUNNEL_MODE to take into account the outer * header. Use SEL_POST_FRAG so we always get ports. 
*/ - rc = ipsec_init_inbound_sel(&sel, message->b_cont, + rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4, inner_ipv6, SEL_TUNNEL_MODE | SEL_POST_FRAG); switch (rc) { @@ -6132,17 +5667,15 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, */ break; case SELRET_NOMEM: - ip_drop_packet_chain(message, B_TRUE, - NULL, NULL, + ip_drop_packet_chain(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_BADPKT: - ip_drop_packet_chain(message, B_TRUE, - NULL, NULL, + ip_drop_packet_chain(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_TUNFRAG: cmn_err(CE_WARN, "(TUNFRAG on 2nd call...)"); /* FALLTHRU */ @@ -6151,7 +5684,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, " returns bizarro 0x%x", rc); /* Guaranteed panic! */ ASSERT(rc == SELRET_NOMEM); - return (B_FALSE); + return (NULL); } /* FALLTHRU */ case SELRET_SUCCESS: @@ -6174,7 +5707,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, "ipsec_init_inbound_sel() returns bizarro 0x%x", rc); ASSERT(rc == SELRET_NOMEM); /* Guaranteed panic! */ - return (B_FALSE); + return (NULL); } if (is_icmp) { @@ -6192,42 +5725,54 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, /* find_policy_head() */ rw_enter(&polhead->iph_lock, RW_READER); pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND, - &sel, ns); + &sel); rw_exit(&polhead->iph_lock); if (pol != NULL) { - if (ipsec_mp == NULL || - !((ipsec_in_t *)ipsec_mp->b_rptr)-> - ipsec_in_secure) { - retval = pol->ipsp_act->ipa_allow_clear; - if (!retval) { + uint64_t pkt_unique; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + if (!pol->ipsp_act->ipa_allow_clear) { /* * XXX should never get here with * tunnel reassembled fragments? 
*/ - ASSERT(message->b_next == NULL); - ip_drop_packet(message, B_TRUE, NULL, - NULL, + ASSERT(mp == data_mp); + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); - } else if (ipsec_mp != NULL) { - freeb(ipsec_mp); + IPPOL_REFRELE(pol); + return (NULL); + } else { + IPPOL_REFRELE(pol); + return (mp); } - - IPPOL_REFRELE(pol, ns); - return (retval); } + pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port, + sel.ips_local_port, + (inner_ipv4 == NULL) ? IPPROTO_IPV6 : + IPPROTO_ENCAP, sel.ips_protocol); + /* * NOTE: The following releases pol's reference and * calls ip_drop_packet() for me on NULL returns. * * "sel" is still good here, so let's use it! */ - *data_mp = ipsec_check_ipsecin_policy_reasm(message, - pol, inner_ipv4, inner_ipv6, SA_UNIQUE_ID( - sel.ips_remote_port, sel.ips_local_port, - (inner_ipv4 == NULL) ? IPPROTO_IPV6 : - IPPROTO_ENCAP, sel.ips_protocol), ns); - return (*data_mp != NULL); + if (data_mp == mp) { + /* A single packet without attributes */ + data_mp = ipsec_check_ipsecin_policy(data_mp, + pol, inner_ipv4, inner_ipv6, pkt_unique, + ira, ns); + } else { + /* + * We pass in the b_next chain of attr_mp's + * and get back a b_next chain of data_mp's. + */ + data_mp = ipsec_check_ipsecin_policy_reasm(mp, + pol, inner_ipv4, inner_ipv6, pkt_unique, + ns); + } + return (data_mp); } /* @@ -6237,11 +5782,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * a new-style tunnel-mode tunnel. */ if ((itp->itp_flags & ITPF_P_TUNNEL) && !is_icmp) { - ip_drop_packet_chain(message, B_TRUE, NULL, - NULL, + ip_drop_packet_chain(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); } } @@ -6251,24 +5795,22 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * tunnel-mode tunnel, which either returns with a pass, or gets * hit by the ip_drop_packet_chain() call right above here. 
*/ + ASSERT(data_mp->b_next == NULL); /* If no per-tunnel security, check global policy now. */ - if (ipsec_mp != NULL && !global_present) { - if (((ipsec_in_t *)(ipsec_mp->b_rptr))-> - ipsec_in_icmp_loopback) { + if ((ira->ira_flags & IRAF_IPSEC_SECURE) && !global_present) { + if (ira->ira_flags & IRAF_TRUSTED_ICMP) { /* - * This is an ICMP message with an ipsec_mp - * attached. We should accept it. + * This is an ICMP message that was geenrated locally. + * We should accept it. */ - if (ipsec_mp != NULL) - freeb(ipsec_mp); - return (B_TRUE); + return (data_mp); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_secure), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); } if (is_icmp) { @@ -6294,11 +5836,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } } - /* NOTE: Frees message if it returns NULL. */ - if (ipsec_check_global_policy(message, NULL, outer_ipv4, outer_ipv6, - (ipsec_mp != NULL), ns) == NULL) { - return (B_FALSE); - } + data_mp = ipsec_check_global_policy(data_mp, NULL, outer_ipv4, + outer_ipv6, ira, ns); + if (data_mp == NULL) + return (NULL); if (is_icmp) { /* Set things back to normal. */ @@ -6314,14 +5855,11 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } } - if (ipsec_mp != NULL) - freeb(ipsec_mp); - /* * At this point, we pretend it's a cleartext accepted * packet. 
*/ - return (B_TRUE); + return (data_mp); } /* @@ -6365,7 +5903,7 @@ itp_unlink(ipsec_tun_pol_t *node, netstack_t *ns) rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER); ipss->ipsec_tunnel_policy_gen++; - ipsec_fragcache_uninit(&node->itp_fragcache); + ipsec_fragcache_uninit(&node->itp_fragcache, ipss); avl_remove(&ipss->ipsec_tunnel_policies, node); rw_exit(&ipss->ipsec_tunnel_policy_lock); ITP_REFRELE(node, ns); @@ -6615,7 +6153,7 @@ ipsec_fragcache_init(ipsec_fragcache_t *frag) } void -ipsec_fragcache_uninit(ipsec_fragcache_t *frag) +ipsec_fragcache_uninit(ipsec_fragcache_t *frag, ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *fep; int i; @@ -6627,7 +6165,7 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag) fep = (frag->itpf_ptr)[i]; while (fep != NULL) { /* Returned fep is next in chain or NULL */ - fep = fragcache_delentry(i, fep, frag); + fep = fragcache_delentry(i, fep, frag, ipss); } } /* @@ -6658,10 +6196,12 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag) /* * Add a fragment to the fragment cache. Consumes mp if NULL is returned. * Returns mp if a whole fragment has been assembled, NULL otherwise + * The returned mp could be a b_next chain of fragments. + * + * The iramp argument is set on inbound; NULL if outbound. */ - mblk_t * -ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, +ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *iramp, mblk_t *mp, int outer_hdr_len, ipsec_stack_t *ipss) { boolean_t is_v4; @@ -6672,7 +6212,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, uint8_t v6_proto; uint8_t *v6_proto_p; uint16_t ip6_hdr_length; - ip6_pkt_t ipp; + ip_pkt_t ipp; ip6_frag_t *fraghdr; ipsec_fragcache_entry_t *fep; int i; @@ -6680,10 +6220,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, int firstbyte, lastbyte; int offset; int last; - boolean_t inbound = (ipsec_mp != NULL); - mblk_t *first_mp = inbound ? 
ipsec_mp : mp; - - ASSERT(first_mp == mp || first_mp->b_cont == mp); + boolean_t inbound = (iramp != NULL); /* * You're on the slow path, so insure that every packet in the @@ -6692,14 +6229,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (mp->b_cont != NULL) { nmp = msgpullup(mp, -1); if (nmp == NULL) { - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } freemsg(mp); - if (ipsec_mp != NULL) - ipsec_mp->b_cont = nmp; mp = nmp; } @@ -6721,9 +6258,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, * If it fails we have a malformed packet */ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_packet), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } else { v6_proto = *v6_proto_p; @@ -6731,16 +6270,18 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL); if (!(ipp.ipp_fields & IPPF_FRAGHDR)) { /* * We think this is a fragment, but didn't find * a fragment header. Something is wrong. 
*/ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } fraghdr = ipp.ipp_fraghdr; @@ -6759,7 +6300,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, */ itpf_time = gethrestime_sec(); if (itpf_time >= frag->itpf_expire_hint) - ipsec_fragcache_clean(frag); + ipsec_fragcache_clean(frag, ipss); /* Lookup to see if there is an existing entry */ @@ -6814,11 +6355,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, /* check for bogus fragments and delete the entry */ if (firstbyte > 0 && firstbyte <= 8) { if (fep != NULL) - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } @@ -6826,12 +6369,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (fep == NULL) { if (frag->itpf_freelist == NULL) { /* see if there is some space */ - ipsec_fragcache_clean(frag); + ipsec_fragcache_clean(frag, ipss); if (frag->itpf_freelist == NULL) { mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } } @@ -6879,7 +6424,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, ipha_t *niph; ipha_t *oniph; ip6_t *nip6h; - ip6_pkt_t nipp; + ip_pkt_t nipp; ip6_frag_t *nfraghdr; uint16_t nip6_hdr_length; uint8_t *nv6_proto_p; @@ -6929,14 +6474,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t 
*mp, if (!ip_hdr_length_nexthdr_v6(ndata_mp, nip6h, &nip6_hdr_length, &nv6_proto_p)) { mutex_exit(&frag->itpf_lock); - ip_drop_packet_chain(nmp, inbound, NULL, NULL, + ip_drop_packet_chain(nmp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); ipsec_freemsg_chain(ndata_mp); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } bzero(&nipp, sizeof (nipp)); - (void) ip_find_hdr_v6(ndata_mp, nip6h, &nipp, NULL); + (void) ip_find_hdr_v6(ndata_mp, nip6h, B_FALSE, &nipp, + NULL); nfraghdr = nipp.ipp_fraghdr; nfirstbyte = ntohs(nfraghdr->ip6f_offlg & IP6F_OFF_MASK); @@ -6968,11 +6516,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) - firstbyte)) { /* Overlapping data does not match */ - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_overlap_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } /* Part of defense for jolt2.c fragmentation attack */ @@ -6987,9 +6537,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, * ---------- ------ */ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_evil_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } @@ -7027,12 +6579,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) - nfirstbyte)) { /* Overlap mismatch */ - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, + ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, - NULL, DROPPER(ipss, + ip_drop_packet(mp, inbound, NULL, + 
DROPPER(ipss, ipds_spd_overlap_frag), &ipss->ipsec_spd_dropper); + if (inbound) { + (void) ip_recv_attr_free_mblk( + iramp); + } return (NULL); } } @@ -7046,21 +6603,31 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, prevmp = nmp; } - first_mp->b_next = nmp; + /* Prepend the attributes before we link it in */ + if (iramp != NULL) { + ASSERT(iramp->b_cont == NULL); + iramp->b_cont = mp; + mp = iramp; + iramp = NULL; + } + mp->b_next = nmp; if (prevmp == NULL) { - fep->itpfe_fraglist = first_mp; + fep->itpfe_fraglist = mp; } else { - prevmp->b_next = first_mp; + prevmp->b_next = mp; } if (last) fep->itpfe_last = 1; /* Part of defense for jolt2.c fragmentation attack */ if (++(fep->itpfe_depth) > IPSEC_MAX_FRAGS) { - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + if (inbound) + mp = ip_recv_attr_free_mblk(mp); + + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_max_frags), &ipss->ipsec_spd_dropper); return (NULL); @@ -7078,7 +6645,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, #ifdef FRAGCACHE_DEBUG cmn_err(CE_WARN, "Last fragment cached.\n"); - cmn_err(CE_WARN, "mp = %p, first_mp = %p.\n", mp, first_mp); + cmn_err(CE_WARN, "mp = %p\n", mp); #endif offset = 0; @@ -7118,14 +6685,15 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (!ip_hdr_length_nexthdr_v6(data_mp, ip6h, &ip6_hdr_length, &v6_proto_p)) { mutex_exit(&frag->itpf_lock); - ip_drop_packet_chain(mp, inbound, NULL, NULL, + ip_drop_packet_chain(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); return (NULL); } v6_proto = *v6_proto_p; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, + NULL); fraghdr = ipp.ipp_fraghdr; firstbyte = ntohs(fraghdr->ip6f_offlg & 
IP6F_OFF_MASK); @@ -7163,7 +6731,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, (!is_v4 && !(fraghdr->ip6f_offlg & IP6F_MORE_FRAG))) { mp = fep->itpfe_fraglist; fep->itpfe_fraglist = NULL; - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); if ((is_v4 && (firstbyte + ntohs(iph->ipha_length) > @@ -7171,7 +6739,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, ntohs(ip6h->ip6_plen) > 65535))) { /* It is an invalid "ping-o-death" packet */ /* Discard it */ - ip_drop_packet_chain(mp, inbound, NULL, NULL, + ip_drop_packet_chain(mp, inbound, NULL, DROPPER(ipss, ipds_spd_evil_frag), &ipss->ipsec_spd_dropper); return (NULL); @@ -7181,7 +6749,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, "mp->b_next = %p", mp, mp->b_next); #endif /* - * For inbound case, mp has ipsec_in b_next'd chain + * For inbound case, mp has attrmp b_next'd chain * For outbound case, it is just data mp chain */ return (mp); @@ -7202,7 +6770,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, } static void -ipsec_fragcache_clean(ipsec_fragcache_t *frag) +ipsec_fragcache_clean(ipsec_fragcache_t *frag, ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *fep; int i; @@ -7221,7 +6789,7 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag) while (fep) { if (fep->itpfe_exp < itpf_time) { /* found */ - fep = fragcache_delentry(i, fep, frag); + fep = fragcache_delentry(i, fep, frag, ipss); } else { if (fep->itpfe_exp < earlyexp) { earlyfep = fep; @@ -7237,12 +6805,12 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag) /* if (!found) */ if (frag->itpf_freelist == NULL) - (void) fragcache_delentry(earlyi, earlyfep, frag); + (void) fragcache_delentry(earlyi, earlyfep, frag, ipss); } static ipsec_fragcache_entry_t * fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep, - ipsec_fragcache_t *frag) + ipsec_fragcache_t *frag, 
ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *targp; ipsec_fragcache_entry_t *nextp = fep->itpfe_next; @@ -7250,7 +6818,12 @@ fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep, ASSERT(MUTEX_HELD(&frag->itpf_lock)); /* Free up any fragment list still in cache entry */ - ipsec_freemsg_chain(fep->itpfe_fraglist); + if (fep->itpfe_fraglist != NULL) { + ip_drop_packet_chain(fep->itpfe_fraglist, + ip_recv_attr_is_mblk(fep->itpfe_fraglist), NULL, + DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + } + fep->itpfe_fraglist = NULL; targp = (frag->itpf_ptr)[slot]; ASSERT(targp != 0); diff --git a/usr/src/uts/common/inet/ip/spdsock.c b/usr/src/uts/common/inet/ip/spdsock.c index e15d23fdd8..1b25af4a97 100644 --- a/usr/src/uts/common/inet/ip/spdsock.c +++ b/usr/src/uts/common/inet/ip/spdsock.c @@ -58,7 +58,6 @@ #include <inet/nd.h> #include <inet/ip_if.h> #include <inet/optcom.h> -#include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/spdsock.h> #include <inet/sadb.h> @@ -1150,9 +1149,8 @@ spdsock_addrule(queue_t *q, ipsec_policy_head_t *iph, mblk_t *mp, fail: rw_exit(&iph->iph_lock); - while ((--rulep) >= &rules[0]) { - IPPOL_REFRELE(rulep->pol, spds->spds_netstack); - } + while ((--rulep) >= &rules[0]) + IPPOL_REFRELE(rulep->pol); ipsec_actvec_free(actp, nact); fail2: if (itp != NULL) { @@ -2519,8 +2517,8 @@ error: * be invoked either once IPsec is loaded on a cached request, or * when a request is received while IPsec is loaded. 
*/ -static void -spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) +static int +spdsock_do_updatealg(spd_ext_t *extv[], spd_stack_t *spds) { struct spd_ext_actions *actp; struct spd_attribute *attr, *endattr; @@ -2529,17 +2527,15 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) ipsec_algtype_t alg_type = 0; boolean_t skip_alg = B_TRUE, doing_proto = B_FALSE; uint_t i, cur_key, cur_block, algid; + int diag = -1; - *diag = -1; ASSERT(MUTEX_HELD(&spds->spds_alg_lock)); /* parse the message, building the list of algorithms */ actp = (struct spd_ext_actions *)extv[SPD_EXT_ACTION]; - if (actp == NULL) { - *diag = SPD_DIAGNOSTIC_NO_ACTION_EXT; - return; - } + if (actp == NULL) + return (SPD_DIAGNOSTIC_NO_ACTION_EXT); start = (uint64_t *)actp; end = (start + actp->spd_actions_len); @@ -2583,7 +2579,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) ss1dbg(spds, ("spdsock_do_updatealg: " "invalid alg id %d\n", attr->spd_attr_value)); - *diag = SPD_DIAGNOSTIC_ALG_ID_RANGE; + diag = SPD_DIAGNOSTIC_ALG_ID_RANGE; goto bail; } alg->alg_id = attr->spd_attr_value; @@ -2623,7 +2619,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_key >= alg->alg_nkey_sizes) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many key sizes\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES; goto bail; } alg->alg_key_sizes[cur_key++] = attr->spd_attr_value; @@ -2659,7 +2655,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_block >= alg->alg_nblock_sizes) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many block sizes\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; goto bail; } alg->alg_block_sizes[cur_block++] = @@ -2686,7 +2682,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_block >= alg->alg_nparams) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many 
params\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; goto bail; } /* @@ -2703,7 +2699,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) if (attr->spd_attr_value > CRYPTO_MAX_MECH_NAME) { ss1dbg(spds, ("spdsock_do_updatealg: " "mech name too long\n")); - *diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN; + diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN; goto bail; } mech_name = (char *)(attr + 1); @@ -2751,6 +2747,7 @@ bail: for (algid = 0; algid < IPSEC_MAX_ALGS; algid++) if (spds->spds_algs[alg_type][algid] != NULL) ipsec_alg_free(spds->spds_algs[alg_type][algid]); + return (diag); } /* @@ -2803,9 +2800,12 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[]) int diag; mutex_enter(&spds->spds_alg_lock); - spdsock_do_updatealg(extv, &diag, spds); - mutex_exit(&spds->spds_alg_lock); + diag = spdsock_do_updatealg(extv, spds); if (diag == -1) { + /* Keep the lock held while we walk the SA tables. */ + sadb_alg_update(IPSEC_ALG_ALL, 0, 0, + spds->spds_netstack); + mutex_exit(&spds->spds_alg_lock); spd_echo(q, mp); if (audit_active) { cred_t *cr; @@ -2817,6 +2817,7 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[]) cpid); } } else { + mutex_exit(&spds->spds_alg_lock); spdsock_diag(q, mp, diag); if (audit_active) { cred_t *cr; @@ -3117,10 +3118,7 @@ spdsock_update_pending_algs(netstack_t *ns) mutex_enter(&spds->spds_alg_lock); if (spds->spds_algs_pending) { - int diag; - - spdsock_do_updatealg(spds->spds_extv_algs, &diag, - spds); + (void) spdsock_do_updatealg(spds->spds_extv_algs, spds); spds->spds_algs_pending = B_FALSE; } mutex_exit(&spds->spds_alg_lock); @@ -3265,7 +3263,7 @@ spdsock_opt_get(queue_t *q, int level, int name, uchar_t *ptr) int spdsock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)invalp; 
spdsock_t *ss = (spdsock_t *)q->q_ptr; @@ -3337,11 +3335,9 @@ spdsock_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)mp->b_rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &spdsock_opt_obj, B_FALSE); + svr4_optcom_req(q, mp, cr, &spdsock_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &spdsock_opt_obj, B_FALSE); + tpi_optcom_req(q, mp, cr, &spdsock_opt_obj); } break; case T_DATA_REQ: diff --git a/usr/src/uts/common/inet/ip/spdsock_opt_data.c b/usr/src/uts/common/inet/ip/spdsock_opt_data.c index df797bb37a..c5438f29cc 100644 --- a/usr/src/uts/common/inet/ip/spdsock_opt_data.c +++ b/usr/src/uts/common/inet/ip/spdsock_opt_data.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 1 @@ -53,9 +51,9 @@ */ opdes_t spdsock_opt_arr[] = { - { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT, + { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT, + { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, 0, (t_uscalar_t)sizeof (int), 0 }, }; @@ -88,7 +86,6 @@ optdb_obj_t spdsock_opt_obj = { NULL, /* SPDSOCK default value function pointer */ spdsock_opt_get, /* SPDSOCK get function pointer */ spdsock_opt_set, /* SPDSOCK set function pointer */ - B_TRUE, /* SPDSOCK is tpi provider */ SPDSOCK_OPT_ARR_CNT, /* SPDSOCK option database count of entries */ spdsock_opt_arr, /* SPDSOCK option database */ SPDSOCK_VALID_LEVELS_CNT, /* SPDSOCK valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/tn_ipopt.c b/usr/src/uts/common/inet/ip/tn_ipopt.c index 359b8d4623..1ce050ec69 100644 --- a/usr/src/uts/common/inet/ip/tn_ipopt.c +++ b/usr/src/uts/common/inet/ip/tn_ipopt.c @@ -271,38 +271,40 @@ tsol_get_option_v6(mblk_t *mp, 
tsol_ip_label_t *label_type, uchar_t **buffer) * tsol_check_dest() * * This routine verifies if a destination is allowed to recieve messages - * based on the message cred's security label. If any adjustments to - * the cred are needed due to the connection's MAC mode or - * the destination's ability to receive labels, an "effective cred" - * will be returned. + * based on the security label. If any adjustments to the label are needed + * due to the connection's MAC mode or the destination's ability + * to receive labels, an "effective label" will be returned. + * + * zone_is_global is set if the actual zoneid is global. That is, it is + * not set for an exclusive-IP zone. * - * On successful return, effective_cred will point to the new creds needed - * or will be NULL if new creds aren't needed. On error, effective_cred - * is NULL. + * On successful return, effective_tsl will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. 
* * Returns: - * 0 Have or constructed appropriate credentials - * EHOSTUNREACH The credentials failed the remote host accreditation + * 0 Label (was|is now) correct + * EHOSTUNREACH The label failed the remote host accreditation * ENOMEM Memory allocation failure */ int -tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, - uint_t mac_mode, cred_t **effective_cred) +tsol_check_dest(const ts_label_t *tsl, const void *dst, + uchar_t version, uint_t mac_mode, boolean_t zone_is_global, + ts_label_t **effective_tsl) { - ts_label_t *tsl, *newtsl = NULL; + ts_label_t *newtsl = NULL; tsol_tpc_t *dst_rhtp; - zoneid_t zoneid; - if (effective_cred != NULL) - *effective_cred = NULL; + if (effective_tsl != NULL) + *effective_tsl = NULL; ASSERT(version == IPV4_VERSION || (version == IPV6_VERSION && !IN6_IS_ADDR_V4MAPPED((in6_addr_t *)dst))); /* Always pass kernel level communication (NULL label) */ - if ((tsl = crgetlabel(credp)) == NULL) { + if (tsl == NULL) { DTRACE_PROBE2(tx__tnopt__log__info__labeling__mac__allownull, - char *, "destination ip(1) with null cred was passed", + char *, "destination ip(1) with null label was passed", ipaddr_t, dst); return (0); } @@ -358,9 +360,8 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, } if (!blequal(&dst_rhtp->tpc_tp.tp_def_label, &tsl->tsl_label)) { - zoneid = crgetzoneid(credp); if (mac_mode != CONN_MAC_AWARE || - !(zoneid == GLOBAL_ZONEID || + !(zone_is_global || bldominates(&tsl->tsl_label, &dst_rhtp->tpc_tp.tp_def_label))) { DTRACE_PROBE4( @@ -438,51 +439,43 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, } /* - * Generate a new cred if we modified the security label or - * label flags. + * Return the new label. 
*/ if (newtsl != NULL) { - if (effective_cred != NULL) { - *effective_cred = copycred_from_tslabel(credp, - newtsl, KM_NOSLEEP); - } - label_rele(newtsl); - if (effective_cred != NULL && *effective_cred == NULL) { - TPC_RELE(dst_rhtp); - return (ENOMEM); - } + if (effective_tsl != NULL) + *effective_tsl = newtsl; + else + label_rele(newtsl); } TPC_RELE(dst_rhtp); return (0); } /* - * tsol_compute_label() + * tsol_compute_label_v4() * * This routine computes the IP label that should be on a packet based on the * connection and destination information. * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exclusive-IP zones). + * * Returns: * 0 Fetched label * EHOSTUNREACH No route to destination * EINVAL Label cannot be computed */ int -tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, - ip_stack_t *ipst) +tsol_compute_label_v4(const ts_label_t *tsl, zoneid_t zoneid, ipaddr_t dst, + uchar_t *opt_storage, ip_stack_t *ipst) { uint_t sec_opt_len; - ts_label_t *tsl; - ire_t *ire, *sire = NULL; - tsol_ire_gw_secattr_t *attrp; - zoneid_t zoneid, ip_zoneid; - - ASSERT(credp != NULL); + ire_t *ire; + tsol_ire_gw_secattr_t *attrp = NULL; if (opt_storage != NULL) opt_storage[IPOPT_OLEN] = 0; - if ((tsl = crgetlabel(credp)) == NULL) + if (tsl == NULL) return (0); /* always pass multicast */ @@ -493,67 +486,44 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, return (0); if (tsl->tsl_flags & TSLF_UNLABELED) { - /* * The destination is unlabeled. Only add a label if the * destination is not a broadcast/local/loopback address, * the destination is not on the same subnet, and the * next-hop gateway is labeled. - * - * For exclusive stacks we set the zoneid to zero - * to operate as if we are in the global zone for - * IRE lookups. 
*/ - zoneid = crgetzoneid(credp); - if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) - ip_zoneid = GLOBAL_ZONEID; - else - ip_zoneid = zoneid; - - ire = ire_cache_lookup(dst, ip_zoneid, tsl, ipst); - - if (ire != NULL && (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | - IRE_LOOPBACK | IRE_INTERFACE)) != 0) { - IRE_REFRELE(ire); - return (0); - } else if (ire == NULL) { - ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, - ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst); - } - - /* no route to destination */ - if (ire == NULL) { + ire = ire_route_recursive_v4(dst, 0, NULL, zoneid, tsl, + MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + /* no route to destination */ + ire_refrele(ire); DTRACE_PROBE3( tx__tnopt__log__info__labeling__routedst__v4, char *, "No route to unlabeled dest ip(1) with " - "creds(2).", ipaddr_t, dst, cred_t *, credp); + "with label(2).", ipaddr_t, dst, ts_label_t *, tsl); return (EHOSTUNREACH); } + if (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK | + IRE_INTERFACE)) { + ire_refrele(ire); + return (0); + } /* - * Prefix IRE from f-table lookup means that the destination - * is not directly connected; check the next-hop attributes. + * ire_route_recursive gives us the first attrp it finds + * in the recursive lookup. */ - if (sire != NULL) { - ASSERT(ire != NULL); - IRE_REFRELE(ire); - ire = sire; - } - /* * Return now if next hop gateway is unlabeled. There is * no need to generate a CIPSO option for this message. 
*/ - attrp = ire->ire_gw_secattr; if (attrp == NULL || attrp->igsa_rhc == NULL || attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) { - IRE_REFRELE(ire); + ire_refrele(ire); return (0); } - - IRE_REFRELE(ire); - + ire_refrele(ire); } /* compute the CIPSO option */ @@ -562,8 +532,8 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, if (sec_opt_len == 0) { DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v4, - char *, "options lack length for dest ip(1) with creds(2).", - ipaddr_t, dst, cred_t *, credp); + char *, "options lack length for dest ip(1) with label(2).", + ipaddr_t, dst, ts_label_t *, tsl); return (EINVAL); } @@ -575,6 +545,9 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, * header, move the 'buflen' bytes back to fill the gap, and return the number * of bytes removed (as zero or negative number). Assumes that the headers are * sane. + * + * Note that tsol_remove_secopt does not adjust ipha_length but + * tsol_remove_secopt_v6 does adjust ip6_plen. */ int tsol_remove_secopt(ipha_t *ipha, int buflen) @@ -659,6 +632,9 @@ tsol_remove_secopt(ipha_t *ipha, int buflen) * option cannot be inserted. (Note that negative return values are possible * when noops must be compressed, and that only -1 indicates error. Successful * return value is always evenly divisible by 4, by definition.) + * + * Note that tsol_prepend_option does not adjust ipha_length but + * tsol_prepend_option_v6 does adjust ip6_plen. */ int tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen) @@ -810,28 +786,39 @@ tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen) } /* - * tsol_check_label() + * tsol_check_label_v4() * * This routine computes the IP label that should be on the packet based on the - * connection and destination information. If the label is there, it returns - * zero, so the caller knows that the label is syncronized, and further calls - * are not required. 
If the label isn't right, then the right one is inserted. + * connection and destination information. It's called by the IP forwarding + * logic and by ip_output_simple. The ULPs generate the labels before calling + * conn_ip_output. If any adjustments to + * the label are needed due to the connection's MAC-exempt status or + * the destination's ability to receive labels, an "effective label" + * will be returned. * * The packet's header is clear before entering IPsec's engine. * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). + * zone_is_global is set if the actual zoneid is global. + * + * On successful return, effective_tslp will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. + * * Returns: - * 0 Label on packet (was|is now) correct + * 0 Label on (was|is now) correct * EACCES The packet failed the remote host accreditation. * ENOMEM Memory allocation failure. * EINVAL Label cannot be computed */ int -tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, - ip_stack_t *ipst, pid_t pid) +tsol_check_label_v4(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp, + uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst, + ts_label_t **effective_tslp) { mblk_t *mp = *mpp; ipha_t *ipha; - cred_t *effective_cred = NULL; + ts_label_t *effective_tsl = NULL; uchar_t opt_storage[IP_MAX_OPT_LENGTH]; uint_t hlen; uint_t sec_opt_len; @@ -839,19 +826,18 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, int delta_remove = 0, delta_add, adjust; int retv; + *effective_tslp = NULL; opt_storage[IPOPT_OPTVAL] = 0; ipha = (ipha_t *)mp->b_rptr; /* * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred with a modified label - * or label flags. Apply any such cred to the message block - * for use in future routing decisions. 
+ * the security label of the message data. tsol_check_dest() + * may create a new effective label or label flags. */ - retv = tsol_check_dest(credp, &ipha->ipha_dst, IPV4_VERSION, - mac_mode, &effective_cred); + retv = tsol_check_dest(tsl, &ipha->ipha_dst, IPV4_VERSION, + mac_mode, zone_is_global, &effective_tsl); if (retv != 0) return (retv); @@ -859,16 +845,15 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, * Calculate the security label to be placed in the text * of the message (if any). */ - if (effective_cred != NULL) { - if ((retv = tsol_compute_label(effective_cred, + if (effective_tsl != NULL) { + if ((retv = tsol_compute_label_v4(effective_tsl, zoneid, ipha->ipha_dst, opt_storage, ipst)) != 0) { - crfree(effective_cred); + label_rele(effective_tsl); return (retv); } - mblk_setcred(mp, effective_cred, pid); - crfree(effective_cred); + *effective_tslp = effective_tsl; } else { - if ((retv = tsol_compute_label(credp, + if ((retv = tsol_compute_label_v4(tsl, zoneid, ipha->ipha_dst, opt_storage, ipst)) != 0) { return (retv); } @@ -890,10 +875,6 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, return (0); } - if (msg_getcred(mp, NULL) == NULL) { - mblk_setcred(mp, (cred_t *)credp, NOPID); - } - /* * If there is an option there, then it must be the wrong one; delete. 
*/ @@ -918,8 +899,13 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, copylen = 256; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) + if (new_mp == NULL) { + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (ENOMEM); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -948,6 +934,10 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, return (0); param_prob: + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (EINVAL); } @@ -972,19 +962,17 @@ param_prob: * i.e starting from the IP6OPT_LS but not including the pad at the end. * The user must prepend two octets (either padding or next header / length) * and append padding out to the next 8 octet boundary. + * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). */ int -tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, - uchar_t *opt_storage, ip_stack_t *ipst) +tsol_compute_label_v6(const ts_label_t *tsl, zoneid_t zoneid, + const in6_addr_t *dst, uchar_t *opt_storage, ip_stack_t *ipst) { - ts_label_t *tsl; uint_t sec_opt_len; uint32_t doi; - zoneid_t zoneid, ip_zoneid; - ire_t *ire, *sire; - tsol_ire_gw_secattr_t *attrp; - - ASSERT(credp != NULL); + ire_t *ire; + tsol_ire_gw_secattr_t *attrp = NULL; if (ip6opt_ls == 0) return (EINVAL); @@ -992,15 +980,13 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (opt_storage != NULL) opt_storage[IPOPT_OLEN] = 0; - if ((tsl = crgetlabel(credp)) == NULL) + if (tsl == NULL) return (0); /* Always pass multicast */ if (IN6_IS_ADDR_MULTICAST(dst)) return (0); - zoneid = crgetzoneid(credp); - /* * Fill in a V6 label. 
If a new format is added here, make certain * that the maximum size of this label is reflected in sys/tsol/tnet.h @@ -1012,62 +998,41 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (tsl->tsl_flags & TSLF_UNLABELED) { /* * The destination is unlabeled. Only add a label if the - * destination is not broadcast/local/loopback address, + * destination is not a broadcast/local/loopback address, * the destination is not on the same subnet, and the * next-hop gateway is labeled. - * - * For exclusive stacks we set the zoneid to zero to - * operate as if we are in the global zone when - * performing IRE lookups and conn_t comparisons. */ - if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) - ip_zoneid = GLOBAL_ZONEID; - else - ip_zoneid = zoneid; - - sire = NULL; - ire = ire_cache_lookup_v6(dst, ip_zoneid, tsl, ipst); - - if (ire != NULL && (ire->ire_type & (IRE_LOCAL | - IRE_LOOPBACK | IRE_INTERFACE)) != 0) { - IRE_REFRELE(ire); - return (0); - } else if (ire == NULL) { - ire = ire_ftable_lookup_v6(dst, NULL, NULL, 0, NULL, - &sire, ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst); - } - - /* no route to destination */ - if (ire == NULL) { + ire = ire_route_recursive_v6(dst, 0, NULL, zoneid, tsl, + MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + /* no route to destination */ + ire_refrele(ire); DTRACE_PROBE3( tx__tnopt__log__info__labeling__routedst__v6, char *, "No route to unlabeled dest ip6(1) with " - "creds(2).", in6_addr_t *, dst, cred_t *, credp); + "label(2).", in6_addr_t *, dst, ts_label_t *, tsl); return (EHOSTUNREACH); } - + if (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | + IRE_INTERFACE)) { + ire_refrele(ire); + return (0); + } /* - * Prefix IRE from f-table lookup means that the destination - * is not directly connected; check the next-hop attributes. 
+ * ire_route_recursive gives us the first attrp it finds + * in the recursive lookup. */ - if (sire != NULL) { - ASSERT(ire != NULL); - IRE_REFRELE(ire); - ire = sire; - } - /* * Return now if next hop gateway is unlabeled. There is * no need to generate a CIPSO option for this message. */ - attrp = ire->ire_gw_secattr; if (attrp == NULL || attrp->igsa_rhc == NULL || attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) { - IRE_REFRELE(ire); + ire_refrele(ire); return (0); } - IRE_REFRELE(ire); + ire_refrele(ire); } /* compute the CIPSO option */ @@ -1079,7 +1044,7 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (sec_opt_len == 0) { DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v6, char *, "options lack length for dest ip6(1) with " - "creds(2).", in6_addr_t *, dst, cred_t *, credp); + "label(2).", in6_addr_t *, dst, ts_label_t *, tsl); return (EINVAL); } @@ -1188,6 +1153,9 @@ tsol_find_secopt_v6( * Header and data following the label option that is deleted are copied * (i.e. slid backward) to the right position, and returns the number * of bytes removed (as zero or negative number.) + * + * Note that tsol_remove_secopt does not adjust ipha_length but + * tsol_remove_secopt_v6 does adjust ip6_plen. */ int tsol_remove_secopt_v6(ip6_t *ip6h, int buflen) @@ -1286,6 +1254,9 @@ tsol_remove_secopt_v6(ip6_t *ip6h, int buflen) * extra option being added. Header and data following the position where * the label option is inserted are copied (i.e. slid forward) to the right * position. + * + * Note that tsol_prepend_option does not adjust ipha_length but + * tsol_prepend_option_v6 does adjust ip6_plen. */ int tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen) @@ -1368,22 +1339,36 @@ tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen) * tsol_check_label_v6() * * This routine computes the IP label that should be on the packet based on the - * connection and destination information. 
It's called only by the IP - * forwarding logic, because all internal modules atop IP know how to generate - * their own labels. + * connection and destination information. It's called by the IP forwarding + * logic and by ip_output_simple. The ULPs generate the labels before calling + * conn_ip_output. If any adjustments to + * the label are needed due to the connection's MAC-exempt status or + * the destination's ability to receive labels, an "effective label" + * will be returned. + * + * The packet's header is clear before entering IPsec's engine. + * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). + * zone_is_global is set if the actual zoneid is global. + * + * On successful return, effective_tslp will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. * * Returns: - * 0 Label on packet was already correct + * 0 Label on (was|is now) correct * EACCES The packet failed the remote host accreditation. * ENOMEM Memory allocation failure. + * EINVAL Label cannot be computed */ int -tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, - ip_stack_t *ipst, pid_t pid) +tsol_check_label_v6(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp, + uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst, + ts_label_t **effective_tslp) { mblk_t *mp = *mpp; ip6_t *ip6h; - cred_t *effective_cred; + ts_label_t *effective_tsl = NULL; /* * Label option length is limited to IP_MAX_OPT_LENGTH for * symmetry with IPv4. Can be relaxed if needed @@ -1399,16 +1384,16 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, uint_t hbhlen; boolean_t hbh_needed; + *effective_tslp = NULL; + /* * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred with a modified label - * or label flags. 
Apply any such cred to the message block - * for use in future routing decisions. + * the security label of the message data. tsol_check_dest() + * may create a new effective label or label flags. */ ip6h = (ip6_t *)mp->b_rptr; - retv = tsol_check_dest(credp, &ip6h->ip6_dst, IPV6_VERSION, - mode, &effective_cred); + retv = tsol_check_dest(tsl, &ip6h->ip6_dst, IPV6_VERSION, + mac_mode, zone_is_global, &effective_tsl); if (retv != 0) return (retv); @@ -1416,16 +1401,15 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, * Calculate the security label to be placed in the text * of the message (if any). */ - if (effective_cred != NULL) { - if ((retv = tsol_compute_label_v6(effective_cred, + if (effective_tsl != NULL) { + if ((retv = tsol_compute_label_v6(effective_tsl, zoneid, &ip6h->ip6_dst, opt_storage, ipst)) != 0) { - crfree(effective_cred); + label_rele(effective_tsl); return (retv); } - mblk_setcred(mp, effective_cred, pid); - crfree(effective_cred); + *effective_tslp = effective_tsl; } else { - if ((retv = tsol_compute_label_v6(credp, + if ((retv = tsol_compute_label_v6(tsl, zoneid, &ip6h->ip6_dst, opt_storage, ipst)) != 0) return (retv); } @@ -1457,10 +1441,6 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, return (0); } - if (msg_getcred(mp, NULL) == NULL) { - mblk_setcred(mp, (cred_t *)credp, NOPID); - } - if (secopt != NULL && sec_opt_len != 0 && (bcmp(opt_storage, secopt, sec_opt_len + 2) == 0)) { /* The packet has the correct label already */ @@ -1499,8 +1479,13 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, copylen = hdr_len; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) + if (new_mp == NULL) { + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (ENOMEM); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -1522,208 +1507,13 @@ tsol_check_label_v6(const cred_t 
*credp, mblk_t **mpp, uint_t mode, ASSERT(mp->b_wptr + delta_add <= DB_LIM(mp)); mp->b_wptr += delta_add; + /* tsol_prepend_option_v6 has adjusted ip6_plen */ return (0); param_prob: - return (EINVAL); -} - -/* - * Update the given IPv6 "sticky options" structure to contain the provided - * label, which is encoded as an IPv6 option. Existing label is removed if - * necessary, and storage is allocated/freed/resized. - * - * Returns 0 on success, errno on failure. - */ -int -tsol_update_sticky(ip6_pkt_t *ipp, uint_t *labellen, const uchar_t *labelopt) -{ - int rawlen, optlen, newlen; - uchar_t *newopts; - - /* - * rawlen is the size of the IPv6 label to be inserted from labelopt. - * optlen is the total length of that option, including any necessary - * headers and padding. newlen is the new size of the total hop-by-hop - * options buffer, including user options. - */ - ASSERT(*labellen <= ipp->ipp_hopoptslen); - ASSERT((ipp->ipp_hopopts == NULL && ipp->ipp_hopoptslen == 0) || - (ipp->ipp_hopopts != NULL && ipp->ipp_hopoptslen != 0)); - - if ((rawlen = labelopt[1]) != 0) { - rawlen += 2; /* add in header size */ - optlen = (2 + rawlen + 7) & ~7; - } else { - optlen = 0; - } - newlen = ipp->ipp_hopoptslen + optlen - *labellen; - if (newlen == 0 && ipp->ipp_hopopts != NULL) { - /* Deleting all existing hop-by-hop options */ - kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); - ipp->ipp_hopopts = NULL; - ipp->ipp_fields &= ~IPPF_HOPOPTS; - } else if (optlen != *labellen) { - /* If the label not same size as last time, then reallocate */ - if (newlen > IP6_MAX_OPT_LENGTH) - return (EHOSTUNREACH); - newopts = kmem_alloc(newlen, KM_NOSLEEP); - if (newopts == NULL) - return (ENOMEM); - /* - * If the user has hop-by-hop stickyoptions set, then copy his - * options in after the security label. 
- */ - if (ipp->ipp_hopoptslen > *labellen) { - bcopy(ipp->ipp_hopopts + *labellen, newopts + optlen, - ipp->ipp_hopoptslen - *labellen); - /* - * Stomp out any header gunk here - this was the - * previous next-header and option length field. - */ - newopts[optlen] = IP6OPT_PADN; - newopts[optlen + 1] = 0; - } - if (ipp->ipp_hopopts != NULL) - kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); - ipp->ipp_hopopts = (ip6_hbh_t *)newopts; - } - ipp->ipp_hopoptslen = newlen; - *labellen = optlen; - - newopts = (uchar_t *)ipp->ipp_hopopts; - - /* If there are any options, then fix up reported length */ - if (newlen > 0) { - newopts[1] = (newlen + 7) / 8 - 1; - ipp->ipp_fields |= IPPF_HOPOPTS; - } - - /* If there's a label, then insert it now */ - if (optlen > 0) { - /* skip next-header and length fields */ - newopts += 2; - bcopy(labelopt, newopts, rawlen); - newopts += rawlen; - /* make sure padding comes out right */ - optlen -= 2 + rawlen; - if (optlen == 1) { - newopts[0] = IP6OPT_PAD1; - } else if (optlen > 1) { - newopts[0] = IP6OPT_PADN; - optlen -= 2; - newopts[1] = optlen; - if (optlen > 0) - bzero(newopts + 2, optlen); - } - } - return (0); -} - -int -tsol_update_options(uchar_t **opts, uint_t *totlen, uint_t *labellen, - const uchar_t *labelopt) -{ - int optlen, newlen; - uchar_t *newopts; - - optlen = (labelopt[IPOPT_OLEN] + 3) & ~3; - newlen = *totlen + optlen - *labellen; - if (optlen > *labellen) { - if (newlen > IP_MAX_OPT_LENGTH) - return (EHOSTUNREACH); - newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI); - if (newopts == NULL) - return (ENOMEM); - if (*totlen > *labellen) { - bcopy(*opts + *labellen, newopts + optlen, - *totlen - *labellen); - } - if (*opts != NULL) - mi_free((char *)*opts); - *opts = newopts; - } else if (optlen < *labellen) { - if (newlen == 0 && *opts != NULL) { - mi_free((char *)*opts); - *opts = NULL; - } - if (*totlen > *labellen) { - ovbcopy(*opts + *labellen, *opts + optlen, - *totlen - *labellen); - } - } - *totlen = newlen; - 
*labellen = optlen; - if (optlen > 0) { - newopts = *opts; - bcopy(labelopt, newopts, optlen); - /* check if there are user-supplied options that follow */ - if (optlen < newlen) { - /* compute amount of embedded alignment needed */ - optlen -= newopts[IPOPT_OLEN]; - newopts += newopts[IPOPT_OLEN]; - while (--optlen >= 0) - *newopts++ = IPOPT_NOP; - } else if (optlen != newopts[IPOPT_OLEN]) { - /* - * The label option is the only option and it is - * not a multiple of 4 bytes. - */ - optlen -= newopts[IPOPT_OLEN]; - newopts += newopts[IPOPT_OLEN]; - while (--optlen >= 0) - *newopts++ = IPOPT_EOL; - } + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; } - return (0); -} - -/* - * This does the bulk of the processing for setting IPPROTO_IP {T_,}IP_OPTIONS. - */ -boolean_t -tsol_option_set(uchar_t **opts, uint_t *optlen, uint_t labellen, - const uchar_t *useropts, uint_t userlen) -{ - int newlen; - uchar_t *newopts; - - newlen = userlen + labellen; - if (newlen > *optlen) { - /* need more room */ - newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI); - if (newopts == NULL) - return (B_FALSE); - /* - * The supplied *opts can't be NULL in this case, - * since there's an existing label. 
- */ - if (labellen > 0) - bcopy(*opts, newopts, labellen); - if (*opts != NULL) - mi_free((char *)*opts); - *opts = newopts; - } - - if (newlen == 0) { - /* special case -- no remaining IP options at all */ - if (*opts != NULL) { - mi_free((char *)*opts); - *opts = NULL; - } - } else if (userlen > 0) { - /* merge in the user's options */ - newopts = *opts; - if (labellen > 0) { - int extra = labellen - newopts[IPOPT_OLEN]; - - newopts += newopts[IPOPT_OLEN]; - while (--extra >= 0) - *newopts++ = IPOPT_NOP; - } - bcopy(useropts, newopts, userlen); - } - - *optlen = newlen; - return (B_TRUE); + return (EINVAL); } diff --git a/usr/src/uts/common/inet/ip/tnet.c b/usr/src/uts/common/inet/ip/tnet.c index 1e5c0eb170..262d5bc339 100644 --- a/usr/src/uts/common/inet/ip/tnet.c +++ b/usr/src/uts/common/inet/ip/tnet.c @@ -133,16 +133,7 @@ int tsol_strict_error; * - A set of route-related attributes that only get set for prefix * IREs. If this is non-NULL, the prefix IRE has been associated * with a set of gateway security attributes by way of route add/ - * change functionality. This field stays NULL for IRE_CACHEs. - * - * igsa_gcgrp - * - * - Group of gc's which only gets set for IRE_CACHEs. Each of the gc - * points to a gcdb record that contains the security attributes - * used to perform the credential checks of the packet which uses - * the IRE. If the group is not empty, the list of gc's can be - * traversed starting at gcgrp_head. This field stays NULL for - * prefix IREs. + * change functionality. 
*/ static kmem_cache_t *ire_gw_secattr_cache; @@ -223,7 +214,6 @@ ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags) attrp->igsa_rhc = NULL; attrp->igsa_gc = NULL; - attrp->igsa_gcgrp = NULL; return (0); } @@ -257,14 +247,9 @@ ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp) GC_REFRELE(attrp->igsa_gc); attrp->igsa_gc = NULL; } - if (attrp->igsa_gcgrp != NULL) { - GCGRP_REFRELE(attrp->igsa_gcgrp); - attrp->igsa_gcgrp = NULL; - } ASSERT(attrp->igsa_rhc == NULL); ASSERT(attrp->igsa_gc == NULL); - ASSERT(attrp->igsa_gcgrp == NULL); kmem_cache_free(ire_gw_secattr_cache, attrp); } @@ -387,9 +372,6 @@ rtsa_validate(const struct rtsa_s *rp) /* * A brief explanation of the reference counting scheme: * - * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp; - * IRE_CACHEs have it vice-versa. - * * Apart from dynamic references due to to reference holds done * actively by threads, we have the following references: * @@ -402,8 +384,6 @@ rtsa_validate(const struct rtsa_s *rp) * to the gc_refcnt. * * gcgrp_refcnt: - * - An IRE_CACHE that points to an igsa_gcgrp contributes a reference - * to the gcgrp_refcnt of the associated tsol_gcgrp_t. * - Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes * a reference to the gcgrp_refcnt. */ @@ -613,7 +593,6 @@ gcgrp_inactive(tsol_gcgrp_t *gcgrp) mod_hash_t *hashp; ASSERT(MUTEX_HELD(&gcgrp_lock)); - ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)); ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0); ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0); @@ -686,21 +665,21 @@ cipso_to_sl(const uchar_t *option, bslabel_t *sl) } /* - * If present, parse a CIPSO label in the incoming packet and - * construct a ts_label_t that reflects the CIPSO label and attach it - * to the dblk cred. Later as the mblk flows up through the stack any + * If present, parse the CIPSO label in the incoming packet and + * construct a ts_label_t that reflects the CIPSO label and put it in + * the ip_recv_attr_t. 
Later as the packet flows up through the stack any * code that needs to examine the packet label can inspect the label - * from the dblk cred. This function is called right in ip_rput for - * all packets, i.e. locally destined and to be forwarded packets. The - * forwarding path needs to examine the label to determine how to - * forward the packet. + * from the ira_tsl. This function is + * called right in ip_input for all packets, i.e. locally destined and + * to be forwarded packets. The forwarding path needs to examine the label + * to determine how to forward the packet. * * This routine pulls all message text up into the first mblk. * For IPv4, only the first 20 bytes of the IP header are guaranteed * to exist. For IPv6, only the IPv6 header is guaranteed to exist. */ boolean_t -tsol_get_pkt_label(mblk_t *mp, int version) +tsol_get_pkt_label(mblk_t *mp, int version, ip_recv_attr_t *ira) { tsol_tpc_t *src_rhtp = NULL; uchar_t *opt_ptr = NULL; @@ -713,7 +692,6 @@ tsol_get_pkt_label(mblk_t *mp, int version) const void *src; const ip6_t *ip6h; cred_t *credp; - pid_t cpid; int proto; ASSERT(DB_TYPE(mp) == M_DATA); @@ -846,28 +824,37 @@ tsol_get_pkt_label(mblk_t *mp, int version) return (B_FALSE); } - /* Make sure no other thread is messing with this mblk */ - ASSERT(DB_REF(mp) == 1); - /* Preserve db_cpid */ - credp = msg_extractcred(mp, &cpid); - if (credp == NULL) { + if (ira->ira_cred == NULL) { credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP); + if (credp == NULL) + return (B_FALSE); } else { cred_t *newcr; - newcr = copycred_from_bslabel(credp, &sl, doi, + newcr = copycred_from_bslabel(ira->ira_cred, &sl, doi, KM_NOSLEEP); - crfree(credp); + if (newcr == NULL) + return (B_FALSE); + if (ira->ira_free_flags & IRA_FREE_CRED) { + crfree(ira->ira_cred); + ira->ira_free_flags &= ~IRA_FREE_CRED; + ira->ira_cred = NULL; + } credp = newcr; } - if (credp == NULL) - return (B_FALSE); - crgetlabel(credp)->tsl_flags |= label_flags; - - mblk_setcred(mp, credp, cpid); - 
crfree(credp); /* mblk has ref on cred */ + /* + * Put the label in ira_tsl for convinience, while keeping + * the cred in ira_cred for getpeerucred which is used to get + * labels with TX. + * Note: no explicit refcnt/free_flag for ira_tsl. The free_flag + * for IRA_FREE_CRED is sufficient for both. + */ + ira->ira_tsl = crgetlabel(credp); + ira->ira_cred = credp; + ira->ira_free_flags |= IRA_FREE_CRED; + ira->ira_tsl->tsl_flags |= label_flags; return (B_TRUE); } @@ -878,25 +865,25 @@ tsol_get_pkt_label(mblk_t *mp, int version) */ boolean_t tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, - boolean_t shared_addr, const conn_t *connp) + ip_recv_attr_t *ira, const conn_t *connp) { const cred_t *credp; ts_label_t *plabel, *conn_plabel; tsol_tpc_t *tp; boolean_t retv; const bslabel_t *label, *conn_label; + boolean_t shared_addr = (ira->ira_flags & IRAF_TX_SHARED_ADDR); /* - * The cases in which this can happen are: - * - IPv6 Router Alert, where ip_rput_data_v6 deliberately skips - * over the label attachment process. - * - MLD output looped-back to ourselves. - * - IPv4 Router Discovery, where tsol_get_pkt_label intentionally - * avoids the labeling process. - * We trust that all valid paths in the code set the cred pointer when - * needed. + * tsol_get_pkt_label intentionally avoids the labeling process for: + * - IPv6 router and neighbor discovery as well as redirects. + * - MLD packets. (Anything between ICMPv6 code 130 and 138.) + * - IGMP packets. + * - IPv4 router discovery. + * In those cases ire_cred is NULL. */ - if ((credp = msg_getcred(mp, NULL)) == NULL) + credp = ira->ira_cred; + if (credp == NULL) return (B_TRUE); /* @@ -904,17 +891,18 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, * same zoneid as the selected destination, then no checks are * necessary. Membership in the zone is enough proof. This is * intended to be a hot path through this function. 
+ * Note: Using crgetzone here is ok since the peer is local. */ if (!crisremote(credp) && crgetzone(credp) == crgetzone(connp->conn_cred)) return (B_TRUE); - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; conn_plabel = crgetlabel(connp->conn_cred); ASSERT(plabel != NULL && conn_plabel != NULL); label = label2bslabel(plabel); - conn_label = label2bslabel(crgetlabel(connp->conn_cred)); + conn_label = label2bslabel(conn_plabel); /* @@ -954,12 +942,8 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, blequal(label, conn_label)) return (B_TRUE); - /* - * conn_zoneid is global for an exclusive stack, thus we use - * conn_cred to get the zoneid - */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - (crgetzoneid(connp->conn_cred) != GLOBAL_ZONEID && + (!connp->conn_zone_is_global && (plabel->tsl_doi != conn_plabel->tsl_doi || !bldominates(conn_label, label)))) { DTRACE_PROBE3( @@ -1046,16 +1030,13 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, } boolean_t -tsol_can_accept_raw(mblk_t *mp, boolean_t check_host) +tsol_can_accept_raw(mblk_t *mp, ip_recv_attr_t *ira, boolean_t check_host) { ts_label_t *plabel = NULL; tsol_tpc_t *src_rhtp, *dst_rhtp; boolean_t retv; - cred_t *credp; - credp = msg_getcred(mp, NULL); - if (credp != NULL) - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; /* We are bootstrapping or the internal template was never deleted */ if (plabel == NULL) @@ -1144,7 +1125,7 @@ tsol_can_accept_raw(mblk_t *mp, boolean_t check_host) * TSLF_UNLABELED flag is sufficient. */ boolean_t -tsol_can_reply_error(const mblk_t *mp) +tsol_can_reply_error(const mblk_t *mp, ip_recv_attr_t *ira) { ts_label_t *plabel = NULL; tsol_tpc_t *rhtp; @@ -1152,7 +1133,6 @@ tsol_can_reply_error(const mblk_t *mp) const ip6_t *ip6h; boolean_t retv; bslabel_t *pktbs; - cred_t *credp; /* Caller must pull up at least the IP header */ ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ? 
@@ -1161,9 +1141,7 @@ tsol_can_reply_error(const mblk_t *mp) if (!tsol_strict_error) return (B_TRUE); - credp = msg_getcred(mp, NULL); - if (credp != NULL) - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; /* We are bootstrapping or the internal template was never deleted */ if (plabel == NULL) @@ -1227,33 +1205,30 @@ tsol_can_reply_error(const mblk_t *mp) } /* - * Finds the zone associated with the given packet. Returns GLOBAL_ZONEID if - * the zone cannot be located. + * Finds the zone associated with the receive attributes. Returns GLOBAL_ZONEID + * if the zone cannot be located. * * This is used by the classifier when the packet matches an ALL_ZONES IRE, and * there's no MLP defined. * * Note that we assume that this is only invoked in the ALL_ZONES case. - * Handling other cases would require handle exclusive stack zones where either + * Handling other cases would require handling exclusive IP zones where either * this routine or the callers would have to map from * the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc. 
*/ zoneid_t -tsol_packet_to_zoneid(const mblk_t *mp) +tsol_attr_to_zoneid(const ip_recv_attr_t *ira) { - cred_t *cr = msg_getcred(mp, NULL); zone_t *zone; ts_label_t *label; - if (cr != NULL) { - if ((label = crgetlabel(cr)) != NULL) { - zone = zone_find_by_label(label); - if (zone != NULL) { - zoneid_t zoneid = zone->zone_id; + if ((label = ira->ira_tsl) != NULL) { + zone = zone_find_by_label(label); + if (zone != NULL) { + zoneid_t zoneid = zone->zone_id; - zone_rele(zone); - return (zoneid); - } + zone_rele(zone); + return (zoneid); } } return (GLOBAL_ZONEID); @@ -1273,7 +1248,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) /* Not in Trusted mode or IRE is local/loopback/broadcast/interface */ if (!is_system_labeled() || (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | - IRE_INTERFACE))) + IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE))) goto done; /* @@ -1304,29 +1279,16 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) mutex_enter(&attrp->igsa_lock); /* - * Depending on the IRE type (prefix vs. cache), we seek the group + * We seek the group * structure which contains all security credentials of the gateway. - * A prefix IRE is associated with at most one gateway credential, - * while a cache IRE is associated with every credentials that the - * gateway has. + * An offline IRE is associated with at most one gateway credential. 
*/ - if ((gc = attrp->igsa_gc) != NULL) { /* prefix */ + if ((gc = attrp->igsa_gc) != NULL) { gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { /* cache */ - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - if (gc == NULL) { - /* gc group is empty, so the drop lock now */ - ASSERT(gcgrp->gcgrp_count == 0); - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - } - - if (gcgrp != NULL) GCGRP_REFHOLD(gcgrp); + } if ((gw_rhc = attrp->igsa_rhc) != NULL) { /* @@ -1354,12 +1316,11 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) ASSERT(ga->ga_af == AF_INET6); paddr = &ga->ga_addr; } - } else if (ire->ire_ipversion == IPV6_VERSION && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - paddr = &ire->ire_gateway_addr_v6; - } else if (ire->ire_ipversion == IPV4_VERSION && - ire->ire_gateway_addr != INADDR_ANY) { - paddr = &ire->ire_gateway_addr; + } else if (ire->ire_type & IRE_OFFLINK) { + if (ire->ire_ipversion == IPV6_VERSION) + paddr = &ire->ire_gateway_addr_v6; + else if (ire->ire_ipversion == IPV4_VERSION) + paddr = &ire->ire_gateway_addr; } /* We've found a gateway address to do the template lookup */ @@ -1408,6 +1369,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) } if (gc != NULL) { + tsol_gcdb_t *gcdb; /* * In the case of IRE_CACHE we've got one or more gateway @@ -1418,18 +1380,9 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) * just the route itself, so the loop is executed only once. 
*/ ASSERT(gcgrp != NULL); - do { - gcdb = gc->gc_db; - if (tsl->tsl_doi == gcdb->gcdb_doi && - _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) - break; - if (ire->ire_type == IRE_CACHE) - gc = gc->gc_next; - else - gc = NULL; - } while (gc != NULL); - - if (gc == NULL) { + gcdb = gc->gc_db; + if (tsl->tsl_doi != gcdb->gcdb_doi || + !_blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) { DTRACE_PROBE3( tx__ip__log__drop__irematch__nogcmatched, char *, "ire(1), tsl(2): all gc failed match", @@ -1493,12 +1446,13 @@ done: /* * Performs label accreditation checks for packet forwarding. + * Add or remove a CIPSO option as needed. * * Returns a pointer to the modified mblk if allowed for forwarding, * or NULL if the packet must be dropped. */ mblk_t * -tsol_ip_forward(ire_t *ire, mblk_t *mp) +tsol_ip_forward(ire_t *ire, mblk_t *mp, const ip_recv_attr_t *ira) { tsol_ire_gw_secattr_t *attrp = NULL; ipha_t *ipha; @@ -1516,11 +1470,14 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) boolean_t need_tpc_rele = B_FALSE; ipaddr_t *gw; ip_stack_t *ipst = ire->ire_ipst; - cred_t *credp; - pid_t pid; + int err; + ts_label_t *effective_tsl = NULL; ASSERT(ire != NULL && mp != NULL); - ASSERT(ire->ire_stq != NULL); + /* + * Note that the ire is the first one found, i.e., an IRE_OFFLINK if + * the destination is offlink. + */ af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; @@ -1530,16 +1487,6 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) psrc = &ipha->ipha_src; pdst = &ipha->ipha_dst; proto = ipha->ipha_protocol; - - /* - * off_link is TRUE if destination not directly reachable. - * Surya note: we avoid creation of per-dst IRE_CACHE entries - * for forwarded packets, so we set off_link to be TRUE - * if the packet dst is different from the ire_addr of - * the ire for the nexthop. 
- */ - off_link = ((ipha->ipha_dst != ire->ire_addr) || - (ire->ire_gateway_addr != INADDR_ANY)); if (!tsol_get_option_v4(mp, &label_type, &opt_ptr)) return (NULL); } else { @@ -1561,14 +1508,15 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) } proto = *nexthdrp; } - - /* destination not directly reachable? */ - off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); if (!tsol_get_option_v6(mp, &label_type, &opt_ptr)) return (NULL); } + /* + * off_link is TRUE if destination not directly reachable. + */ + off_link = (ire->ire_type & IRE_OFFLINK); - if ((tsl = msg_getlabel(mp)) == NULL) + if ((tsl = ira->ira_tsl) == NULL) return (mp); if (tsl->tsl_flags & TSLF_IMPLICIT_IN) { @@ -1611,11 +1559,7 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) attrp = ire->ire_gw_secattr; gw_rhtp = attrp->igsa_rhc->rhc_tpc; } else { - /* - * use the ire_addr if this is the IRE_CACHE of nexthop - */ - gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr : - &ire->ire_gateway_addr); + gw = &ire->ire_gateway_addr; gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE); need_tpc_rele = B_TRUE; } @@ -1702,7 +1646,13 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) /* adjust is negative */ ASSERT((mp->b_wptr + adjust) >= mp->b_rptr); mp->b_wptr += adjust; - + /* + * Note that caller adjusts ira_pktlen and + * ira_ip_hdr_length + * + * For AF_INET6 note that tsol_remove_secopt_v6 + * adjusted ip6_plen. + */ if (af == AF_INET) { ipha = (ipha_t *)mp->b_rptr; iplen = ntohs(ipha->ipha_length) + adjust; @@ -1729,17 +1679,34 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED)) goto keep_label; - - credp = msg_getcred(mp, &pid); - if ((af == AF_INET && - tsol_check_label(credp, &mp, CONN_MAC_DEFAULT, ipst, pid) != 0) || - (af == AF_INET6 && - tsol_check_label_v6(credp, &mp, CONN_MAC_DEFAULT, ipst, - pid) != 0)) { + /* + * Since we are forwarding packets we use GLOBAL_ZONEID for + * the IRE lookup in tsol_check_label. 
+ * Since mac_exempt is false the zoneid isn't used for anything + * but the IRE lookup, hence we set zone_is_global to false. + */ + if (af == AF_INET) { + err = tsol_check_label_v4(tsl, GLOBAL_ZONEID, &mp, + CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl); + } else { + err = tsol_check_label_v6(tsl, GLOBAL_ZONEID, &mp, + CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl); + } + if (err != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tsol_check_label", mp, NULL); + freemsg(mp); mp = NULL; goto keep_label; } + /* + * The effective_tsl must never affect the routing decision, hence + * we ignore it here. + */ + if (effective_tsl != NULL) + label_rele(effective_tsl); + if (af == AF_INET) { ipha = (ipha_t *)mp->b_rptr; ipha->ipha_hdr_checksum = 0; @@ -1885,13 +1852,13 @@ tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp) } int -tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp) +tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc) { tsol_ire_gw_secattr_t *attrp; boolean_t exists = B_FALSE; in_addr_t ga_addr4; void *paddr = NULL; + tsol_gcgrp_t *gcgrp = NULL; ASSERT(ire != NULL); @@ -1917,20 +1884,16 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, if (attrp->igsa_gc != NULL) GC_REFRELE(attrp->igsa_gc); - if (attrp->igsa_gcgrp != NULL) - GCGRP_REFRELE(attrp->igsa_gcgrp); } ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock)); /* * References already held by caller and we keep them; - * note that both gc and gcgrp may be set to NULL to - * clear out igsa_gc and igsa_gcgrp, respectively. + * note that gc may be set to NULL to clear out igsa_gc. 
*/ attrp->igsa_gc = gc; - attrp->igsa_gcgrp = gcgrp; - if (gcgrp == NULL && gc != NULL) { + if (gc != NULL) { gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); } @@ -1955,12 +1918,11 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, ASSERT(ga->ga_af == AF_INET6); paddr = &ga->ga_addr; } - } else if (ipversion == IPV6_VERSION && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - paddr = &ire->ire_gateway_addr_v6; - } else if (ipversion == IPV4_VERSION && - ire->ire_gateway_addr != INADDR_ANY) { - paddr = &ire->ire_gateway_addr; + } else if (ire->ire_type & IRE_OFFLINK) { + if (ipversion == IPV6_VERSION) + paddr = &ire->ire_gateway_addr_v6; + else if (ipversion == IPV4_VERSION) + paddr = &ire->ire_gateway_addr; } /* @@ -1990,7 +1952,7 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, * If we can't figure out what it is, then return mlptSingle. That's actually * an error case. * - * The callers are assume to pass in zone->zone_id and not the zoneid that + * The callers are assumed to pass in zone->zone_id and not the zoneid that * is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an * exclusive stack zone). 
*/ @@ -2022,23 +1984,28 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr, version = IPV4_VERSION; } + /* Check whether the IRE_LOCAL (or ipif) is ALL_ZONES */ if (version == IPV4_VERSION) { in4 = *(const in_addr_t *)addr; if ((in4 == INADDR_ANY) || CLASSD(in4)) { return (mlptBoth); } - ire = ire_cache_lookup(in4, ip_zoneid, NULL, ipst); + ire = ire_ftable_lookup_v4(in4, 0, 0, IRE_LOCAL|IRE_LOOPBACK, + NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, + 0, ipst, NULL); } else { if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr) || IN6_IS_ADDR_MULTICAST((const in6_addr_t *)addr)) { return (mlptBoth); } - ire = ire_cache_lookup_v6(addr, ip_zoneid, NULL, ipst); + ire = ire_ftable_lookup_v6(addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK, + NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, + 0, ipst, NULL); } /* * If we can't find the IRE, then we have to behave exactly like - * ip_bind_laddr{,_v6}. That means looking up the IPIF so that users - * can bind to addresses on "down" interfaces. + * ip_laddr_verify_{v4,v6}. That means looking up the IPIF so that + * users can bind to addresses on "down" interfaces. * * If we can't find that either, then the bind is going to fail, so * just give up. 
Note that there's a miniscule chance that the address @@ -2047,10 +2014,10 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr, if (ire == NULL) { if (version == IPV4_VERSION) ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL, - ip_zoneid, NULL, NULL, NULL, NULL, ipst); + ip_zoneid, ipst); else ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr, - NULL, ip_zoneid, NULL, NULL, NULL, NULL, ipst); + NULL, ip_zoneid, ipst); if (ipif == NULL) { return (mlptSingle); } diff --git a/usr/src/uts/common/inet/ip2mac_impl.h b/usr/src/uts/common/inet/ip2mac_impl.h index 19d0931441..9a09e14487 100644 --- a/usr/src/uts/common/inet/ip2mac_impl.h +++ b/usr/src/uts/common/inet/ip2mac_impl.h @@ -37,10 +37,10 @@ extern "C" { #ifdef _KERNEL -extern void nce_cb_dispatch(nce_t *); -extern void nce_ip2mac_response(ip2mac_t *, nce_t *); -extern void nce_cb_refhold_locked(nce_t *); -extern void nce_cb_refrele(nce_t *); +extern void ncec_cb_dispatch(ncec_t *); +extern void ncec_ip2mac_response(ip2mac_t *, ncec_t *); +extern void ncec_cb_refhold_locked(ncec_t *); +extern void ncec_cb_refrele(ncec_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index 5408ab9e55..10c6c81ba2 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -57,105 +57,12 @@ typedef enum { IP6_SCOPE_GLOBAL } in6addr_scope_t; -#ifdef _KERNEL +/* From RFC 3542 - setting for IPV6_USE_MIN_MTU socket option */ +#define IPV6_USE_MIN_MTU_MULTICAST -1 /* Default */ +#define IPV6_USE_MIN_MTU_NEVER 0 +#define IPV6_USE_MIN_MTU_ALWAYS 1 -/* - * Private header used between the transports and IP to carry the content - * of the options IPV6_PKTINFO/IPV6_RECVPKTINFO (the interface index only) - * and IPV6_NEXTHOP. - * Also used to specify that raw sockets do not want the UDP/TCP transport - * checksums calculated in IP (akin to IP_HDR_INCLUDED) and provide for - * IPV6_CHECKSUM on the transmit side (using ip6i_checksum_off). 
- * - * When this header is used it must be the first header in the packet i.e. - * before the real ip6 header. The use of a next header value of 255 - * (IPPROTO_RAW) in this header indicates its presence. Note that - * ip6_nxt = IPPROTO_RAW indicates that "this" header is ip6_info - the - * next header is always IPv6. - * - * Note that ip6i_nexthop is at the same offset as ip6_dst so that - * this header can be kept in the packet while the it passes through - * ip_newroute* and the ndp code. Those routines will use ip6_dst for - * resolution. - * - * Implementation offset assumptions about ip6_info_t and ip6_t fields - * and their alignments shown in figure below - * - * ip6_info (Private headers from transports to IP) header below - * _______________________________________________________________ _ _ _ _ _ - * | .... | ip6i_nxt (255)| ......................|ip6i_nexthop| ...ip6_t. - * --------------------------------------------------------------- - - - - - - * ^ ^ - * <---- >| same offset for {ip6i_nxt,ip6_nxt} ^ - * ^ ^ - * <------^-------------------------------------->| same offset for - * ^ ^ {ip6i_nxthop,ip6_dst} - * _______________________________________________________________ _ _ _ - * | .... | ip6_nxt | ......................|ip6_dst | .other hdrs... 
- * --------------------------------------------------------------- - - - - * ip6_t (IPv6 protocol) header above - */ -struct ip6_info { - union { - struct ip6_info_ctl { - uint32_t ip6i_un1_flow; - uint16_t ip6i_un1_plen; /* payload length */ - uint8_t ip6i_un1_nxt; /* next header */ - uint8_t ip6i_un1_hlim; /* hop limit */ - } ip6i_un1; - } ip6i_ctlun; - int ip6i_flags; /* See below */ - int ip6i_ifindex; - int ip6i_checksum_off; - int ip6i_pad; - in6_addr_t ip6i_nexthop; /* Same offset as ip6_dst */ -}; -typedef struct ip6_info ip6i_t; - -#define ip6i_flow ip6i_ctlun.ip6i_un1.ip6i_un1_flow -#define ip6i_vcf ip6i_flow /* Version, class, flow */ -#define ip6i_nxt ip6i_ctlun.ip6i_un1.ip6i_un1_nxt -#define ip6i_hops ip6i_ctlun.ip6i_un1.ip6i_un1_hlim - -/* ip6_info flags */ -#define IP6I_IFINDEX 0x1 /* ip6i_ifindex is set (to nonzero value) */ -#define IP6I_NEXTHOP 0x2 /* ip6i_nexthop is different than ip6_dst */ -#define IP6I_NO_ULP_CKSUM 0x4 - /* - * Do not generate TCP/UDP/SCTP transport checksum. - * Used by raw sockets. Does not affect the - * generation of transport checksums for ICMPv6 - * since such packets always arrive through - * a raw socket. - */ -#define IP6I_UNSPEC_SRC 0x8 - /* Used to carry conn_unspec_src through ip_newroute* */ -#define IP6I_RAW_CHECKSUM 0x10 - /* Compute checksum and stuff in ip6i_checksum_off */ -#define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */ -#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */ - /* 0x80 - 0x100 available */ -#define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */ -#define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */ - -/* - * These constants refer to the IPV6_USE_MIN_MTU API. The - * actually values used in the API are these values shifted down - * 10 bits minus 2 [-1, 1]. 0 (-2 after conversion) is considered - * the same as the default (-1). IP6I_API_USE_MIN_MTU(f, x) returns - * the flags field updated with min mtu. 
IP6I_USE_MIN_MTU_API takes the - * field and returns the API value (+ the -2 value). - */ -#define IP6I_USE_MIN_MTU_UNICAST 0x400 -#define IP6I_USE_MIN_MTU_ALWAYS 0x800 -#define IP6I_USE_MIN_MTU_NEVER 0xC00 -#define IP6I_USE_MIN_MTU_API(x) ((((x) & 0xC00) >> 10) - 2) -#define IP6I_API_USE_MIN_MTU(f, x) (((f) & ~0xC00) &\ - ((((x) + 2) & 0x3) << 11)) -#define IPV6_USE_MIN_MTU_DEFAULT -2 -#define IPV6_USE_MIN_MTU_UNICAST -1 -#define IPV6_USE_MIN_MTU_ALWAYS 0 -#define IPV6_USE_MIN_MTU_NEVER 1 +#ifdef _KERNEL /* Extract the scope from a multicast address */ #ifdef _BIG_ENDIAN @@ -195,28 +102,18 @@ typedef struct ip6_info ip6i_t; #define MIN_EHDR_LEN 8 #define MAX_EHDR_LEN 2048 -/* - * The high-order bit of the version field is used by the transports to - * indicate a reachability confirmation to IP. - */ -#define IP_FORWARD_PROG_BIT 0x8 - #ifdef _BIG_ENDIAN #define IPV6_DEFAULT_VERS_AND_FLOW 0x60000000 #define IPV6_VERS_AND_FLOW_MASK 0xF0000000 -#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 28) #define V6_MCAST 0xFF000000 #define V6_LINKLOCAL 0xFE800000 #define IPV6_FLOW_TCLASS(x) (((x) & IPV6_FLOWINFO_TCLASS) >> 20) #define IPV6_TCLASS_FLOW(f, c) (((f) & ~IPV6_FLOWINFO_TCLASS) |\ ((c) << 20)) - #else #define IPV6_DEFAULT_VERS_AND_FLOW 0x00000060 #define IPV6_VERS_AND_FLOW_MASK 0x000000F0 -#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 4) - #define V6_MCAST 0x000000FF #define V6_LINKLOCAL 0x000080FE @@ -328,71 +225,66 @@ extern const in6_addr_t ipv6_unspecified_group; * FUNCTION PROTOTYPES */ -struct ipsec_out_s; - extern void convert2ascii(char *buf, const in6_addr_t *addr); extern char *inet_ntop(int, const void *, char *, int); extern int inet_pton(int, char *, void *); -extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t, - boolean_t, boolean_t, zoneid_t, ip_stack_t *); -extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t, - boolean_t, boolean_t, zoneid_t, ip_stack_t *); -extern void icmp_inbound_error_fanout_v6(queue_t *, 
mblk_t *, ip6_t *, - icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t); -extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t); -extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *); +extern void icmp_param_problem_nexthdr_v6(mblk_t *, boolean_t, + ip_recv_attr_t *); +extern void icmp_pkt2big_v6(mblk_t *, uint32_t, boolean_t, + ip_recv_attr_t *); +extern void icmp_time_exceeded_v6(mblk_t *, uint8_t, boolean_t, + ip_recv_attr_t *); +extern void icmp_unreachable_v6(mblk_t *, uint8_t, boolean_t, + ip_recv_attr_t *); +extern mblk_t *icmp_inbound_v6(mblk_t *, ip_recv_attr_t *); +extern void icmp_inbound_error_fanout_v6(mblk_t *, icmp6_t *, + ip_recv_attr_t *); +extern void icmp_update_out_mib_v6(ill_t *, icmp6_t *); + +extern boolean_t conn_wantpacket_v6(conn_t *, ip_recv_attr_t *, ip6_t *); + extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *); -extern mblk_t *ip_bind_v6(queue_t *, mblk_t *, conn_t *, ip6_pkt_t *); -extern void ip_build_hdrs_v6(uchar_t *, uint_t, ip6_pkt_t *, uint8_t); -extern int ip_fanout_send_icmp_v6(queue_t *, mblk_t *, uint_t, - uint_t, uint8_t, uint_t, boolean_t, zoneid_t, ip_stack_t *); -extern int ip_find_hdr_v6(mblk_t *, ip6_t *, ip6_pkt_t *, uint8_t *); -extern in6_addr_t ip_get_dst_v6(ip6_t *, mblk_t *, boolean_t *); +extern void ip_build_hdrs_v6(uchar_t *, uint_t, const ip_pkt_t *, uint8_t, + uint32_t); +extern void ip_fanout_udp_multi_v6(mblk_t *, ip6_t *, uint16_t, uint16_t, + ip_recv_attr_t *); +extern void ip_fanout_send_icmp_v6(mblk_t *, uint_t, uint8_t, + ip_recv_attr_t *); +extern void ip_fanout_proto_v6(mblk_t *, ip6_t *, ip_recv_attr_t *); +extern int ip_find_hdr_v6(mblk_t *, ip6_t *, boolean_t, ip_pkt_t *, + uint8_t *); +extern in6_addr_t ip_get_dst_v6(ip6_t *, const mblk_t *, boolean_t *); extern ip6_rthdr_t *ip_find_rthdr_v6(ip6_t *, uint8_t *); -extern int ip_hdr_complete_v6(ip6_t *, zoneid_t, ip_stack_t *); extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *, uint16_t *, 
uint8_t **); extern int ip_hdr_length_v6(mblk_t *, ip6_t *); -extern int ip_check_v6_mblk(mblk_t *, ill_t *); extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *, netstack_t *); -extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int); -extern void ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ire_t *); -extern int ip_total_hdrs_len_v6(ip6_pkt_t *); +extern void ip_forward_xmit_v6(nce_t *, mblk_t *, ip6_t *, ip_recv_attr_t *, + uint32_t, uint32_t); +extern mblk_t *ip_fraghdr_add_v6(mblk_t *, uint32_t, ip_xmit_attr_t *); +extern int ip_fragment_v6(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn, + uintptr_t *ixa_cookie); +extern int ip_process_options_v6(mblk_t *, ip6_t *, + uint8_t *, uint_t, uint8_t, ip_recv_attr_t *); +extern void ip_process_rthdr(mblk_t *, ip6_t *, ip6_rthdr_t *, + ip_recv_attr_t *); +extern int ip_total_hdrs_len_v6(const ip_pkt_t *); +extern mblk_t *ipsec_early_ah_v6(mblk_t *, ip_recv_attr_t *); extern int ipsec_ah_get_hdr_size_v6(mblk_t *, boolean_t); -extern void ip_wput_v6(queue_t *, mblk_t *); -extern void ip_wput_local_v6(queue_t *, ill_t *, ip6_t *, mblk_t *, - ire_t *, int, zoneid_t); -extern void ip_output_v6(void *, mblk_t *, void *, int); -extern void ip_xmit_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, - struct ipsec_out_s *); +extern void ip_send_potential_redirect_v6(mblk_t *, ip6_t *, ire_t *, + ip_recv_attr_t *); extern void ip_rput_v6(queue_t *, mblk_t *); -extern void ip_rput_data_v6(queue_t *, ill_t *, mblk_t *, ip6_t *, - uint_t, mblk_t *, mblk_t *); -extern void mld_input(queue_t *, mblk_t *, ill_t *); +extern mblk_t *mld_input(mblk_t *, ip_recv_attr_t *); extern void mld_joingroup(ilm_t *); extern void mld_leavegroup(ilm_t *); extern void mld_timeout_handler(void *); extern void pr_addr_dbg(char *, int, const void *); -extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, - const in6_addr_t *, int, 
mcast_record_t, const in6_addr_t *, mblk_t *), - ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, - const in6_addr_t *, mblk_t *); -extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, - const in6_addr_t *, const in6_addr_t *, int, zoneid_t); -extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, - const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *); extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *); extern void ip6_kstat_fini(netstackid_t, kstat_t *); -extern size_t ip6_get_src_preferences(conn_t *, uint32_t *); -extern int ip6_set_src_preferences(conn_t *, uint32_t); -extern int ip6_set_pktinfo(cred_t *, conn_t *, struct in6_pktinfo *); -extern int ip_proto_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, - const in6_addr_t *, uint16_t, boolean_t); -extern int ip_proto_bind_connected_v6(conn_t *, mblk_t **, - uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *, - uint16_t, boolean_t, boolean_t, cred_t *); +extern size_t ip6_get_src_preferences(ip_xmit_attr_t *, uint32_t *); +extern int ip6_set_src_preferences(ip_xmit_attr_t *, uint32_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_arp.h b/usr/src/uts/common/inet/ip_arp.h new file mode 100644 index 0000000000..2cb7e7a05a --- /dev/null +++ b/usr/src/uts/common/inet/ip_arp.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IP_ARP_H +#define _IP_ARP_H + +/* + * Data-structures and functions related to the IP STREAMS queue that handles + * packets with the SAP set to 0x806 (ETHERTYPE_ARP). + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <inet/ip.h> +#include <inet/ip_ndp.h> +#include <sys/stream.h> + +#ifdef _KERNEL +extern struct streamtab dummymodinfo; + +struct arl_ill_common_s; +/* + * The arl_s structure tracks the state of the associated ARP stream. + */ +typedef struct arl_s { + queue_t *arl_rq; + queue_t *arl_wq; + ip_stack_t *arl_ipst; + zoneid_t arl_zoneid; + cred_t *arl_credp; + ip_m_t arl_media; + struct arl_ill_common_s *arl_common; + int arl_muxid; + uint_t arl_ppa; + t_uscalar_t arl_sap; + t_uscalar_t arl_sap_length; + uint_t arl_phys_addr_length; + char *arl_name; + int arl_name_length; + t_uscalar_t arl_mactype; +#define arl_first_mp_to_free arl_dlpi_deferred + mblk_t *arl_dlpi_deferred; + mblk_t *arl_unbind_mp; + mblk_t *arl_detach_mp; +#define arl_last_mp_to_free arl_detach_mp + uint_t arl_state_flags; + uint_t + arl_needs_attach:1, + arl_dlpi_style_set:1, + arl_pad_to_bit_31:30; + uint_t arl_refcnt; + kcondvar_t arl_cv; + t_uscalar_t arl_dlpi_pending; + kmutex_t arl_lock; + int arl_error; +} arl_t; + +/* + * The arl_ill_common_t structure is a super-structure that contains pointers + * to a pair of matching ill_t, arl_t structures. Given an arl_t (or + * ill_t) the corresponding ill_t (or arl_t) must be obtained by + * synchronizing on the ai_lock, and ensuring that the desired ill/arl + * pointer is non-null, not condemned. 
The arl_ill_common_t is allocated in + * arl_init() and freed only when both the ill_t and the arl_t structures + * become NULL. + * Lock hierarchy: the ai_lock must be take before the ill_lock or arl_lock. + */ + +typedef struct arl_ill_common_s { + kmutex_t ai_lock; + ill_t *ai_ill; + arl_t *ai_arl; + kcondvar_t ai_ill_unplumb_done; /* sent from ip_modclose() */ +} arl_ill_common_t; + +extern boolean_t arp_no_defense; + +extern struct module_info arp_mod_info; +extern int arp_ll_up(ill_t *); +extern int arp_ll_down(ill_t *); +extern boolean_t arp_announce(ncec_t *); +extern boolean_t arp_probe(ncec_t *); +extern int arp_request(ncec_t *, in_addr_t, ill_t *); +extern void arp_failure(mblk_t *, ip_recv_attr_t *); +extern int arl_wait_for_info_ack(arl_t *); +extern int arl_init(queue_t *, arl_t *); +extern void arl_set_muxid(ill_t *, int); +extern int arl_get_muxid(ill_t *); +extern void arp_send_replumb_conf(ill_t *); +extern void arp_unbind_complete(ill_t *); +extern ill_t *arl_to_ill(arl_t *); +#endif + +#define ARP_RETRANS_TIMER 500 /* time in milliseconds */ + +/* The following are arl_state_flags */ +#define ARL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */ +#define ARL_CONDEMNED 0x02 /* No more new ref's to the ILL */ +#define ARL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */ +#define ARL_LL_BIND_PENDING 0x0020 /* BIND sent */ +#define ARL_LL_UP 0x0040 /* BIND acked */ +#define ARL_LL_DOWN 0x0080 +#define ARL_LL_UNBOUND 0x0100 /* UNBIND acked */ +#define ARL_LL_REPLUMBING 0x0200 /* replumb in progress */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IP_ARP_H */ diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h index 6a3a05183b..d8fa9e566d 100644 --- a/usr/src/uts/common/inet/ip_ftable.h +++ b/usr/src/uts/common/inet/ip_ftable.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -56,7 +56,7 @@ struct rt_entry { * * The comment below (and for other netstack_t references) refers * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's + * such as the references from open endpoints (ill_t and conn_t's * pointers). Internally within IP we rely on IP's ability to cleanup e.g. * ire_t's when an ill goes away. */ @@ -74,26 +74,8 @@ int rtfunc(struct radix_node *, void *); typedef struct rt_entry rt_t; typedef struct rtfuncarg rtf_t; -/* For ire_forward() */ -enum ire_forward_action { - Forward_ok, /* OK to use this IRE to forward */ - Forward_check_multirt, /* CGTP multirt check required */ - Forward_ret_icmp_err, /* Callers to return an ICMP error */ - Forward_blackhole /* Packet is silently discarded */ -}; - struct ts_label_s; -extern ire_t *ire_ftable_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int, - const ipif_t *, ire_t **, zoneid_t, uint32_t, - const struct ts_label_s *, int, ip_stack_t *); -extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *); -extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t); extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *); -extern ire_t *ire_ihandle_lookup_onlink(ire_t *); -extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *, - ire_t *, const struct ts_label_s *, ip_stack_t *); -extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *, - ip_stack_t *); extern irb_t *ire_get_bucket(ire_t *); extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t); extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index 80a9f74e88..d081d9256b 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -80,12 +80,12 @@ extern "C" { #define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \ 
IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \ - IFF_XRESOLV|IFF_COS_ENABLED) + IFF_COS_ENABLED|IFF_FIXEDMTU) #define IFF_LOGINT_FLAGS (IFF_UP|IFF_BROADCAST|IFF_POINTOPOINT| \ IFF_UNNUMBERED|IFF_DHCPRUNNING|IFF_PRIVATE|IFF_NOXMIT|IFF_NOLOCAL| \ IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \ - IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) + IFF_PREFERRED|IFF_TEMPORARY|IFF_DUPLICATE) #define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */ #define PHYI_RUNNING IFF_RUNNING /* resources allocated */ @@ -109,8 +109,8 @@ extern "C" { #define ILLF_NORTEXCH IFF_NORTEXCH /* No routing info exchange */ #define ILLF_IPV4 IFF_IPV4 /* IPv4 interface */ #define ILLF_IPV6 IFF_IPV6 /* IPv6 interface */ -#define ILLF_XRESOLV IFF_XRESOLV /* IPv6 external resolver */ #define ILLF_COS_ENABLED IFF_COS_ENABLED /* Is CoS marking supported */ +#define ILLF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_UP IFF_UP /* interface is up */ #define IPIF_BROADCAST IFF_BROADCAST /* broadcast address valid */ @@ -126,7 +126,6 @@ extern "C" { #define IPIF_NOFAILOVER IFF_NOFAILOVER /* No failover on NIC failure */ #define IPIF_PREFERRED IFF_PREFERRED /* Prefer as source address */ #define IPIF_TEMPORARY IFF_TEMPORARY /* RFC3041 */ -#define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ #ifdef DEBUG @@ -135,6 +134,12 @@ extern "C" { #define ILL_MAC_PERIM_HELD(ill) #endif +/* + * match flags for ipif_lookup_addr_common* functions + */ +#define IPIF_MATCH_ILLGRP 0x00000001 +#define IPIF_MATCH_NONDUP 0x00000002 + /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ @@ -143,134 +148,144 @@ enum ip_resolver_action { Res_act_none /* do nothing */ }; -extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t); -extern mblk_t *ipif_area_alloc(ipif_t *, uint_t); -extern mblk_t *ipif_ared_alloc(ipif_t *); -extern mblk_t 
*ill_ared_alloc(ill_t *, ipaddr_t); -extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *); -extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern int ill_add_ires(ill_t *); +extern void ill_delete_ires(ill_t *); extern void ill_dlpi_done(ill_t *, t_uscalar_t); +extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern void ill_dlpi_dispatch(ill_t *, mblk_t *); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); +extern void ill_dlpi_queue(ill_t *, mblk_t *); +extern void ill_dlpi_send_queued(ill_t *); +extern void ill_mcast_queue(ill_t *, mblk_t *); +extern void ill_mcast_send_queued(ill_t *); +extern void ill_mcast_timer_start(ip_stack_t *); extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); /* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */ -extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t, - queue_t *, mblk_t *, ipsq_func_t, int *); -extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, queue_t *, mblk_t *, - ipsq_func_t, int *, ip_stack_t *); -extern ill_t *ill_lookup_on_name(char *, boolean_t, - boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *, boolean_t *, +extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t); +extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *); +extern ill_t *ill_lookup_on_ifindex_zoneid(uint_t, zoneid_t, boolean_t, ip_stack_t *); +extern ill_t *ill_lookup_on_name(char *, boolean_t, + boolean_t, boolean_t *, ip_stack_t *); +extern boolean_t ip_ifindex_valid(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *); -extern void ill_grp_cache_delete(ire_t *, char *); -extern void ill_ipif_cache_delete(ire_t *, char *); -extern void ill_stq_cache_delete(ire_t *, char *); +extern uint_t ill_get_upper_ifindex(const ill_t *); extern void 
ill_delete(ill_t *); extern void ill_delete_tail(ill_t *); extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *); -extern int ill_dls_info(struct sockaddr_dl *, const ipif_t *); +extern int ill_dls_info(struct sockaddr_dl *, const ill_t *); extern void ill_fastpath_ack(ill_t *, mblk_t *); -extern void ill_fastpath_nack(ill_t *); extern int ill_fastpath_probe(ill_t *, mblk_t *); -extern void ill_fastpath_flush(ill_t *); extern int ill_forward_set(ill_t *, boolean_t); extern void ill_frag_prune(ill_t *, uint_t); extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int); extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); -extern void ill_refresh_bcast(ill_t *); extern void ill_restart_dad(ill_t *, boolean_t); extern void ill_setdefaulttoken(ill_t *); extern void ill_setdesttoken(ill_t *); +extern void ill_set_inputfn(ill_t *); +extern void ill_set_inputfn_all(ip_stack_t *); extern int ill_set_phys_addr(ill_t *, mblk_t *); extern int ill_replumb(ill_t *, mblk_t *); extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t); -extern mblk_t *ill_pending_mp_get(ill_t *, conn_t **, uint_t); -extern boolean_t ill_pending_mp_add(ill_t *, conn_t *, mblk_t *); extern boolean_t ill_is_freeable(ill_t *ill); extern void ill_refhold(ill_t *); extern void ill_refhold_locked(ill_t *); -extern int ill_check_and_refhold(ill_t *); +extern boolean_t ill_check_and_refhold(ill_t *); extern void ill_refrele(ill_t *); extern boolean_t ill_waiter_inc(ill_t *); extern void ill_waiter_dcr(ill_t *); extern void ill_trace_ref(ill_t *); extern void ill_untrace_ref(ill_t *); +extern void ill_downi(ire_t *, char *); +extern void ill_downi_if_clone(ire_t *, char *); extern boolean_t ill_down_start(queue_t *, mblk_t *); +extern ill_t *ill_lookup_group_v4(ipaddr_t, zoneid_t, + ip_stack_t *, boolean_t *, ipaddr_t *); extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); + ip_stack_t *, boolean_t *, in6_addr_t *); extern 
void ill_capability_ack(ill_t *, mblk_t *); extern void ill_capability_probe(ill_t *); extern void ill_capability_reset(ill_t *, boolean_t); extern void ill_taskq_dispatch(ip_stack_t *); -extern void ill_mtu_change(ire_t *, char *); +extern void ill_get_name(const ill_t *, char *, int); +extern void ill_group_cleanup(ill_t *); extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); +extern void ip_update_source_selection(ip_stack_t *); extern uint_t ill_appaddr_cnt(const ill_t *); extern uint_t ill_ptpaddr_cnt(const ill_t *); +extern uint_t ill_admupaddr_cnt(const ill_t *); + +extern ill_t *ill_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); +extern void ill_save_ire(ill_t *, ire_t *); +extern void ill_remove_saved_ire(ill_t *, ire_t *); +extern int ill_recover_saved_ire(ill_t *); extern void ip_interface_cleanup(ip_stack_t *); extern void ipif_get_name(const ipif_t *, char *, int); extern ipif_t *ipif_getby_indexes(uint_t, uint_t, boolean_t, ip_stack_t *); extern void ipif_init(ip_stack_t *); -extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, queue_t *, - mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern boolean_t ip_addr_exists(ipaddr_t, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_exact(ipaddr_t, ill_t *, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_nondup(ipaddr_t, ill_t *, zoneid_t, + ip_stack_t *); extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t, - queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_nondup_v6(const in6_addr_t *, ill_t *, + zoneid_t, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); 
-extern ipif_t *ipif_lookup_group(ipaddr_t, zoneid_t, ip_stack_t *); -extern ipif_t *ipif_lookup_group_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); -extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t, - queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); +extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t, ip_stack_t *); extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t); -extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *); -extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t); -extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **); -extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t); -extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t); +extern boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, + ipif_t **); +extern boolean_t ipif_lookup_testaddr_v4(ill_t *, const in_addr_t *, + ipif_t **); +extern ipif_t *ipif_select_source_v4(ill_t *, ipaddr_t, zoneid_t, boolean_t, + boolean_t *); +extern boolean_t ipif_zone_avail(uint_t, boolean_t, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_good_addr(ill_t *, zoneid_t); +extern int ip_select_source_v4(ill_t *, ipaddr_t, ipaddr_t, ipaddr_t, + zoneid_t, ip_stack_t *, ipaddr_t *, uint32_t *, uint64_t *); extern void ipif_refhold(ipif_t *); extern void ipif_refhold_locked(ipif_t *); extern void ipif_refrele(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); -extern void ipif_resolver_down(ipif_t *); extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); -extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); -extern void ipif_down_tail(ipif_t *); +extern int ipif_down_tail(ipif_t *); extern void ipif_multicast_down(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); extern int ipif_ndp_up(ipif_t *, boolean_t); -extern int 
ipif_ndp_setup_multicast(ipif_t *, struct nce_s **); extern int ipif_up_done(ipif_t *); extern int ipif_up_done_v6(ipif_t *); extern void ipif_up_notify(ipif_t *); -extern void ipif_update_other_ipifs_v6(ipif_t *); -extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *); -extern void ill_update_source_selection(ill_t *); extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t, - uint32_t, zoneid_t); + uint32_t, zoneid_t, boolean_t, boolean_t *); +extern int ip_select_source_v6(ill_t *, const in6_addr_t *, + const in6_addr_t *, zoneid_t, ip_stack_t *, uint_t, uint32_t, in6_addr_t *, + uint32_t *, uint64_t *); extern boolean_t ipif_cant_setlinklocal(ipif_t *); extern void ipif_setlinklocal(ipif_t *); extern void ipif_setdestlinklocal(ipif_t *); -extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, - mblk_t *, ipsq_func_t, int *, ip_stack_t *); +extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, + ip_stack_t *); extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); +extern void ipif_nce_down(ipif_t *ipif); +extern int ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); extern int ipif_up(ipif_t *, queue_t *, mblk_t *); @@ -290,7 +305,7 @@ extern void qwriter_ip(ill_t *, queue_t *, mblk_t *, ipsq_func_t, int, boolean_t); typedef int ip_extract_func_t(queue_t *, mblk_t *, const ip_ioctl_cmd_t *, - cmd_info_t *, ipsq_func_t); + cmd_info_t *); extern ip_extract_func_t ip_extract_arpreq, ip_extract_lifreq; @@ -298,16 +313,14 @@ extern int ip_addr_availability_check(ipif_t *); extern void ip_ll_subnet_defaults(ill_t *, mblk_t *); extern int ip_rt_add(ipaddr_t, ipaddr_t, ipaddr_t, ipaddr_t, int, - ipif_t *, ire_t **, boolean_t, queue_t *, mblk_t *, ipsq_func_t, - struct rtsa_s *, ip_stack_t *); + ill_t *, ire_t **, boolean_t, struct rtsa_s *, ip_stack_t *, zoneid_t); extern int ip_rt_add_v6(const in6_addr_t *, const in6_addr_t *, - 
const in6_addr_t *, const in6_addr_t *, int, ipif_t *, ire_t **, - queue_t *, mblk_t *, ipsq_func_t, struct rtsa_s *, ip_stack_t *ipst); + const in6_addr_t *, const in6_addr_t *, int, ill_t *, ire_t **, + struct rtsa_s *, ip_stack_t *, zoneid_t); extern int ip_rt_delete(ipaddr_t, ipaddr_t, ipaddr_t, uint_t, int, - ipif_t *, boolean_t, queue_t *, mblk_t *, ipsq_func_t, ip_stack_t *); + ill_t *, boolean_t, ip_stack_t *, zoneid_t); extern int ip_rt_delete_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, uint_t, int, ipif_t *, queue_t *, mblk_t *, - ipsq_func_t, ip_stack_t *); + const in6_addr_t *, uint_t, int, ill_t *, ip_stack_t *, zoneid_t); extern int ip_siocdelndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_siocqueryndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *, @@ -454,11 +467,12 @@ extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *, extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *); -extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *); extern ip_ioctl_cmd_t *ip_sioctl_lookup(int); - -extern void conn_delete_ire(conn_t *, caddr_t); -extern boolean_t phyint_exists(uint_t, ip_stack_t *); +extern void ipif_delete_ires_v4(ipif_t *); +extern void ipif_delete_ires_v6(ipif_t *); +extern int ipif_arp_up(ipif_t *, enum ip_resolver_action, boolean_t); +extern void ipif_dup_recovery(void *); +extern void ipif_do_recovery(ipif_t *); /* * Notes on reference tracing on ill, ipif, ire, nce data structures: diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 5f9d674e17..694f7a63b0 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -50,10 +50,12 @@ extern "C" { #define IP_HDR_CSUM_TTL_ADJUST 256 #define IP_TCP_CSUM_COMP IPPROTO_TCP #define IP_UDP_CSUM_COMP IPPROTO_UDP +#define IP_ICMPV6_CSUM_COMP IPPROTO_ICMPV6 #else #define 
IP_HDR_CSUM_TTL_ADJUST 1 #define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) #define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8) +#define IP_ICMPV6_CSUM_COMP (IPPROTO_ICMPV6 << 8) #endif #define TCP_CHECKSUM_OFFSET 16 @@ -62,240 +64,20 @@ extern "C" { #define UDP_CHECKSUM_OFFSET 6 #define UDP_CHECKSUM_SIZE 2 +#define ICMPV6_CHECKSUM_OFFSET 2 +#define ICMPV6_CHECKSUM_SIZE 2 + #define IPH_TCPH_CHECKSUMP(ipha, hlen) \ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET))) #define IPH_UDPH_CHECKSUMP(ipha, hlen) \ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET))) +#define IPH_ICMPV6_CHECKSUMP(ipha, hlen) \ + ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + ICMPV6_CHECKSUM_OFFSET))) + #define ILL_HCKSUM_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0) -/* - * Macro that performs software checksum calculation on the IP header. - */ -#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ - (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ - ((v_hlen_tos_len) >> 16) + \ - ((v_hlen_tos_len) & 0xFFFF) + \ - (ipha)->ipha_fragment_offset_and_flags; \ - (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ - (sum) = ~((sum) + ((sum) >> 16)); \ - (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ -} - -#define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \ - ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \ - ILL_HCKSUM_CAPABLE(ill) && dohwcksum) - -/* - * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs - * several checks on the IRE and ILL (among other things) in order to see - * whether or not hardware checksum offload is allowed for the outgoing - * packet. It assumes that the caller has held a reference to the IRE. - */ -#define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \ - max_frag, ipsec_len, pseudo) { \ - uint32_t _hck_flags; \ - /* \ - * We offload checksum calculation to hardware when IPsec isn't \ - * present and if fragmentation isn't required. 
We also check \ - * if M_DATA fastpath is safe to be used on the corresponding \ - * IRE; this check is performed without grabbing ire_lock but \ - * instead by holding a reference to it. This is sufficient \ - * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \ - * DL_NOTE_FASTPATH_FLUSH indication could come up from the \ - * driver and trigger the IRE (hence fp_mp) deletion. This is \ - * why only IRE_CACHE type is eligible for offload. \ - * \ - * The presense of IP options also forces the network stack to \ - * calculate the checksum in software. This is because: \ - * \ - * Wrap around: certain partial-checksum NICs (eri, ce) limit \ - * the size of "start offset" width to 6-bit. This effectively \ - * sets the largest value of the offset to 64-bytes, starting \ - * from the MAC header. When the cumulative MAC and IP headers \ - * exceed such limit, the offset will wrap around. This causes \ - * the checksum to be calculated at the wrong place. \ - * \ - * IPv4 source routing: none of the full-checksum capable NICs \ - * is capable of correctly handling the IPv4 source-routing \ - * option for purposes of calculating the pseudo-header; the \ - * actual destination is different from the destination in the \ - * header which is that of the next-hop. (This case may not be \ - * true for NICs which can parse IPv6 extension headers, but \ - * we choose to simplify the implementation by not offloading \ - * checksum when they are present.) 
\ - * \ - */ \ - if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \ - !((ire)->ire_flags & RTF_MULTIRT) && \ - (!((ire)->ire_type & IRE_BROADCAST) || \ - (ill)->ill_type == IFT_ETHER) && \ - (ipsec_len) == 0 && \ - (((ire)->ire_ipversion == IPV4_VERSION && \ - (start) == IP_SIMPLE_HDR_LENGTH && \ - ((ire)->ire_nce != NULL && \ - (ire)->ire_nce->nce_fp_mp != NULL && \ - MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) || \ - ((ire)->ire_ipversion == IPV6_VERSION && \ - (start) == IPV6_HDR_LEN && \ - (ire)->ire_nce->nce_fp_mp != NULL && \ - MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \ - (max_frag) >= (uint_t)((end) + (ipsec_len)) && \ - dohwcksum) { \ - _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \ - } else { \ - _hck_flags = 0; \ - } \ - IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \ - up, proto, start, end, pseudo); \ -} - -/* - * Based on the device capabilities, this macro either marks an outgoing - * packet with hardware checksum offload information or calculate the - * checksum in software. If the latter is performed, the checksum field - * of the dblk is cleared; otherwise it will be non-zero and contain the - * necessary flag(s) for the driver. - */ -#define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \ - end, pseudo) { \ - uint32_t _sum; \ - /* \ - * Underlying interface supports hardware checksum offload for \ - * the payload; leave the payload checksum for the hardware to \ - * calculate. N.B: We only need to set up checksum info on the \ - * first mblk. \ - */ \ - DB_CKSUMFLAGS(mp) = 0; \ - if (((ipver) == IPV4_VERSION && \ - ((hck_flags) & HCKSUM_INET_FULL_V4)) || \ - ((ipver) == IPV6_VERSION && \ - ((hck_flags) & HCKSUM_INET_FULL_V6))) { \ - /* \ - * Hardware calculates pseudo-header, header and the \ - * payload checksums, so clear the checksum field in \ - * the protocol header. 
\ - */ \ - *(up) = 0; \ - DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \ - } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \ - /* \ - * Partial checksum offload has been enabled. Fill \ - * the checksum field in the protocl header with the \ - * pseudo-header checksum value. \ - */ \ - _sum = ((proto) == IPPROTO_UDP) ? \ - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ - _sum += *(up) + (pseudo); \ - _sum = (_sum & 0xFFFF) + (_sum >> 16); \ - *(up) = (_sum & 0xFFFF) + (_sum >> 16); \ - /* \ - * Offsets are relative to beginning of IP header. \ - */ \ - DB_CKSUMSTART(mp) = (start); \ - DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \ - (start) + UDP_CHECKSUM_OFFSET : \ - (start) + TCP_CHECKSUM_OFFSET; \ - DB_CKSUMEND(mp) = (end); \ - DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \ - } else { \ - /* \ - * Software checksumming. \ - */ \ - _sum = ((proto) == IPPROTO_UDP) ? \ - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ - _sum += (pseudo); \ - _sum = IP_CSUM(mp, start, _sum); \ - *(up) = (uint16_t)(((proto) == IPPROTO_UDP) ? \ - (_sum ? _sum : ~_sum) : _sum); \ - } \ - /* \ - * Hardware supports IP header checksum offload; clear the \ - * contents of IP header checksum field as expected by NIC. \ - * Do this only if we offloaded either full or partial sum. \ - */ \ - if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \ - ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \ - DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \ - ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \ - } \ -} - -/* - * Macro to inspect the checksum of a fully-reassembled incoming datagram. - */ -#define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \ - (err) = B_FALSE; \ - if ((hck_flags) & HCK_FULLCKSUM) { \ - /* \ - * The sum of all fragment checksums should \ - * result in -0 (0xFFFF) or otherwise invalid. 
\ - */ \ - if ((sum) != 0xFFFF) \ - (err) = B_TRUE; \ - } else if ((hck_flags) & HCK_PARTIALCKSUM) { \ - (sum) += (pseudo); \ - (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ - (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ - if (~(sum) & 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \ - (err) = B_TRUE; \ - } \ -} - -/* - * This macro inspects an incoming packet to see if the checksum value - * contained in it is valid; if the hardware has provided the information, - * the value is verified, otherwise it performs software checksumming. - * The checksum value is returned to caller. - */ -#define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \ - int32_t _len; \ - \ - (err) = B_FALSE; \ - if ((hck_flags) & HCK_FULLCKSUM) { \ - /* \ - * Full checksum has been computed by the hardware \ - * and has been attached. If the driver wants us to \ - * verify the correctness of the attached value, in \ - * order to protect against faulty hardware, compare \ - * it against -0 (0xFFFF) to see if it's valid. \ - */ \ - (sum) = DB_CKSUM16(mp); \ - if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((hck_flags) & HCK_PARTIALCKSUM) && \ - ((mp1) == NULL || (mp1)->b_cont == NULL) && \ - (ulph_off) >= DB_CKSUMSTART(mp) && \ - ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \ - uint32_t _adj; \ - /* \ - * Partial checksum has been calculated by hardware \ - * and attached to the packet; in addition, any \ - * prepended extraneous data is even byte aligned, \ - * and there are at most two mblks associated with \ - * the packet. If any such data exists, we adjust \ - * the checksum; also take care any postpended data. 
\ - */ \ - IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \ - /* \ - * One's complement subtract extraneous checksum \ - */ \ - (sum) += DB_CKSUM16(mp); \ - if (_adj >= (sum)) \ - (sum) = ~(_adj - (sum)) & 0xFFFF; \ - else \ - (sum) -= _adj; \ - (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ - (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ - if (~(sum) & 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \ - (err) = B_TRUE; \ - } \ -} /* * Macro to adjust a given checksum value depending on any prepended @@ -338,98 +120,37 @@ extern "C" { } \ } -#define ILL_MDT_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0) - -/* - * ioctl identifier and structure for Multidata Transmit update - * private M_CTL communication from IP to ULP. - */ -#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020) - -typedef struct ip_mdt_info_s { - uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */ - ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */ -} ip_mdt_info_t; +#define IS_SIMPLE_IPH(ipha) \ + ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) /* - * Macro that determines whether or not a given ILL is allowed for MDT. + * Currently supported flags for LSO. */ -#define ILL_MDT_USABLE(ill) \ - (ILL_MDT_CAPABLE(ill) && \ - ill->ill_mdt_capab != NULL && \ - ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \ - ill->ill_mdt_capab->ill_mdt_on != 0) +#define LSO_BASIC_TCP_IPV4 DLD_LSO_BASIC_TCP_IPV4 +#define LSO_BASIC_TCP_IPV6 DLD_LSO_BASIC_TCP_IPV6 -#define ILL_LSO_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) +#define ILL_LSO_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0) -/* - * ioctl identifier and structure for Large Segment Offload - * private M_CTL communication from IP to ULP. 
- */ -#define LSO_IOC_INFO_UPDATE (('L' << 24) + ('S' << 16) + ('O' << 8)) - -typedef struct ip_lso_info_s { - uint_t lso_info_id; /* LSO_IOC_INFO_UPDATE */ - ill_lso_capab_t lso_capab; /* ILL LSO capabilities */ -} ip_lso_info_t; - -/* - * Macro that determines whether or not a given ILL is allowed for LSO. - */ #define ILL_LSO_USABLE(ill) \ (ILL_LSO_CAPABLE(ill) && \ - ill->ill_lso_capab != NULL && \ - ill->ill_lso_capab->ill_lso_on != 0) + ill->ill_lso_capab != NULL) -#define ILL_LSO_TCP_USABLE(ill) \ +#define ILL_LSO_TCP_IPV4_USABLE(ill) \ (ILL_LSO_USABLE(ill) && \ - ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4) + ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV4) -/* - * Macro that determines whether or not a given CONN may be considered - * for fast path prior to proceeding further with LSO or Multidata. - */ -#define CONN_IS_LSO_MD_FASTPATH(connp) \ - ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ - !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ - (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ - -/* Definitions for fragmenting IP packets using MDT. */ - -/* - * Smaller and private version of pdescinfo_t used specifically for IP, - * which allows for only a single payload span per packet. - */ -typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; +#define ILL_LSO_TCP_IPV6_USABLE(ill) \ + (ILL_LSO_USABLE(ill) && \ + ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV6) -/* - * Macro version of ip_can_frag_mdt() which avoids the function call if we - * only examine a single message block. - */ -#define IP_CAN_FRAG_MDT(mp, hdr_len, len) \ - (((mp)->b_cont == NULL) ? \ - (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \ - ip_can_frag_mdt((mp), (hdr_len), (len))) +#define ILL_ZCOPY_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_ZEROCOPY) != 0) -/* - * Macro that determines whether or not a given IPC requires - * outbound IPSEC processing. 
- */ -#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \ - ((connp)->conn_out_enforce_policy || \ - ((connp)->conn_latch != NULL && \ - (connp)->conn_latch->ipl_out_policy != NULL)) +#define ILL_ZCOPY_USABLE(ill) \ + (ILL_ZCOPY_CAPABLE(ill) && (ill->ill_zerocopy_capab != NULL) && \ + (ill->ill_zerocopy_capab->ill_zerocopy_flags != 0)) -/* - * Macro that checks whether or not a particular UDP conn is - * flow-controlling on the read-side. - * - * Note that this check is done after the conn is found in - * the UDP fanout table. - */ -#define CONN_UDP_FLOWCTLD(connp) !canputnext((connp)->conn_rq) /* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */ #define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */ @@ -437,74 +158,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \ - if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ - ill_dld_direct_t *idd; \ - uintptr_t cookie; \ - conn_t *udp_connp = (conn_t *)connp; \ - \ - idd = &(ill)->ill_dld_capab->idc_direct; \ - /* \ - * Send the packet directly to DLD, where it \ - * may be queued depending on the availability \ - * of transmit resources at the media layer. \ - * Ignore the returned value for the time being \ - * In future, we may want to take this into \ - * account and flow control the TCP. \ - */ \ - cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \ - (uintptr_t)(hint), flag); \ - \ - /* \ - * non-NULL cookie indicates flow control situation \ - * and the cookie itself identifies this specific \ - * Tx ring that is blocked. This cookie is used to \ - * block the UDP conn that is sending packets over \ - * this specific Tx ring. \ - */ \ - if ((cookie != NULL) && (udp_connp != NULL) && \ - (udp_connp->conn_ulp == IPPROTO_UDP)) { \ - idl_tx_list_t *idl_txl; \ - ip_stack_t *ipst; \ - \ - /* \ - * Flow controlled. 
\ - */ \ - DTRACE_PROBE2(ill__send__tx__cookie, \ - uintptr_t, cookie, conn_t *, udp_connp); \ - ipst = udp_connp->conn_netstack->netstack_ip; \ - idl_txl = \ - &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\ - mutex_enter(&idl_txl->txl_lock); \ - if (udp_connp->conn_direct_blocked || \ - (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \ - cookie) == 0)) { \ - DTRACE_PROBE1(ill__tx__not__blocked, \ - boolean, \ - udp_connp->conn_direct_blocked); \ - } else if (idl_txl->txl_cookie != NULL && \ - idl_txl->txl_cookie != cookie) { \ - udp_t *udp = udp_connp->conn_udp; \ - udp_stack_t *us = udp->udp_us; \ - \ - DTRACE_PROBE2(ill__send__tx__collision, \ - uintptr_t, cookie, \ - uintptr_t, idl_txl->txl_cookie); \ - UDP_STAT(us, udp_cookie_coll); \ - } else { \ - udp_connp->conn_direct_blocked = B_TRUE;\ - idl_txl->txl_cookie = cookie; \ - conn_drain_insert(udp_connp, idl_txl); \ - DTRACE_PROBE1(ill__send__tx__insert, \ - conn_t *, udp_connp); \ - } \ - mutex_exit(&idl_txl->txl_lock); \ - } \ - } else { \ - putnext((ire)->ire_stq, mp); \ - } \ -} - +/* This macro is used by the mac layer */ #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr)) @@ -520,13 +174,11 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; netstackid_to_zoneid((ipst)->ips_netstack->netstack_stackid) : \ (zoneid)) -extern int ip_wput_frag_mdt_min; -extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); -extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *); extern void ill_flow_enable(void *, ip_mac_tx_cookie_t); -extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t); +extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_recv_attr_t *, + zoneid_t); extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, - ip_stack_t *, zoneid_t); + ip_recv_attr_t *, zoneid_t); /* * flag passed in by IP based protocols to get a 
private ip stream with @@ -542,8 +194,6 @@ extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, #define DEV_IP "/devices/pseudo/ip@0:ip" #define DEV_IP6 "/devices/pseudo/ip6@0:ip6" -extern struct kmem_cache *ip_helper_stream_cache; - #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index f4882f7640..d4dfd9c97e 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -68,106 +68,26 @@ extern "C" { ((addr).s6_addr8[14] & (mask).s6_addr8[14]) ^ \ ((addr).s6_addr8[15] & (mask).s6_addr8[15])) & ((table_size) - 1)) +#define IRE_HIDDEN_TYPE(ire_type) ((ire_type) & \ + (IRE_HOST | IRE_PREFIX | IRE_DEFAULT | IRE_IF_ALL | IRE_BROADCAST)) + /* * match parameter definitions for IRE lookup routines. */ #define MATCH_IRE_DSTONLY 0x0000 /* Match just the address */ #define MATCH_IRE_TYPE 0x0001 /* Match IRE type */ -#define MATCH_IRE_SRC 0x0002 /* Match IRE source address */ -#define MATCH_IRE_MASK 0x0004 /* Match IRE mask */ -#define MATCH_IRE_WQ 0x0008 /* Match IRE ire_stq to write queue */ -#define MATCH_IRE_GW 0x0010 /* Match IRE gateway */ -#define MATCH_IRE_IPIF 0x0020 /* Match IRE ipif */ -#define MATCH_IRE_RECURSIVE 0x0040 /* Do recursive lookup if necessary */ -#define MATCH_IRE_DEFAULT 0x0080 /* Return default route if no route */ - /* found. */ -#define MATCH_IRE_RJ_BHOLE 0x0100 /* During lookup if we hit an ire */ - /* with RTF_REJECT or RTF_BLACKHOLE, */ - /* return the ire. No recursive */ - /* lookup should be done. */ -#define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */ -#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */ - -/* - * MATCH_IRE_PARENT is used whenever we unconditionally want to get the - * parent IRE (sire) while recursively searching IREs for an offsubnet - * destination. 
With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE - * is found to help resolving IRE_OFFSUBNET in lookup routines, the - * IRE_OFFSUBNET sire, if any, is returned to the caller. - */ -/* UNUSED 0x0800 */ -#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */ - -#define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */ - /* even if ire is not matched. */ -#define MATCH_IRE_ZONEONLY 0x4000 /* Match IREs in specified zone, ie */ +#define MATCH_IRE_MASK 0x0002 /* Match IRE mask */ +#define MATCH_IRE_SHORTERMASK 0x0004 /* A mask shorter than the argument */ +#define MATCH_IRE_GW 0x0008 /* Match IRE gateway */ +#define MATCH_IRE_ILL 0x0010 /* Match IRE on the ill */ +#define MATCH_IRE_ZONEONLY 0x0020 /* Match IREs in specified zone, ie */ /* don't match IRE_LOCALs from other */ /* zones or shared IREs */ -#define MATCH_IRE_MARK_PRIVATE_ADDR 0x8000 /* Match IRE ire_marks with */ - /* IRE_MARK_PRIVATE_ADDR. */ -#define MATCH_IRE_SECATTR 0x10000 /* Match gateway security attributes */ -#define MATCH_IRE_COMPLETE 0x20000 /* ire_ftable_lookup() can return */ - /* IRE_CACHE entry only if it is */ - /* ND_REACHABLE */ +#define MATCH_IRE_SECATTR 0x0040 /* Match gateway security attributes */ +#define MATCH_IRE_TESTHIDDEN 0x0080 /* Match ire_testhidden IREs */ -/* - * Any ire to nce association is long term, and - * the refhold and refrele may be done by different - * threads. So all cases of making or breaking ire to - * nce association should all effectively use the NOTR variants. - * To understand the *effectively* part read on. - * - * ndp_lookup() and ndp_add_v4()/ndp_add_v6() implicitly do - * NCE_REFHOLD. So wherever we make ire to nce association after - * calling these functions, we effectively want to end up with - * NCE_REFHOLD_NOTR. We call this macro to achieve this effect. This - * macro changes a NCE_REFHOLD to a NCE_REFHOLD_NOTR. 
The macro's - * NCE_REFRELE cancels off ndp_lookup[ndp_add]'s implicit NCE_REFHOLD, - * and what you are left with is a NCE_REFHOLD_NOTR - */ -#define NCE_REFHOLD_TO_REFHOLD_NOTR(nce) { \ - NCE_REFHOLD_NOTR(nce); \ - NCE_REFRELE(nce); \ -} - -/* - * find the next ire_t entry in the ire_next chain starting at ire - * that is not CONDEMNED. ire is set to NULL if we reach the end of the list. - * Caller must hold the ire_bucket lock. - */ +#define MAX_IRE_RECURSION 4 /* Max IREs in ire_route_recursive */ -#define IRE_FIND_NEXT_ORIGIN(ire) { \ - while ((ire) != NULL && ((ire)->ire_marks & IRE_MARK_CONDEMNED))\ - (ire) = (ire)->ire_next; \ -} - - -/* Structure for ire_cache_count() */ -typedef struct { - int icc_total; /* Total number of IRE_CACHE */ - int icc_unused; /* # off/no PMTU unused since last reclaim */ - int icc_offlink; /* # offlink without PMTU information */ - int icc_pmtu; /* # offlink with PMTU information */ - int icc_onlink; /* # onlink */ -} ire_cache_count_t; - -/* - * Structure for ire_cache_reclaim(). Each field is a fraction i.e. 1 meaning - * reclaim all, N meaning reclaim 1/Nth of all entries, 0 meaning reclaim none. - * - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. - */ -typedef struct { - int icr_unused; /* Fraction for unused since last reclaim */ - int icr_offlink; /* Fraction for offlink without PMTU info */ - int icr_pmtu; /* Fraction for offlink with PMTU info */ - int icr_onlink; /* Fraction for onlink */ - ip_stack_t *icr_ipst; /* Does not have a netstack_hold */ -} ire_cache_reclaim_t; /* * We use atomics so that we get an accurate accounting on the ires. 
@@ -176,180 +96,250 @@ typedef struct { #define BUMP_IRE_STATS(ire_stats, x) atomic_add_64(&(ire_stats).x, 1) #ifdef _KERNEL -/* - * Structure for passing args for the IRE cache lookup functions. - */ -typedef struct ire_ctable_args_s { - void *ict_addr; - void *ict_gateway; - int ict_type; - const ipif_t *ict_ipif; - zoneid_t ict_zoneid; - const ts_label_t *ict_tsl; - int ict_flags; - ip_stack_t *ict_ipst; - queue_t *ict_wq; -} ire_ctable_args_t; - struct ts_label_s; struct nce_s; +/* + * structure for passing args between ire_ftable_lookup and ire_find_best_route + */ +typedef struct ire_ftable_args_s { + in6_addr_t ift_addr_v6; + in6_addr_t ift_mask_v6; + in6_addr_t ift_gateway_v6; +#define ift_addr V4_PART_OF_V6(ift_addr_v6) +#define ift_mask V4_PART_OF_V6(ift_mask_v6) +#define ift_gateway V4_PART_OF_V6(ift_gateway_v6) + int ift_type; + const ill_t *ift_ill; + zoneid_t ift_zoneid; + const ts_label_t *ift_tsl; + int ift_flags; + ire_t *ift_best_ire; +} ire_ftable_args_t; extern ipaddr_t ip_plen_to_mask(uint_t); extern in6_addr_t *ip_plen_to_mask_v6(uint_t, in6_addr_t *); extern int ip_ire_advise(queue_t *, mblk_t *, cred_t *); extern int ip_ire_delete(queue_t *, mblk_t *, cred_t *); -extern boolean_t ip_ire_clookup_and_delete(ipaddr_t, ipif_t *, ip_stack_t *); -extern void ip_ire_clookup_and_delete_v6(const in6_addr_t *, - ip_stack_t *); - -extern void ip_ire_req(queue_t *, mblk_t *); +extern void ip_ire_reclaim(void *); extern int ip_mask_to_plen(ipaddr_t); extern int ip_mask_to_plen_v6(const in6_addr_t *); -extern ire_t *ipif_to_ire(const ipif_t *); -extern ire_t *ipif_to_ire_v6(const ipif_t *); - -extern int ire_add(ire_t **, queue_t *, mblk_t *, ipsq_func_t, boolean_t); -extern void ire_add_then_send(queue_t *, ire_t *, mblk_t *); -extern int ire_add_v6(ire_t **, queue_t *, mblk_t *, ipsq_func_t); -extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, - mblk_t *mp, ipsq_func_t func); +extern ire_t *ire_add(ire_t *); +extern ire_t 
*ire_add_v6(ire_t *); +extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire); extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire); -extern void ire_cache_count(ire_t *, char *); -extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t, - const struct ts_label_s *, ip_stack_t *); -extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *); -extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t, - const struct ts_label_s *, ip_stack_t *); -extern void ire_cache_reclaim(ire_t *, char *); - -extern ire_t *ire_create_mp(uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, ipaddr_t, - uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, - ip_stack_t *); -extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, - ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, - tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t **ire_check_and_create_bcast(ipif_t *, ipaddr_t, - ire_t **, int); -extern ire_t **ire_create_bcast(ipif_t *, ipaddr_t, ire_t **); -extern ire_t *ire_init(ire_t *, uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, - ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, - tsol_gcgrp_t *, ip_stack_t *); - -extern boolean_t ire_init_common(ire_t *, uint_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, uint32_t, uint32_t, uint32_t, uchar_t, - const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, - const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *, - tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t *ire_create_mp_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, 
const in6_addr_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, - const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *, - tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - +extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *, + ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); -extern void ire_clookup_delete_cache_gw(ipaddr_t, zoneid_t, - ip_stack_t *); -extern void ire_clookup_delete_cache_gw_v6(const in6_addr_t *, zoneid_t, +extern ire_t **ire_create_bcast(ill_t *, ipaddr_t, zoneid_t, ire_t **); +extern ire_t *ire_create_if_clone(ire_t *, const in6_addr_t *, uint_t *); +extern ire_t *ire_lookup_bcast(ill_t *, ipaddr_t, zoneid_t); +extern int ire_init_v4(ire_t *, uchar_t *, uchar_t *, uchar_t *, + ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); +extern int ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); -extern ire_t *ire_ctable_lookup(ipaddr_t, ipaddr_t, int, const ipif_t *, - zoneid_t, const struct ts_label_s *, int, ip_stack_t *); +extern int ire_init_common(ire_t *, ushort_t, ill_t *, zoneid_t, uint_t, + uchar_t, tsol_gc_t *, ip_stack_t *); -extern ire_t *ire_ctable_lookup_v6(const in6_addr_t *, const in6_addr_t *, - int, const ipif_t *, zoneid_t, const struct ts_label_s *, int, - ip_stack_t *); +extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t, + tsol_gc_t *, ip_stack_t *); extern void ire_delete(ire_t *); -extern void ire_delete_cache_gw(ire_t *, char *); -extern void ire_delete_cache_gw_v6(ire_t *, char *); -extern void ire_delete_cache_v6(ire_t *, char *); extern void ire_delete_v6(ire_t *); -extern void ire_expire(ire_t *, char *); +/* + * ire_pref used to make sure we don't set up routing loops in the ire_dep + * chain. 
+ */ +extern int ire_pref(ire_t *); +extern boolean_t ire_dep_build(ire_t *[], uint_t [], uint_t); +extern void ire_dep_delete_if_clone(ire_t *); +extern void ire_dep_incr_generation(ire_t *); +extern void ire_dep_remove(ire_t *); +extern void ire_dep_unbuild(ire_t *[], uint_t); +extern uint_t ire_dep_validate_generations(ire_t *); +extern void ire_dep_invalidate_generations(ire_t *); +extern boolean_t ire_determine_nce_capable(ire_t *); extern void ire_flush_cache_v4(ire_t *, int); extern void ire_flush_cache_v6(ire_t *, int); +extern ire_t *ire_ftable_lookup_v4(ipaddr_t, ipaddr_t, ipaddr_t, int, + const ill_t *, zoneid_t, const struct ts_label_s *, int, uint32_t, + ip_stack_t *, uint_t *); extern ire_t *ire_ftable_lookup_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t, - uint32_t, const struct ts_label_s *, int, ip_stack_t *); - -extern ire_t *ire_ihandle_lookup_onlink(ire_t *); -extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *); -extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *); - -extern boolean_t ire_local_same_lan(ire_t *, ire_t *); -extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *, - const struct ts_label_s *, ip_stack_t *); - -extern ire_t *ire_lookup_local(zoneid_t, ip_stack_t *); -extern ire_t *ire_lookup_local_v6(zoneid_t, ip_stack_t *); - -extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *); -extern ire_t *ire_lookup_multi_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); - + const in6_addr_t *, int, const ill_t *, zoneid_t, + const struct ts_label_s *, int, uint32_t, ip_stack_t *, uint_t *); + +extern ire_t *ire_ftable_lookup_simple_v4(ipaddr_t, uint32_t, ip_stack_t *, + uint_t *); +extern ire_t *ire_ftable_lookup_simple_v6(const in6_addr_t *, uint32_t, + ip_stack_t *, uint_t *); + +extern boolean_t ire_gateway_ok_zone_v4(ipaddr_t, zoneid_t, ill_t *, + const ts_label_t *, ip_stack_t *, boolean_t); +extern boolean_t 
ire_gateway_ok_zone_v6(const in6_addr_t *, zoneid_t, ill_t *, + const ts_label_t *, ip_stack_t *, boolean_t); + +extern ire_t *ire_alt_local(ire_t *, zoneid_t, const ts_label_t *, + const ill_t *, uint_t *); + +extern ill_t *ire_lookup_multi_ill_v4(ipaddr_t, zoneid_t, ip_stack_t *, + boolean_t *, ipaddr_t *); +extern ill_t *ire_lookup_multi_ill_v6(const in6_addr_t *, zoneid_t, + ip_stack_t *, boolean_t *, in6_addr_t *); + +extern ire_t *ire_nexthop(ire_t *); +extern ill_t *ire_nexthop_ill(ire_t *); +extern ill_t *ire_nce_ill(ire_t *); + +extern ire_t *ire_reject(ip_stack_t *, boolean_t); +extern ire_t *ire_blackhole(ip_stack_t *, boolean_t); +extern ire_t *ire_multicast(ill_t *); + +/* The different ire_recvfn functions */ +extern void ire_recv_forward_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noroute_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_broadcast_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multicast_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multirt_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_loopback_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_local_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noaccept_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); + +extern void ire_recv_forward_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noroute_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multicast_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multirt_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_loopback_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_local_v6(ire_t *, mblk_t *, void *, ip_recv_attr_t *); +extern void ire_recv_noaccept_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); + +extern void irb_refhold(irb_t *); +extern 
void irb_refhold_locked(irb_t *); +extern void irb_refrele(irb_t *); +extern void irb_increment_generation(irb_t *); + +extern void ire_refhold(ire_t *); +extern void ire_refhold_notr(ire_t *); +extern void ire_refhold_locked(ire_t *); extern void ire_refrele(ire_t *); extern void ire_refrele_notr(ire_t *); -extern ire_t *ire_route_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int, - const ipif_t *, ire_t **, zoneid_t, const struct ts_label_s *, int, - ip_stack_t *); - -extern ire_t *ire_route_lookup_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t, - const struct ts_label_s *, int, ip_stack_t *); - -extern ill_t *ire_to_ill(const ire_t *); +extern void ire_make_condemned(ire_t *); +extern boolean_t ire_no_good(ire_t *); +extern nce_t *ire_handle_condemned_nce(nce_t *, ire_t *, ipha_t *, ip6_t *, + boolean_t); + +extern ire_t *ire_round_robin(irb_t *, ire_ftable_args_t *, uint_t, + ire_t *, ip_stack_t *); + +extern ire_t *ire_route_recursive_v4(ipaddr_t, uint_t, const ill_t *, + zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, ip_stack_t *, + ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_v6(const in6_addr_t *, uint_t, + const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, + ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_dstonly_v4(ipaddr_t, boolean_t, + uint32_t, ip_stack_t *); +extern ire_t *ire_route_recursive_dstonly_v6(const in6_addr_t *, boolean_t, + uint32_t, ip_stack_t *); +extern ire_t *ire_route_recursive_impl_v4(ire_t *ire, ipaddr_t, uint_t, + const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, + ip_stack_t *, ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_impl_v6(ire_t *ire, const in6_addr_t *, + uint_t, const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, + uint32_t, ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t 
**, uint_t *); + +/* The different ire_sendfn functions */ +extern int ire_send_local_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multirt_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_noroute_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multicast_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_broadcast_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_wire_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_local_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multirt_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_noroute_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multicast_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_wire_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); + +extern nce_t *ire_to_nce_pkt(ire_t *, mblk_t *); +extern nce_t *ire_to_nce(ire_t *, ipaddr_t, const in6_addr_t *); + +/* Different ire_postfragfn functions */ +extern int ip_xmit(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_loopcheck(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_multirt_v4(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_multirt_v6(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); + +extern void ip_postfrag_loopback(mblk_t *, struct nce_s *, + iaflags_t, uint_t, zoneid_t); +extern int ire_revalidate_nce(ire_t *); + +extern ire_t *ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *, + uint_t *, int *, boolean_t *); +extern ire_t *ip_select_route(const in6_addr_t 
*, ip_xmit_attr_t *, + uint_t *, in6_addr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *, + uint_t *, ipaddr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *, + uint_t *, in6_addr_t *, int *, boolean_t *); extern void ire_walk(pfv_t, void *, ip_stack_t *); extern void ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *); -extern void ire_walk_ill_v4(uint_t, uint_t, pfv_t, void *, ill_t *); -extern void ire_walk_ill_v6(uint_t, uint_t, pfv_t, void *, ill_t *); extern void ire_walk_v4(pfv_t, void *, zoneid_t, ip_stack_t *); extern void ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz, - irb_t **ipftbl, size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, + irb_t **ipftbl, ill_t *ill, zoneid_t zoneid, ip_stack_t *); extern void ire_walk_v6(pfv_t, void *, zoneid_t, ip_stack_t *); -extern boolean_t ire_multirt_lookup(ire_t **, ire_t **, uint32_t, int *, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_need_resolve(ipaddr_t, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_lookup_v6(ire_t **, ire_t **, uint32_t, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_need_resolve_v6(const in6_addr_t *, - const struct ts_label_s *, ip_stack_t *); - -extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t); -extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *); - -extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *); -extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *); - -extern void ire_arpresolve(ire_t *); -extern void ire_freemblk(ire_t *); extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t, - int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int, - queue_t *); -extern int ire_nce_init(ire_t *, struct nce_s *); + int, const ill_t *, zoneid_t, const struct ts_label_s *, int); +extern boolean_t 
ire_match_args_v6(ire_t *, const in6_addr_t *, + const in6_addr_t *, const in6_addr_t *, int, const ill_t *, zoneid_t, + const ts_label_t *, int); + +extern struct nce_s *arp_nce_init(ill_t *, in_addr_t, int); extern boolean_t ire_walk_ill_match(uint_t, uint_t, ire_t *, ill_t *, zoneid_t, ip_stack_t *); -extern ire_t *ire_arpresolve_lookup(ipaddr_t, ipaddr_t, ipif_t *, zoneid_t, - ip_stack_t *, queue_t *); +extern void ire_increment_generation(ire_t *); +extern void ire_increment_multicast_generation(ip_stack_t *, boolean_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h index 7dee133967..c41ef99e3e 100644 --- a/usr/src/uts/common/inet/ip_multi.h +++ b/usr/src/uts/common/inet/ip_multi.h @@ -49,18 +49,9 @@ typedef enum { } ilg_stat_t; /* - * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread(). - */ -typedef enum { - IP_MRT_STOP = 0x1, /* request to stop thread */ - IP_MRT_DONE = 0x2, /* indication that thread is stopped */ - IP_MRT_RUN = 0x4 /* request to restart timers */ -} ip_mrt_flags_t; - -/* * Extern functions */ -extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *); +extern mblk_t *igmp_input(mblk_t *, ip_recv_attr_t *); extern void igmp_joingroup(ilm_t *); extern void igmp_leavegroup(ilm_t *); extern void igmp_slowtimo(void *); @@ -73,85 +64,64 @@ extern void mld_statechange(ilm_t *, mcast_record_t, slist_t *); extern void mld_slowtimo(void *); extern void ilg_delete_all(conn_t *connp); -extern ilg_t *ilg_lookup_ill_v6(conn_t *, const in6_addr_t *, - ill_t *); -extern ilg_t *ilg_lookup_ill_withsrc(conn_t *, ipaddr_t, ipaddr_t, - ill_t *); -extern ilg_t *ilg_lookup_ill_withsrc_v6(conn_t *, const in6_addr_t *, - const in6_addr_t *, ill_t *); +extern boolean_t conn_hasmembers_ill_withsrc_v4(conn_t *, ipaddr_t, + ipaddr_t, ill_t *); +extern boolean_t conn_hasmembers_ill_withsrc_v6(conn_t *, + const in6_addr_t *, const in6_addr_t *, ill_t *); extern void ill_leave_multicast(ill_t *); 
extern void ill_recover_multicast(ill_t *); -extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *); - -extern void ilm_free(ipif_t *); -extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t); -extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *, - boolean_t, zoneid_t); -extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t); - -extern int ilm_numentries_v6(ill_t *, const in6_addr_t *); -extern int ilm_walk_ipif(ipif_t *); -extern int ilm_walk_ill(ill_t *); -extern void ilm_walker_cleanup(ill_t *); -extern int ip_ll_send_disabmulti_req(ill_t *, const in6_addr_t *); -extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *); - -extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t, - mcast_record_t, slist_t *); -extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, - zoneid_t, ilg_stat_t, mcast_record_t, slist_t *); -extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t); -extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, - zoneid_t, boolean_t, boolean_t); +extern void ip_dlur_to_mhi(ill_t *, mblk_t *, + struct mac_header_info_s *); + +/* These make up the data path interface used by ip_output and ip_input */ +extern boolean_t ill_hasmembers_v4(ill_t *, ipaddr_t); +extern boolean_t ill_hasmembers_v6(ill_t *, const in6_addr_t *); +extern boolean_t ill_hasmembers_otherzones_v4(ill_t *, ipaddr_t, + zoneid_t); +extern boolean_t ill_hasmembers_otherzones_v6(ill_t *, + const in6_addr_t *, zoneid_t); +extern zoneid_t ill_hasmembers_nextzone_v4(ill_t *, ipaddr_t, zoneid_t); +extern zoneid_t ill_hasmembers_nextzone_v6(ill_t *, const in6_addr_t *, + zoneid_t); + +extern ilm_t *ip_addmulti(const in6_addr_t *, ill_t *, zoneid_t, + int *); +extern int ip_delmulti(ilm_t *); +extern int ip_mforward(mblk_t *, ip_recv_attr_t *); +extern void ip_mroute_decap(mblk_t *, ip_recv_attr_t *); extern int ill_join_allmulti(ill_t *); extern void ill_leave_allmulti(ill_t *); extern int ip_join_allmulti(uint_t, boolean_t, ip_stack_t *); extern int 
ip_leave_allmulti(uint_t, boolean_t, ip_stack_t *); extern void ip_purge_allmulti(ill_t *); -extern void ip_multicast_loopback(queue_t *, ill_t *, mblk_t *, - int, zoneid_t); -extern int ip_mforward(ill_t *, ipha_t *, mblk_t *); -extern void ip_mroute_decap(queue_t *, mblk_t *, ill_t *); extern int ip_mroute_mrt(mblk_t *, ip_stack_t *); extern int ip_mroute_stats(mblk_t *, ip_stack_t *); extern int ip_mroute_vif(mblk_t *, ip_stack_t *); -extern int ip_mrouter_done(mblk_t *, ip_stack_t *); -extern int ip_mrouter_get(int, queue_t *, uchar_t *); -extern int ip_mrouter_set(int, queue_t *, int, uchar_t *, int, - mblk_t *); +extern int ip_mrouter_done(ip_stack_t *); +extern int ip_mrouter_get(int, conn_t *, uchar_t *); +extern int ip_mrouter_set(int, conn_t *, int, uchar_t *, int); extern void ip_mrouter_stack_init(ip_stack_t *); extern void ip_mrouter_stack_destroy(ip_stack_t *); -extern int ip_opt_add_group(conn_t *, boolean_t, ipaddr_t, - ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *); -extern int ip_opt_delete_group(conn_t *, boolean_t, ipaddr_t, - ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *); -extern int ip_opt_add_group_v6(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *); -extern int ip_opt_delete_group_v6(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *); +extern int ip_opt_add_group(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); +extern int ip_opt_delete_group(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); extern int mrt_ioctl(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_msfilter(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_extract_msfilter(queue_t *, mblk_t *, - const ip_ioctl_cmd_t *, cmd_info_t *, ipsq_func_t); extern int ip_copyin_msfilter(queue_t *, mblk_t *); -extern void 
ip_wput_ctl(queue_t *, mblk_t *); - -extern int pim_input(queue_t *, mblk_t *, ill_t *); -extern void reset_conn_ipif(ipif_t *); -extern void reset_conn_ill(ill_t *); +extern mblk_t *pim_input(mblk_t *, ip_recv_attr_t *); +extern void update_conn_ill(ill_t *, ip_stack_t *); extern void reset_mrt_ill(ill_t *); extern void reset_mrt_vif_ipif(ipif_t *); -extern void mcast_restart_timers_thread(ip_stack_t *); +extern void igmp_start_timers(unsigned, ip_stack_t *); +extern void mld_start_timers(unsigned, ip_stack_t *); extern void ilm_inactive(ilm_t *); -extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *); -extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *); -extern void ilm_walker_finish(ilm_walker_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h index c1a48b1f1a..21c907f3f3 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -35,7 +35,7 @@ /* * Internal definitions for the kernel implementation of the IPv6 - * Neighbor Discovery Protocol (NDP). + * Neighbor Discovery Protocol (NDP) and Address Resolution Protocol (ARP). */ #ifdef __cplusplus @@ -48,131 +48,149 @@ extern "C" { * callbacks set up with ip2mac interface, waiting for result * of neighbor resolution. */ -typedef struct nce_cb_s { - list_node_t nce_cb_node; - void *nce_cb_id; - uint32_t nce_cb_flags; - ip2mac_callback_t *nce_cb_func; - void *nce_cb_arg; -} nce_cb_t; +typedef struct ncec_cb_s { + list_node_t ncec_cb_node; /* next entry in list */ + void *ncec_cb_id; + uint32_t ncec_cb_flags; + ip2mac_callback_t *ncec_cb_func; + void *ncec_cb_arg; +} ncec_cb_t; #define NCE_CB_DISPATCHED 0x00000001 /* - * NDP Cache Entry + * Core information tracking Neighbor Reachability is tracked in the + * ncec_s/ncec_t. The information contained in the ncec_t does not contain + * any link-specific details other than the pointer to the ill_t itself. + * The link-specific information is tracked in the nce_t structure. 
*/ -typedef struct nce_s { - struct nce_s *nce_next; /* Hash chain next pointer */ - struct nce_s **nce_ptpn; /* Pointer to previous next */ - struct ill_s *nce_ill; /* Associated ill */ - uint16_t nce_flags; /* See below */ - uint16_t nce_state; /* See reachability states in if.h */ - int16_t nce_pcnt; /* Probe counter */ - uint16_t nce_rcnt; /* Retransmit counter */ - in6_addr_t nce_addr; /* address of the nighbor */ - in6_addr_t nce_mask; /* If not all ones, mask allows an */ - /* entry to respond to requests for a group of addresses, for */ - /* instantance multicast addresses */ - in6_addr_t nce_extract_mask; /* For mappings */ - uint32_t nce_ll_extract_start; /* For mappings */ -#define nce_first_mp_to_free nce_fp_mp - mblk_t *nce_fp_mp; /* link layer fast path mp */ - mblk_t *nce_res_mp; /* DL_UNITDATA_REQ */ - mblk_t *nce_qd_mp; /* Head outgoing queued packets */ -#define nce_last_mp_to_free nce_qd_mp - mblk_t *nce_timer_mp; /* NDP timer mblk */ - mblk_t *nce_mp; /* mblk we are in, last to be freed */ - uint64_t nce_last; /* Time last reachable in msec */ - uint32_t nce_refcnt; /* nce active usage count */ - kmutex_t nce_lock; /* See comments on top for what */ +struct ncec_s { + struct ncec_s *ncec_next; /* Hash chain next pointer */ + struct ncec_s **ncec_ptpn; /* Pointer to previous next */ + struct ill_s *ncec_ill; /* Associated ill */ + uint16_t ncec_flags; /* See below */ + uint16_t ncec_state; /* See reachability states in if.h */ + int16_t ncec_pcnt; /* Probe counter */ + uint16_t ncec_rcnt; /* Retransmit counter */ + in6_addr_t ncec_addr; /* address of the nighbor */ + uchar_t *ncec_lladdr; + mblk_t *ncec_qd_mp; /* Head outgoing queued packets */ + uint64_t ncec_last; /* Time last reachable in msec */ + uint32_t ncec_refcnt; /* ncec active usage count */ + kmutex_t ncec_lock; /* See comments on top for what */ /* this field protects */ - int nce_unsolicit_count; /* Unsolicited Adv count */ - struct nce_s *nce_fastpath; /* for fastpath list */ - 
timeout_id_t nce_timeout_id; - uchar_t nce_ipversion; /* IPv4(ARP)/IPv6(NDP) version */ - uint_t nce_defense_count; /* number of NDP conflicts */ - uint_t nce_defense_time; /* last time defended (secs) */ - uint64_t nce_init_time; /* time when it was set to ND_INITIAL */ - boolean_t nce_trace_disable; /* True when alloc fails */ - list_t nce_cb; - uint_t nce_cb_walker_cnt; + int ncec_unsolicit_count; /* Unsolicited Adv count */ + timeout_id_t ncec_timeout_id; + uchar_t ncec_ipversion; /* IPv4(ARP)/IPv6(NDP) version */ + uint_t ncec_defense_count; /* number of NDP conflicts */ + uint_t ncec_last_time_defended; /* last time defended (secs) */ + uint64_t ncec_init_time; /* time when it was set to ND_INITIAL */ + boolean_t ncec_trace_disable; /* True when alloc fails */ + /* + * interval to keep track of DAD probes. + */ + clock_t ncec_xmit_interval; + ip_stack_t *ncec_ipst; /* Does not have a netstack_hold */ + list_t ncec_cb; /* callbacks waiting for resolution */ + uint_t ncec_cb_walker_cnt; + uint_t ncec_nprobes; + uint_t ncec_lladdr_length; +}; + +/* + * The nce_t list hangs off the ill_s and tracks information that depends + * on the underlying physical link. Thus when the ill goes down, + * the nce_t list has to be flushed. This is done as part of ill_delete() + * + * When the fastpath ack comes back in ill_fastpath_ack we call + * nce_fastpath_update to update the nce_t. We never actually + * flush the fastpath list, which is kept as an index into the + * ncec_t structures. + * + * when we ndp_delete, we remove the nce entries pointing + * at the dying ncec from the ill_fastpath_list chain. 
+ * + */ +struct nce_s { + list_node_t nce_node; + ill_t *nce_ill; + boolean_t nce_is_condemned; + in6_addr_t nce_addr; + /* + * link-layer specific fields below + */ + mblk_t *nce_dlur_mp; /* DL_UNITDATA_REQ mp */ + mblk_t *nce_fp_mp; /* fast path mp */ + struct ncec_s *nce_common; + kmutex_t nce_lock; + uint32_t nce_refcnt; uint_t nce_ipif_cnt; /* number of ipifs with the nce_addr */ /* as their local address */ -} nce_t; +}; /* * The ndp_g_t structure contains protocol specific information needed * to synchronize and manage neighbor cache entries for IPv4 and IPv6. * There are 2 such structures, ips_ndp4 and ips_ndp6. * ips_ndp6 contains the data structures needed for IPv6 Neighbor Discovery. - * ips_ndp4 has IPv4 link layer info in its nce_t structures - * Note that the nce_t is not currently used as the arp cache itself; - * it is used for the following purposes: - * - queue packets in nce_qd_mp while waiting for arp resolution to complete - * - nce_{res, fp}_mp are used to track DL_UNITDATA request/responses. - * - track state of ARP resolution in the nce_state; + * ips_ndp4 contains the data structures for IPv4 ARP. * * Locking notes: * ndp_g_lock protects neighbor cache tables access and - * insertion/removal of cache entries into/from these tables. - * nce_lock protects nce_pcnt, nce_rcnt, nce_qd_mp nce_state, nce_res_mp, - * nce_refcnt, nce_last, and nce_cb_walker_cnt. - * nce_refcnt is incremented for every ire pointing to this nce and - * every time ndp_lookup() finds an nce. - * Should there be a need to obtain nce_lock and ndp_g_lock, ndp_g_lock is + * insertion/removal of cache entries into/from these tables. The ncec_lock + * and nce_lock protect fields in the ncec_t and nce_t structures. + * Should there be a need to obtain nce[c]_lock and ndp_g_lock, ndp_g_lock is * acquired first. - * To avoid becoming exclusive when deleting NCEs, ndp_walk() routine holds - * the ndp_g_lock (i.e global lock) and marks NCEs to be deleted with - * NCE_F_CONDEMNED. 
When all active users of such NCEs are gone the walk - * routine passes a list for deletion to nce_ire_delete_list(). - * - * When the link-layer address of some onlink host changes, ARP will send - * an AR_CN_ANNOUNCE message to ip so that stale neighbor-cache - * information will not get used. This message is processed in ip_arp_news() - * by walking the nce list, and updating as appropriate. The ndp_g_hw_change - * flag is set by ip_arp_news() to notify nce_t users that ip_arp_news() is - * in progress. */ typedef struct ndp_g_s { kmutex_t ndp_g_lock; /* Lock protecting cache hash table */ - nce_t *nce_mask_entries; /* mask not all ones */ - nce_t *nce_hash_tbl[NCE_TABLE_SIZE]; + ncec_t *nce_hash_tbl[NCE_TABLE_SIZE]; int ndp_g_walker; /* # of active thread walking hash list */ boolean_t ndp_g_walker_cleanup; /* true implies defer deletion. */ - int ndp_g_hw_change; /* non-zero if nce flush in progress */ } ndp_g_t; -#define NDP_HW_CHANGE_INCR(ndp) { \ - mutex_enter(&(ndp)->ndp_g_lock); \ - (ndp)->ndp_g_hw_change++; \ - mutex_exit(&(ndp)->ndp_g_lock); \ -} - -#define NDP_HW_CHANGE_DECR(ndp) { \ - mutex_enter(&(ndp)->ndp_g_lock); \ - (ndp)->ndp_g_hw_change--; \ - mutex_exit(&(ndp)->ndp_g_lock); \ -} - -/* nce_flags */ -#define NCE_F_PERMANENT 0x1 -#define NCE_F_MAPPING 0x2 +/* ncec_flags */ +#define NCE_F_MYADDR 0x1 /* ipif exists for the ncec_addr */ +#define NCE_F_UNVERIFIED 0x2 /* DAD in progress. */ #define NCE_F_ISROUTER 0x4 -/* unused 0x8 */ +#define NCE_F_FAST 0x8 + +/* + * NCE_F_NONUD is used to disable IPv6 Neighbor Unreachability Detection or + * IPv4 aging and maps to the ATF_PERM flag for arp(1m) + */ #define NCE_F_NONUD 0x10 + #define NCE_F_ANYCAST 0x20 #define NCE_F_CONDEMNED 0x40 #define NCE_F_UNSOL_ADV 0x80 #define NCE_F_BCAST 0x100 +#define NCE_F_MCAST 0x200 + +/* + * NCE_F_PUBLISH is set for all ARP/ND entries that we announce. This + * includes locally configured addresses as well as those that we proxy for. 
+ */ +#define NCE_F_PUBLISH 0x400 + +/* + * NCE_F_AUTHORITY is set for any address that we have authoritatitve + * information for. This includes locally configured addresses as well + * as statically configured arp entries that are set up using the "permanent" + * option described in arp(1m). The NCE_F_AUTHORITY asserts that we would + * reject any updates for that nce's (host, link-layer-address) information + */ +#define NCE_F_AUTHORITY 0x800 -#define NCE_EXTERNAL_FLAGS_MASK \ - (NCE_F_PERMANENT | NCE_F_MAPPING | NCE_F_ISROUTER | NCE_F_NONUD | \ - NCE_F_ANYCAST | NCE_F_UNSOL_ADV) +#define NCE_F_DELAYED 0x1000 /* rescheduled on dad_defend_rate */ +#define NCE_F_STATIC 0x2000 /* State REACHABLE, STALE, DELAY or PROBE */ -#define NCE_ISREACHABLE(nce) \ - (((((nce)->nce_state) >= ND_REACHABLE) && \ - ((nce)->nce_state) <= ND_PROBE)) +#define NCE_ISREACHABLE(ncec) \ + (((((ncec)->ncec_state) >= ND_REACHABLE) && \ + ((ncec)->ncec_state) <= ND_PROBE)) + +#define NCE_ISCONDEMNED(ncec) ((ncec)->ncec_flags & NCE_F_CONDEMNED) /* NDP flags set in SOL/ADV requests */ #define NDP_UNICAST 0x1 @@ -184,95 +202,14 @@ typedef struct ndp_g_s { /* Number of packets queued in NDP for a neighbor */ #define ND_MAX_Q 4 - -#ifdef DEBUG -#define NCE_TRACE_REF(nce) nce_trace_ref(nce) -#define NCE_UNTRACE_REF(nce) nce_untrace_ref(nce) -#else -#define NCE_TRACE_REF(nce) -#define NCE_UNTRACE_REF(nce) -#endif - -#define NCE_REFHOLD(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - (nce)->nce_refcnt++; \ - ASSERT((nce)->nce_refcnt != 0); \ - NCE_TRACE_REF(nce); \ - mutex_exit(&(nce)->nce_lock); \ -} - -#define NCE_REFHOLD_NOTR(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - (nce)->nce_refcnt++; \ - ASSERT((nce)->nce_refcnt != 0); \ - mutex_exit(&(nce)->nce_lock); \ -} - -#define NCE_REFHOLD_LOCKED(nce) { \ - ASSERT(MUTEX_HELD(&(nce)->nce_lock)); \ - (nce)->nce_refcnt++; \ - NCE_TRACE_REF(nce); \ -} - -/* nce_inactive destroys the mutex thus no mutex_exit is needed */ -#define NCE_REFRELE(nce) { \ - 
mutex_enter(&(nce)->nce_lock); \ - NCE_UNTRACE_REF(nce); \ - ASSERT((nce)->nce_refcnt != 0); \ - if (--(nce)->nce_refcnt == 0) \ - ndp_inactive(nce); \ - else { \ - mutex_exit(&(nce)->nce_lock);\ - } \ -} - -#define NCE_REFRELE_NOTR(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - ASSERT((nce)->nce_refcnt != 0); \ - if (--(nce)->nce_refcnt == 0) \ - ndp_inactive(nce); \ - else { \ - mutex_exit(&(nce)->nce_lock);\ - } \ -} - -#define NDP_RESTART_TIMER(nce, ms) { \ - ASSERT(!MUTEX_HELD(&(nce)->nce_lock)); \ - if ((nce)->nce_timeout_id != 0) { \ - /* Ok to untimeout bad id. we don't hold a lock. */ \ - (void) untimeout((nce)->nce_timeout_id); \ - } \ - mutex_enter(&(nce)->nce_lock); \ - /* Don't start the timer if the nce has been deleted */ \ - if (!((nce)->nce_flags & NCE_F_CONDEMNED)) \ - nce->nce_timeout_id = timeout(ndp_timer, nce, \ - MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); \ - mutex_exit(&(nce)->nce_lock); \ -} - -/* Structure for ndp_cache_count() */ -typedef struct { - int ncc_total; /* Total number of NCEs */ - int ncc_host; /* NCE entries without R bit set */ -} ncc_cache_count_t; - -/* - * Structure of ndp_cache_reclaim(). Each field is a fraction i.e. 1 means - * reclaim all, N means reclaim 1/Nth of all entries, 0 means reclaim none. - */ -typedef struct { - int ncr_host; /* Fraction for host entries */ -} nce_cache_reclaim_t; - /* - * Structure for nce_delete_hw_changed; specifies an IPv4 address to link-layer - * address mapping. Any route that has a cached copy of a mapping for that - * IPv4 address that doesn't match the given mapping must be purged. 
+ * Structure for nce_update_hw_changed; */ typedef struct { ipaddr_t hwm_addr; /* IPv4 address */ - uint_t hwm_hwlen; /* Length of hardware address (may be 0) */ + uint_t hwm_hwlen; /* Length of hardware address (may be 0) */ uchar_t *hwm_hwaddr; /* Pointer to new hardware address, if any */ + int hwm_flags; } nce_hw_map_t; /* When SAP is greater than zero address appears before SAP */ @@ -284,6 +221,15 @@ typedef struct { ((sizeof (dl_unitdata_req_t)) + ((ill)->ill_phys_addr_length)) : \ (sizeof (dl_unitdata_req_t))) +#define NCE_MYADDR(ncec) (((ncec)->ncec_flags & NCE_F_MYADDR) != 0) + +/* + * NCE_PUBLISH() identifies the addresses that we are publishing. This + * includes locally configured address (NCE_MYADDR()) as well as those that + * we are proxying. + */ +#define NCE_PUBLISH(ncec) ((ncec->ncec_flags & NCE_F_PUBLISH) != 0) + #ifdef _BIG_ENDIAN #define NCE_LL_SAP_COPY(ill, mp) \ { \ @@ -327,55 +273,65 @@ typedef struct { /* NDP Cache Entry Hash Table */ #define NCE_TABLE_SIZE 256 -extern void ndp_cache_count(nce_t *, char *); -extern void ndp_cache_reclaim(nce_t *, char *); -extern void ndp_delete(nce_t *); -extern void ndp_delete_per_ill(nce_t *, uchar_t *); -extern void ndp_fastpath_flush(nce_t *, char *); -extern boolean_t ndp_fastpath_update(nce_t *, void *); +extern void ip_nce_reclaim(void *); +extern void ncec_delete(ncec_t *); +extern void ncec_delete_per_ill(ncec_t *, uchar_t *); +extern void nce_fastpath_update(ill_t *, mblk_t *); extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); -extern void ndp_inactive(nce_t *); -extern void ndp_input(ill_t *, mblk_t *, mblk_t *); -extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *); -extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *, - boolean_t); -extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); -extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, +extern void ncec_inactive(ncec_t *); +extern void ndp_input(mblk_t *, ip_recv_attr_t 
*); +extern ncec_t *ncec_lookup_illgrp_v6(ill_t *, const in6_addr_t *); +extern ncec_t *ncec_lookup_illgrp_v4(ill_t *, const in_addr_t *); +extern nce_t *nce_lookup_v4(ill_t *, const in_addr_t *); +extern nce_t *nce_lookup_v6(ill_t *, const in6_addr_t *); +extern void nce_make_unreachable(ncec_t *); +extern mblk_t *ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, mblk_t *); -extern int ndp_noresolver(ill_t *, const in6_addr_t *); -extern void ndp_process(nce_t *, uchar_t *, uint32_t, boolean_t); +extern nce_t *ndp_nce_init(ill_t *, const in6_addr_t *, int); +extern void nce_process(ncec_t *, uchar_t *, uint32_t, boolean_t); extern int ndp_query(ill_t *, lif_nd_req_t *); -extern int ndp_resolver(ill_t *, const in6_addr_t *, mblk_t *, zoneid_t); extern int ndp_sioc_update(ill_t *, lif_nd_req_t *); extern boolean_t ndp_verify_optlen(nd_opt_hdr_t *, int); -extern void ndp_timer(void *); -extern void ndp_walk(ill_t *, pfi_t, void *, ip_stack_t *); -extern void ndp_walk_common(ndp_g_t *, ill_t *, pfi_t, +extern void nce_timer(void *); +extern void ncec_walk(ill_t *, pfi_t, void *, ip_stack_t *); +extern void ncec_walk_common(ndp_g_t *, ill_t *, pfi_t, void *, boolean_t); -extern boolean_t ndp_restart_dad(nce_t *); -extern void ndp_do_recovery(ipif_t *); -extern void nce_resolv_failed(nce_t *); -extern void arp_resolv_failed(nce_t *); -extern void nce_fastpath_list_add(nce_t *); -extern void nce_fastpath_list_delete(nce_t *); -extern void nce_fastpath_list_dispatch(ill_t *, - boolean_t (*)(nce_t *, void *), void *); -extern void nce_queue_mp_common(nce_t *, mblk_t *, boolean_t); -extern void nce_delete_hw_changed(nce_t *, void *); -extern void nce_fastpath(nce_t *); -extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, - nce_t **); -extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *, - const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, - uint16_t, 
uint16_t, nce_t **); -extern int ndp_lookup_then_add_v4(ill_t *, - const in_addr_t *, uint16_t, nce_t **, nce_t *); -extern void ip_ndp_resolve(nce_t *); +extern boolean_t nce_restart_dad(ncec_t *); +extern void ndp_resolv_failed(ncec_t *); +extern void arp_resolv_failed(ncec_t *); +extern void nce_fastpath_list_delete(ill_t *, ncec_t *, list_t *); +extern void nce_queue_mp(ncec_t *, mblk_t *, boolean_t); +extern void nce_update_hw_changed(ncec_t *, void *); +extern int nce_lookup_then_add_v6(ill_t *, uchar_t *, uint_t, + const in6_addr_t *, uint16_t, uint16_t, nce_t **); +extern int nce_lookup_then_add_v4(ill_t *, uchar_t *, uint_t, + const in_addr_t *, uint16_t, uint16_t, nce_t **); +extern boolean_t nce_cmp_ll_addr(const ncec_t *, const uchar_t *, uint32_t); +extern void nce_update(ncec_t *, uint16_t, uchar_t *); +extern nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); + +extern void nce_restart_timer(ncec_t *, uint_t); +extern void ncec_refrele(ncec_t *); +extern void ncec_refhold(ncec_t *); +extern void ncec_refrele_notr(ncec_t *); +extern void ncec_refhold_notr(ncec_t *); +extern void nce_resolv_ok(ncec_t *); +extern uint32_t ndp_solicit(ncec_t *, in6_addr_t, ill_t *); +extern boolean_t ip_nce_conflict(mblk_t *, ip_recv_attr_t *, ncec_t *); +extern boolean_t ndp_announce(ncec_t *); +extern void ip_nce_lookup_and_update(ipaddr_t *, ipif_t *, ip_stack_t *, + uchar_t *, int, int); +extern void nce_refrele(nce_t *); +extern void nce_refhold(nce_t *); +extern void nce_delete(nce_t *); +extern void nce_flush(ill_t *, boolean_t); +extern void nce_walk(ill_t *, pfi_t, void *); +extern void ip_ndp_resolve(struct ncec_s *); +extern void ip_addr_recover(ipsq_t *, queue_t *, mblk_t *, void *); #ifdef DEBUG -extern void nce_trace_ref(nce_t *); -extern void nce_untrace_ref(nce_t *); +extern void nce_trace_ref(ncec_t *); +extern void nce_untrace_ref(ncec_t *); #endif #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_netinfo.h 
b/usr/src/uts/common/inet/ip_netinfo.h index b34cf0751e..a496248e23 100644 --- a/usr/src/uts/common/inet/ip_netinfo.h +++ b/usr/src/uts/common/inet/ip_netinfo.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,10 +41,13 @@ extern void ip_net_init(ip_stack_t *, netstack_t *); extern void ip_net_destroy(ip_stack_t *); extern void ipv4_hook_init(ip_stack_t *); extern void ipv6_hook_init(ip_stack_t *); +extern void arp_hook_init(ip_stack_t *); extern void ipv4_hook_destroy(ip_stack_t *); extern void ipv6_hook_destroy(ip_stack_t *); +extern void arp_hook_destroy(ip_stack_t *); extern void ipv4_hook_shutdown(ip_stack_t *); extern void ipv6_hook_shutdown(ip_stack_t *); +extern void arp_hook_shutdown(ip_stack_t *); extern void ip_ne_queue_func(void *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index 61bc451995..f5cbedd370 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -48,7 +48,8 @@ extern "C" { #ifdef _KERNEL extern void ip_rts_change(int, ipaddr_t, ipaddr_t, - ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *); + ipaddr_t, ipaddr_t, ipaddr_t, int, int, + int, ip_stack_t *); extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int, @@ -74,15 +75,17 @@ extern size_t rts_data_msg_size(int, sa_family_t, uint_t); extern void rts_fill_msg_v6(int, int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, const ipif_t *, mblk_t *, - uint_t, const tsol_gc_t *); + const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, + const ill_t *, mblk_t *, const tsol_gc_t *); extern size_t rts_header_msg_size(int); +extern void rts_merge_metrics(iulp_t *, const iulp_t *); + extern 
void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t, ip_stack_t *); -extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); +extern int ip_rts_request_common(mblk_t *mp, conn_t *, cred_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index b5d9715c65..d2f6c07234 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -38,6 +38,7 @@ extern "C" { #ifdef _KERNEL #include <sys/list.h> + /* * IP statistics. */ @@ -46,52 +47,45 @@ extern "C" { ((ipst)->ips_ip_statistics.x.value.ui64 += (n)) typedef struct ip_stat { - kstat_named_t ipsec_fanout_proto; kstat_named_t ip_udp_fannorm; kstat_named_t ip_udp_fanmb; - kstat_named_t ip_udp_fanothers; - kstat_named_t ip_udp_fast_path; - kstat_named_t ip_udp_slow_path; - kstat_named_t ip_udp_input_err; - kstat_named_t ip_tcppullup; - kstat_named_t ip_tcpoptions; - kstat_named_t ip_multipkttcp; - kstat_named_t ip_tcp_fast_path; - kstat_named_t ip_tcp_slow_path; - kstat_named_t ip_tcp_input_error; + kstat_named_t ip_recv_pullup; kstat_named_t ip_db_ref; - kstat_named_t ip_notaligned1; - kstat_named_t ip_notaligned2; - kstat_named_t ip_multimblk3; - kstat_named_t ip_multimblk4; - kstat_named_t ip_ipoptions; - kstat_named_t ip_classify_fail; + kstat_named_t ip_notaligned; + kstat_named_t ip_multimblk; kstat_named_t ip_opt; - kstat_named_t ip_udp_rput_local; kstat_named_t ipsec_proto_ahesp; kstat_named_t ip_conn_flputbq; kstat_named_t ip_conn_walk_drain; kstat_named_t ip_out_sw_cksum; + kstat_named_t ip_out_sw_cksum_bytes; kstat_named_t ip_in_sw_cksum; - kstat_named_t ip_trash_ire_reclaim_calls; - kstat_named_t ip_trash_ire_reclaim_success; - kstat_named_t ip_ire_arp_timer_expired; - kstat_named_t ip_ire_redirect_timer_expired; - kstat_named_t ip_ire_pmtu_timer_expired; - kstat_named_t ip_input_multi_squeue; + kstat_named_t ip_ire_reclaim_calls; + kstat_named_t ip_ire_reclaim_deleted; + kstat_named_t 
ip_nce_reclaim_calls; + kstat_named_t ip_nce_reclaim_deleted; + kstat_named_t ip_dce_reclaim_calls; + kstat_named_t ip_dce_reclaim_deleted; kstat_named_t ip_tcp_in_full_hw_cksum_err; kstat_named_t ip_tcp_in_part_hw_cksum_err; kstat_named_t ip_tcp_in_sw_cksum_err; - kstat_named_t ip_tcp_out_sw_cksum_bytes; kstat_named_t ip_udp_in_full_hw_cksum_err; kstat_named_t ip_udp_in_part_hw_cksum_err; kstat_named_t ip_udp_in_sw_cksum_err; - kstat_named_t ip_udp_out_sw_cksum_bytes; - kstat_named_t ip_frag_mdt_pkt_out; - kstat_named_t ip_frag_mdt_discarded; - kstat_named_t ip_frag_mdt_allocfail; - kstat_named_t ip_frag_mdt_addpdescfail; - kstat_named_t ip_frag_mdt_allocd; + kstat_named_t conn_in_recvdstaddr; + kstat_named_t conn_in_recvopts; + kstat_named_t conn_in_recvif; + kstat_named_t conn_in_recvslla; + kstat_named_t conn_in_recvucred; + kstat_named_t conn_in_recvttl; + kstat_named_t conn_in_recvhopopts; + kstat_named_t conn_in_recvhoplimit; + kstat_named_t conn_in_recvdstopts; + kstat_named_t conn_in_recvrthdrdstopts; + kstat_named_t conn_in_recvrthdr; + kstat_named_t conn_in_recvpktinfo; + kstat_named_t conn_in_recvtclass; + kstat_named_t conn_in_timestamp; } ip_stat_t; @@ -103,20 +97,22 @@ typedef struct ip_stat { ((ipst)->ips_ip6_statistics.x.value.ui64 += (n)) typedef struct ip6_stat { - kstat_named_t ip6_udp_fast_path; - kstat_named_t ip6_udp_slow_path; kstat_named_t ip6_udp_fannorm; kstat_named_t ip6_udp_fanmb; + kstat_named_t ip6_recv_pullup; + kstat_named_t ip6_db_ref; + kstat_named_t ip6_notaligned; + kstat_named_t ip6_multimblk; + kstat_named_t ipsec_proto_ahesp; kstat_named_t ip6_out_sw_cksum; + kstat_named_t ip6_out_sw_cksum_bytes; kstat_named_t ip6_in_sw_cksum; kstat_named_t ip6_tcp_in_full_hw_cksum_err; kstat_named_t ip6_tcp_in_part_hw_cksum_err; kstat_named_t ip6_tcp_in_sw_cksum_err; - kstat_named_t ip6_tcp_out_sw_cksum_bytes; kstat_named_t ip6_udp_in_full_hw_cksum_err; kstat_named_t ip6_udp_in_part_hw_cksum_err; kstat_named_t ip6_udp_in_sw_cksum_err; - 
kstat_named_t ip6_udp_out_sw_cksum_bytes; kstat_named_t ip6_frag_mdt_pkt_out; kstat_named_t ip6_frag_mdt_discarded; kstat_named_t ip6_frag_mdt_allocfail; @@ -150,6 +146,8 @@ typedef struct srcid_map { struct ip_stack { netstack_t *ips_netstack; /* Common netstack */ + uint_t ips_src_generation; /* Both IPv4 and IPv6 */ + struct ipparam_s *ips_param_arr; /* ndd variable table */ struct ipndp_s *ips_ndp_arr; @@ -178,10 +176,6 @@ struct ip_stack { kmutex_t ips_ip_mi_lock; kmutex_t ips_ip_addr_avail_lock; krwlock_t ips_ill_g_lock; - krwlock_t ips_ipsec_capab_ills_lock; - /* protects the list of IPsec capable ills */ - struct ipsec_capab_ill_s *ips_ipsec_capab_ills_ah; - struct ipsec_capab_ill_s *ips_ipsec_capab_ills_esp; krwlock_t ips_ill_g_usesrc_lock; @@ -198,10 +192,10 @@ struct ip_stack { struct connf_s *ips_rts_clients; struct connf_s *ips_ipcl_conn_fanout; struct connf_s *ips_ipcl_bind_fanout; - struct connf_s *ips_ipcl_proto_fanout; + struct connf_s *ips_ipcl_proto_fanout_v4; struct connf_s *ips_ipcl_proto_fanout_v6; struct connf_s *ips_ipcl_udp_fanout; - struct connf_s *ips_ipcl_raw_fanout; + struct connf_s *ips_ipcl_raw_fanout; /* RAW SCTP sockets */ struct connf_s *ips_ipcl_iptun_fanout; uint_t ips_ipcl_conn_fanout_size; uint_t ips_ipcl_bind_fanout_size; @@ -237,31 +231,47 @@ struct ip_stack { /* IPv4 forwarding table */ struct radix_node_head *ips_ip_ftable; - /* This is dynamically allocated in ip_ire_init */ - struct irb *ips_ip_cache_table; - #define IPV6_ABITS 128 #define IP6_MASK_TABLE_SIZE (IPV6_ABITS + 1) /* 129 ptrs */ - struct irb *ips_ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE]; - /* This is dynamically allocated in ip_ire_init */ - struct irb *ips_ip_cache_table_v6; - uint32_t ips_ire_handle; /* * ire_ft_init_lock is used while initializing ip_forwarding_table * dynamically in ire_add. 
*/ kmutex_t ips_ire_ft_init_lock; - kmutex_t ips_ire_handle_lock; /* Protects ire_handle */ - uint32_t ips_ip_cache_table_size; - uint32_t ips_ip6_cache_table_size; + /* + * This is the IPv6 counterpart of RADIX_NODE_HEAD_LOCK. It is used + * to prevent adds and deletes while we are doing a ftable_lookup + * and extracting the ire_generation. + */ + krwlock_t ips_ip6_ire_head_lock; + uint32_t ips_ip6_ftable_hash_size; ire_stats_t ips_ire_stats_v4; /* IPv4 ire statistics */ ire_stats_t ips_ire_stats_v6; /* IPv6 ire statistics */ + /* Count how many condemned objects for kmem_cache callbacks */ + uint32_t ips_num_ire_condemned; + uint32_t ips_num_nce_condemned; + uint32_t ips_num_dce_condemned; + + struct ire_s *ips_ire_reject_v4; /* For unreachable dests */ + struct ire_s *ips_ire_reject_v6; /* For unreachable dests */ + struct ire_s *ips_ire_blackhole_v4; /* For temporary failures */ + struct ire_s *ips_ire_blackhole_v6; /* For temporary failures */ + + /* ips_ire_dep_lock protects ire_dep_* relationship between IREs */ + krwlock_t ips_ire_dep_lock; + + /* Destination Cache Entries */ + struct dce_s *ips_dce_default; + uint_t ips_dce_hashsize; + struct dcb_s *ips_dce_hash_v4; + struct dcb_s *ips_dce_hash_v6; + /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; mblk_t *ips_ip6_asp_pending_ops_tail; @@ -293,9 +303,10 @@ struct ip_stack { uint_t ips_icmp_pkt_err_sent; /* Protected by ip_mi_lock */ - void *ips_ip_g_head; /* Instance Data List Head */ + void *ips_ip_g_head; /* IP Instance Data List Head */ + void *ips_arp_g_head; /* ARP Instance Data List Head */ - caddr_t ips_ip_g_nd; /* Named Dispatch List Head */ + caddr_t ips_ip_g_nd; /* Named Dispatch List Head */ /* Multirouting stuff */ /* Interval (in ms) between consecutive 'bad MTU' warnings */ @@ -306,27 +317,11 @@ struct ip_stack { struct cgtp_filter_ops *ips_ip_cgtp_filter_ops; /* CGTP hooks */ boolean_t ips_ip_cgtp_filter; /* Enable/disable CGTP hooks */ - kmutex_t ips_ip_trash_timer_lock; - timeout_id_t 
ips_ip_ire_expire_id; /* IRE expiration timer. */ struct ipsq_s *ips_ipsq_g_head; uint_t ips_ill_index; /* Used to assign interface indicies */ /* When set search for unused index */ boolean_t ips_ill_index_wrap; - clock_t ips_ip_ire_arp_time_elapsed; - /* Time since IRE cache last flushed */ - clock_t ips_ip_ire_rd_time_elapsed; - /* ... redirect IREs last flushed */ - clock_t ips_ip_ire_pmtu_time_elapsed; - /* Time since path mtu increase */ - - uint_t ips_ip_redirect_cnt; - /* Num of redirect routes in ftable */ - uint_t ips_ipv6_ire_default_count; - /* Number of IPv6 IRE_DEFAULT entries */ - uint_t ips_ipv6_ire_default_index; - /* Walking IPv6 index used to mod in */ - uint_t ips_loopback_packets; /* NDP/NCE structures for IPv4 and IPv6 */ @@ -379,15 +374,17 @@ struct ip_stack { struct srcid_map *ips_srcid_head; krwlock_t ips_srcid_lock; - uint64_t ips_ipif_g_seqid; + uint64_t ips_ipif_g_seqid; /* Used only for sctp_addr.c */ union phyint_list_u *ips_phyint_g_list; /* start of phyint list */ -/* ip_neti.c */ +/* ip_netinfo.c */ hook_family_t ips_ipv4root; hook_family_t ips_ipv6root; + hook_family_t ips_arproot; net_handle_t ips_ipv4_net_data; net_handle_t ips_ipv6_net_data; + net_handle_t ips_arp_net_data; /* * Hooks for firewalling @@ -397,17 +394,23 @@ struct ip_stack { hook_event_t ips_ip4_forwarding_event; hook_event_t ips_ip4_loopback_in_event; hook_event_t ips_ip4_loopback_out_event; + hook_event_t ips_ip6_physical_in_event; hook_event_t ips_ip6_physical_out_event; hook_event_t ips_ip6_forwarding_event; hook_event_t ips_ip6_loopback_in_event; hook_event_t ips_ip6_loopback_out_event; + hook_event_t ips_arp_physical_in_event; + hook_event_t ips_arp_physical_out_event; + hook_event_t ips_arp_nic_events; + hook_event_token_t ips_ipv4firewall_physical_in; hook_event_token_t ips_ipv4firewall_physical_out; hook_event_token_t ips_ipv4firewall_forwarding; hook_event_token_t ips_ipv4firewall_loopback_in; hook_event_token_t ips_ipv4firewall_loopback_out; + 
hook_event_token_t ips_ipv6firewall_physical_in; hook_event_token_t ips_ipv6firewall_physical_out; hook_event_token_t ips_ipv6firewall_forwarding; @@ -419,6 +422,10 @@ struct ip_stack { hook_event_token_t ips_ipv4nicevents; hook_event_token_t ips_ipv6nicevents; + hook_event_token_t ips_arp_physical_in; + hook_event_token_t ips_arp_physical_out; + hook_event_token_t ips_arpnicevents; + net_handle_t ips_ip4_observe_pr; net_handle_t ips_ip6_observe_pr; hook_event_t ips_ip4_observe; @@ -432,13 +439,6 @@ struct ip_stack { krwlock_t ips_ipmp_lock; mod_hash_t *ips_ipmp_grp_hash; -/* igmp.c */ - /* multicast restart timers thread logic */ - kmutex_t ips_mrt_lock; - uint_t ips_mrt_flags; - kcondvar_t ips_mrt_cv; - kcondvar_t ips_mrt_done_cv; - kthread_t *ips_mrt_thread; }; typedef struct ip_stack ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index e24bcd9a73..15a7c32376 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -41,8 +41,11 @@ extern "C" { #include <sys/sunddi.h> #include <sys/sunldi.h> -typedef void (*edesc_spf)(void *, mblk_t *, void *, int); -typedef void (*edesc_rpf)(void *, mblk_t *, void *); +typedef void (*edesc_rpf)(void *, mblk_t *, void *, ip_recv_attr_t *); +struct icmph_s; +struct icmp6_hdr; +typedef boolean_t (*edesc_vpf)(conn_t *, void *, struct icmph_s *, + struct icmp6_hdr *, ip_recv_attr_t *); /* * ============================== @@ -53,7 +56,7 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); /* * The connection structure contains the common information/flags/ref needed. * Implementation will keep the connection struct, the layers (with their - * respective data for event i.e. tcp_t if event was tcp_input) all in one + * respective data for event i.e. tcp_t if event was tcp_input_data) all in one * contiguous memory location. 
*/ @@ -61,14 +64,14 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); /* Unused 0x00020000 */ /* Unused 0x00040000 */ #define IPCL_FULLY_BOUND 0x00080000 /* Bound to correct squeue */ -#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */ -#define IPCL_SOCKET 0x00200000 /* Sockfs connection */ -#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */ +/* Unused 0x00100000 */ +/* Unused 0x00200000 */ +/* Unused 0x00400000 */ #define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */ -#define IPCL_EAGER 0x01000000 /* Incoming connection */ +/* Unused 0x01000000 */ /* Unused 0x02000000 */ -#define IPCL_TCP6 0x04000000 /* AF_INET6 TCP */ -#define IPCL_TCP4 0x08000000 /* IPv4 packet format TCP */ +/* Unused 0x04000000 */ +/* Unused 0x08000000 */ /* Unused 0x10000000 */ /* Unused 0x20000000 */ #define IPCL_CONNECTED 0x40000000 /* Conn in connected table */ @@ -83,41 +86,21 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); #define IPCL_RTSCONN 0x00000020 /* From rts_conn_cache */ /* Unused 0x00000040 */ #define IPCL_IPTUN 0x00000080 /* iptun module above us */ + #define IPCL_NONSTR 0x00001000 /* A non-STREAMS socket */ -#define IPCL_IN_SQUEUE 0x10000000 /* Waiting squeue to finish */ +/* Unused 0x10000000 */ -/* Conn Masks */ -#define IPCL_TCP (IPCL_TCP4|IPCL_TCP6) #define IPCL_REMOVED 0x00000100 #define IPCL_REUSED 0x00000200 -/* The packet format is IPv4; could be an AF_INET or AF_INET6 socket */ -#define IPCL_IS_TCP4(connp) \ - (((connp)->conn_flags & IPCL_TCP4)) - -/* Connected AF_INET with no IPsec policy */ -#define IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) \ - (((connp)->conn_flags & \ - (IPCL_TCP4|IPCL_CONNECTED|IPCL_CHECK_POLICY|IPCL_TCP6)) \ - == (IPCL_TCP4|IPCL_CONNECTED)) - #define IPCL_IS_CONNECTED(connp) \ ((connp)->conn_flags & IPCL_CONNECTED) #define IPCL_IS_BOUND(connp) \ ((connp)->conn_flags & IPCL_BOUND) -/* AF_INET TCP that is bound */ -#define IPCL_IS_TCP4_BOUND(connp) \ - (((connp)->conn_flags & \ - 
(IPCL_TCP4|IPCL_BOUND|IPCL_TCP6)) == \ - (IPCL_TCP4|IPCL_BOUND)) - -#define IPCL_IS_FULLY_BOUND(connp) \ - ((connp)->conn_flags & IPCL_FULLY_BOUND) - /* - * Can't use conn_protocol since we need to tell difference + * Can't use conn_proto since we need to tell difference * between a real TCP socket and a SOCK_RAW, IPPROTO_TCP. */ #define IPCL_IS_TCP(connp) \ @@ -180,22 +163,80 @@ typedef struct ip_helper_stream_info_s { #define CONN_MAC_IMPLICIT 2 /* + * conn receive ancillary definition. + * + * These are the set of socket options that make the receive side + * potentially pass up ancillary data items. + * We have a union with an integer so that we can quickly check whether + * any ancillary data items need to be added. + */ +typedef struct crb_s { + union { + uint32_t crbu_all; + struct { + uint32_t + crbb_recvdstaddr : 1, /* IP_RECVDSTADDR option */ + crbb_recvopts : 1, /* IP_RECVOPTS option */ + crbb_recvif : 1, /* IP_RECVIF option */ + crbb_recvslla : 1, /* IP_RECVSLLA option */ + + crbb_recvttl : 1, /* IP_RECVTTL option */ + crbb_ip_recvpktinfo : 1, /* IP*_RECVPKTINFO option */ + crbb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ + crbb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ + + crbb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ + crbb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ + crbb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */ + crbb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ + + crbb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ + crbb_recvucred : 1, /* IP_RECVUCRED option */ + crbb_timestamp : 1; /* SO_TIMESTAMP "socket" option */ + + } crbb; + } crbu; +} crb_t; + +#define crb_all crbu.crbu_all +#define crb_recvdstaddr crbu.crbb.crbb_recvdstaddr +#define crb_recvopts crbu.crbb.crbb_recvopts +#define crb_recvif crbu.crbb.crbb_recvif +#define crb_recvslla crbu.crbb.crbb_recvslla +#define crb_recvttl crbu.crbb.crbb_recvttl +#define crb_ip_recvpktinfo crbu.crbb.crbb_ip_recvpktinfo +#define 
crb_ipv6_recvhoplimit crbu.crbb.crbb_ipv6_recvhoplimit +#define crb_ipv6_recvhopopts crbu.crbb.crbb_ipv6_recvhopopts +#define crb_ipv6_recvdstopts crbu.crbb.crbb_ipv6_recvdstopts +#define crb_ipv6_recvrthdr crbu.crbb.crbb_ipv6_recvrthdr +#define crb_old_ipv6_recvdstopts crbu.crbb.crbb_old_ipv6_recvdstopts +#define crb_ipv6_recvrthdrdstopts crbu.crbb.crbb_ipv6_recvrthdrdstopts +#define crb_ipv6_recvtclass crbu.crbb.crbb_ipv6_recvtclass +#define crb_recvucred crbu.crbb.crbb_recvucred +#define crb_timestamp crbu.crbb.crbb_timestamp + +/* * The initial fields in the conn_t are setup by the kmem_cache constructor, * and are preserved when it is freed. Fields after that are bzero'ed when * the conn_t is freed. + * + * Much of the conn_t is protected by conn_lock. + * + * conn_lock is also used by some ULPs (like UDP and RAWIP) to protect + * their state. */ struct conn_s { kmutex_t conn_lock; uint32_t conn_ref; /* Reference counter */ uint32_t conn_flags; /* Conn Flags */ - union { tcp_t *cp_tcp; /* Pointer to the tcp struct */ struct udp_s *cp_udp; /* Pointer to the udp struct */ struct icmp_s *cp_icmp; /* Pointer to rawip struct */ struct rts_s *cp_rts; /* Pointer to rts struct */ struct iptun_s *cp_iptun; /* Pointer to iptun_t */ + struct sctp_s *cp_sctp; /* For IPCL_SCTPCONN */ void *cp_priv; } conn_proto_priv; #define conn_tcp conn_proto_priv.cp_tcp @@ -203,71 +244,68 @@ struct conn_s { #define conn_icmp conn_proto_priv.cp_icmp #define conn_rts conn_proto_priv.cp_rts #define conn_iptun conn_proto_priv.cp_iptun +#define conn_sctp conn_proto_priv.cp_sctp #define conn_priv conn_proto_priv.cp_priv kcondvar_t conn_cv; - uint8_t conn_ulp; /* protocol type */ + uint8_t conn_proto; /* protocol type */ edesc_rpf conn_recv; /* Pointer to recv routine */ + edesc_rpf conn_recvicmp; /* For ICMP error */ + edesc_vpf conn_verifyicmp; /* Verify ICMP error */ + + ip_xmit_attr_t *conn_ixa; /* Options if no ancil data */ /* Fields after this are bzero'ed when the conn_t is freed. 
*/ +#define conn_start_clr conn_recv_ancillary + + /* Options for receive-side ancillary data */ + crb_t conn_recv_ancillary; squeue_t *conn_sqp; /* Squeue for processing */ uint_t conn_state_flags; /* IP state flags */ -#define conn_start_clr conn_state_flags - ire_t *conn_ire_cache; /* outbound ire cache */ + int conn_lingertime; /* linger time (in seconds) */ + unsigned int conn_on_sqp : 1, /* Conn is being processed */ - conn_dontroute : 1, /* SO_DONTROUTE state */ - conn_loopback : 1, /* SO_LOOPBACK state */ + conn_linger : 1, /* SO_LINGER state */ + conn_useloopback : 1, /* SO_USELOOPBACK state */ conn_broadcast : 1, /* SO_BROADCAST state */ conn_reuseaddr : 1, /* SO_REUSEADDR state */ - conn_multicast_loop : 1, /* IP_MULTICAST_LOOP */ + conn_keepalive : 1, /* SO_KEEPALIVE state */ conn_multi_router : 1, /* Wants all multicast pkts */ - conn_draining : 1, /* ip_wsrv running */ - conn_did_putbq : 1, /* ip_wput did a putbq */ + conn_unspec_src : 1, /* IP_UNSPEC_SRC */ conn_policy_cached : 1, /* Is policy cached/latched ? 
*/ conn_in_enforce_policy : 1, /* Enforce Policy on inbound */ - conn_out_enforce_policy : 1, /* Enforce Policy on outbound */ - conn_af_isv6 : 1, /* ip address family ver 6 */ - conn_pkt_isv6 : 1, /* ip packet format ver 6 */ - conn_ip_recvpktinfo : 1, /* IPV*_RECVPKTINFO option */ - - conn_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ - conn_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - conn_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - conn_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - conn_ipv6_recvrtdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ + conn_debug : 1, /* SO_DEBUG */ conn_ipv6_v6only : 1, /* IPV6_V6ONLY */ - conn_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ + conn_oobinline : 1, /* SO_OOBINLINE state */ + conn_dgram_errind : 1, /* SO_DGRAM_ERRIND state */ + + conn_exclbind : 1, /* SO_EXCLBIND state */ + conn_mdt_ok : 1, /* MDT is permitted */ + conn_allzones : 1, /* SO_ALLZONES */ conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ - conn_pathmtu_valid : 1, /* The cached mtu is valid. 
*/ - conn_ipv6_dontfrag : 1, /* IPV6_DONTFRAG */ - conn_fully_bound : 1, /* Fully bound connection */ - conn_recvif : 1, /* IP_RECVIF option */ + conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_recvslla : 1, /* IP_RECVSLLA option */ - conn_mdt_ok : 1, /* MDT is permitted */ - conn_nexthop_set : 1, - conn_allzones : 1; /* SO_ALLZONES */ + conn_pad_to_bit_31 : 11; - unsigned int - conn_lso_ok : 1; /* LSO is usable */ boolean_t conn_direct_blocked; /* conn is flow-controlled */ squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ - ipsec_latch_t *conn_latch; /* latched state */ - ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */ - edesc_spf conn_send; /* Pointer to send routine */ + ipsec_latch_t *conn_latch; /* latched IDS */ + struct ipsec_policy_s *conn_latch_in_policy; /* latched policy (in) */ + struct ipsec_action_s *conn_latch_in_action; /* latched action (in) */ + uint_t conn_bound_if; /* IP*_BOUND_IF */ queue_t *conn_rq; /* Read queue */ queue_t *conn_wq; /* Write queue */ dev_t conn_dev; /* Minor number */ @@ -275,80 +313,137 @@ struct conn_s { ip_helper_stream_info_t *conn_helper_info; cred_t *conn_cred; /* Credentials */ + pid_t conn_cpid; /* pid from open/connect */ + uint64_t conn_open_time; /* time when this was opened */ + connf_t *conn_g_fanout; /* Global Hash bucket head */ struct conn_s *conn_g_next; /* Global Hash chain next */ struct conn_s *conn_g_prev; /* Global Hash chain prev */ struct ipsec_policy_head_s *conn_policy; /* Configured policy */ - in6_addr_t conn_bound_source_v6; -#define conn_bound_source V4_PART_OF_V6(conn_bound_source_v6) - + in6_addr_t conn_bound_addr_v6; /* Address in bind() */ +#define conn_bound_addr_v4 V4_PART_OF_V6(conn_bound_addr_v6) connf_t *conn_fanout; /* Hash bucket we're part of */ struct conn_s *conn_next; /* Hash chain next */ struct conn_s *conn_prev; /* Hash chain prev */ + struct { - in6_addr_t 
connua_laddr; /* Local address */ + in6_addr_t connua_laddr; /* Local address - match */ in6_addr_t connua_faddr; /* Remote address */ } connua_v6addr; -#define conn_src V4_PART_OF_V6(connua_v6addr.connua_laddr) -#define conn_rem V4_PART_OF_V6(connua_v6addr.connua_faddr) -#define conn_srcv6 connua_v6addr.connua_laddr -#define conn_remv6 connua_v6addr.connua_faddr +#define conn_laddr_v4 V4_PART_OF_V6(connua_v6addr.connua_laddr) +#define conn_faddr_v4 V4_PART_OF_V6(connua_v6addr.connua_faddr) +#define conn_laddr_v6 connua_v6addr.connua_laddr +#define conn_faddr_v6 connua_v6addr.connua_faddr + in6_addr_t conn_saddr_v6; /* Local address - source */ +#define conn_saddr_v4 V4_PART_OF_V6(conn_saddr_v6) + union { /* Used for classifier match performance */ - uint32_t conn_ports2; + uint32_t connu_ports2; struct { - in_port_t tcpu_fport; /* Remote port */ - in_port_t tcpu_lport; /* Local port */ - } tcpu_ports; + in_port_t connu_fport; /* Remote port */ + in_port_t connu_lport; /* Local port */ + } connu_ports; } u_port; -#define conn_fport u_port.tcpu_ports.tcpu_fport -#define conn_lport u_port.tcpu_ports.tcpu_lport -#define conn_ports u_port.conn_ports2 -#define conn_upq conn_rq - uint8_t conn_unused_byte; - - uint_t conn_proto; /* SO_PROTOTYPE state */ - ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */ +#define conn_fport u_port.connu_ports.connu_fport +#define conn_lport u_port.connu_ports.connu_lport +#define conn_ports u_port.connu_ports2 + + uint_t conn_incoming_ifindex; /* IP{,V6}_BOUND_IF, scopeid */ ill_t *conn_oper_pending_ill; /* pending shared ioctl */ - ilg_t *conn_ilg; /* Group memberships */ - int conn_ilg_allocated; /* Number allocated */ - int conn_ilg_inuse; /* Number currently used */ - int conn_ilg_walker_cnt; /* No of ilg walkers */ - /* XXXX get rid of this, once ilg_delete_all is fixed */ - kcondvar_t conn_refcv; - - struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */ - ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */ - struct conn_s 
*conn_drain_next; /* Next conn in drain list */ - struct conn_s *conn_drain_prev; /* Prev conn in drain list */ + krwlock_t conn_ilg_lock; /* Protects conn_ilg_* */ + ilg_t *conn_ilg; /* Group memberships */ + + kcondvar_t conn_refcv; /* For conn_oper_pending_ill */ + + struct conn_s *conn_drain_next; /* Next conn in drain list */ + struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */ - uint32_t conn_src_preferences; /* prefs for src addr select */ - /* mtuinfo from IPV6_PACKET_TOO_BIG conditional on conn_pathmtu_valid */ - struct ip6_mtuinfo mtuinfo; zoneid_t conn_zoneid; /* zone connection is in */ - in6_addr_t conn_nexthop_v6; /* nexthop IP address */ - uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */ -#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) - cred_t *conn_effective_cred; /* Effective TX credentials */ int conn_rtaware; /* RT_AWARE sockopt value */ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ - kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ sock_upper_handle_t conn_upper_handle; /* Upper handle: sonode * */ unsigned int - conn_ulp_labeled : 1, /* ULP label is synced */ conn_mlp_type : 2, /* mlp_type_t; tsol/tndb.h */ conn_anon_mlp : 1, /* user wants anon MLP */ - conn_anon_port : 1, /* user bound anonymously */ + conn_mac_mode : 2, /* normal/loose/implicit MAC */ - conn_spare : 26; + conn_anon_priv_bind : 1, /* *_ANON_PRIV_BIND state */ + conn_zone_is_global : 1, /* GLOBAL_ZONEID */ + conn_spare : 24; boolean_t conn_flow_cntrld; netstack_t *conn_netstack; /* Corresponds to a netstack_hold */ + + /* + * IP format that packets received for this struct should use. + * Value can be IP4_VERSION or IPV6_VERSION. + * The sending version is encoded using IXAF_IS_IPV4. 
+ */ + ushort_t conn_ipversion; + + /* Written to only once at the time of opening the endpoint */ + sa_family_t conn_family; /* Family from socket() call */ + uint_t conn_so_type; /* Type from socket() call */ + + uint_t conn_sndbuf; /* SO_SNDBUF state */ + uint_t conn_rcvbuf; /* SO_RCVBUF state */ + uint_t conn_wroff; /* Current write offset */ + + uint_t conn_sndlowat; /* Send buffer low water mark */ + uint_t conn_rcvlowat; /* Recv buffer low water mark */ + + uint8_t conn_default_ttl; /* Default TTL/hoplimit */ + + uint32_t conn_flowinfo; /* Connected flow id and tclass */ + + /* + * The most recent address for sendto. Initially set to zero + * which is always different than then the destination address + * since the send interprets zero as the loopback address. + */ + in6_addr_t conn_v6lastdst; +#define conn_v4lastdst V4_PART_OF_V6(conn_v6lastdst) + ushort_t conn_lastipversion; + in_port_t conn_lastdstport; + uint32_t conn_lastflowinfo; /* IPv6-only */ + uint_t conn_lastscopeid; /* IPv6-only */ + uint_t conn_lastsrcid; /* Only for AF_INET6 */ + /* + * When we are not connected conn_saddr might be unspecified. + * We track the source that was used with conn_v6lastdst here. + */ + in6_addr_t conn_v6lastsrc; +#define conn_v4lastsrc V4_PART_OF_V6(conn_v6lastsrc) + + /* Templates for transmitting packets */ + ip_pkt_t conn_xmit_ipp; /* Options if no ancil data */ + + /* + * Header template - conn_ht_ulp is a pointer into conn_ht_iphc. + * Note that ixa_ip_hdr_length indicates the offset of ht_ulp in + * ht_iphc + * + * The header template is maintained for connected endpoints (and + * updated when sticky options are changed) and also for the lastdst. + * There is no conflict between those usages since SOCK_DGRAM and + * SOCK_RAW can not be used to specify a destination address (with + * sendto/sendmsg) if the socket has been connected. 
+ */ + uint8_t *conn_ht_iphc; /* Start of IP header */ + uint_t conn_ht_iphc_allocated; /* Allocated buffer size */ + uint_t conn_ht_iphc_len; /* IP+ULP size */ + uint8_t *conn_ht_ulp; /* Upper-layer header */ + uint_t conn_ht_ulp_len; /* ULP header len */ + + /* Checksum to compensate for source routed packets. Host byte order */ + uint32_t conn_sum; + #ifdef CONN_DEBUG #define CONN_TRACE_MAX 10 int conn_trace_last; /* ndx of last used tracebuf */ @@ -357,18 +452,6 @@ struct conn_s { }; /* - * These two macros are used by TX. First priority is SCM_UCRED having - * set the label in the mblk. Second priority is the open credentials with - * peer's label (aka conn_effective_cred). Last priority is the open - * credentials. BEST_CRED takes all three into account in the above order. - * CONN_CRED is for connection-oriented cases when we don't need to look - * at the mblk. - */ -#define CONN_CRED(connp) ((connp)->conn_effective_cred == NULL ? \ - (connp)->conn_cred : (connp)->conn_effective_cred) -#define BEST_CRED(mp, connp, pidp) ip_best_cred(mp, connp, pidp) - -/* * connf_t - connection fanout data. * * The hash tables and their linkage (conn_t.{hashnextp, hashprevp} are @@ -461,29 +544,22 @@ struct connf_s { /* - * IPCL_PROTO_MATCH() only matches conns with the specified zoneid, while - * IPCL_PROTO_MATCH_V6() can match other conns in the multicast case, see - * ip_fanout_proto(). + * IPCL_PROTO_MATCH() and IPCL_PROTO_MATCH_V6() only matches conns with + * the specified ira_zoneid or conn_allzones by calling conn_wantpacket. 
*/ -#define IPCL_PROTO_MATCH(connp, protocol, ipha, ill, \ - fanout_flags, zoneid) \ - ((((connp)->conn_src == INADDR_ANY) || \ - (((connp)->conn_src == ((ipha)->ipha_dst)) && \ - (((connp)->conn_rem == INADDR_ANY) || \ - ((connp)->conn_rem == ((ipha)->ipha_src))))) && \ - IPCL_ZONE_MATCH(connp, zoneid) && \ - (conn_wantpacket((connp), (ill), (ipha), (fanout_flags), \ - (zoneid)) || ((protocol) == IPPROTO_PIM) || \ - ((protocol) == IPPROTO_RSVP))) - -#define IPCL_PROTO_MATCH_V6(connp, protocol, ip6h, ill, \ - fanout_flags, zoneid) \ - ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &((ip6h)->ip6_dst)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &((ip6h)->ip6_src))))) && \ - (conn_wantpacket_v6((connp), (ill), (ip6h), \ - (fanout_flags), (zoneid)) || ((protocol) == IPPROTO_RSVP))) +#define IPCL_PROTO_MATCH(connp, ira, ipha) \ + ((((connp)->conn_laddr_v4 == INADDR_ANY) || \ + (((connp)->conn_laddr_v4 == ((ipha)->ipha_dst)) && \ + (((connp)->conn_faddr_v4 == INADDR_ANY) || \ + ((connp)->conn_faddr_v4 == ((ipha)->ipha_src))))) && \ + conn_wantpacket((connp), (ira), (ipha))) + +#define IPCL_PROTO_MATCH_V6(connp, ira, ip6h) \ + ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &((ip6h)->ip6_dst)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &((ip6h)->ip6_src))))) && \ + (conn_wantpacket_v6((connp), (ira), (ip6h)))) #define IPCL_CONN_HASH(src, ports, ipst) \ ((unsigned)(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ @@ -493,31 +569,17 @@ struct connf_s { IPCL_CONN_HASH(V4_PART_OF_V6((src)), (ports), (ipst)) #define IPCL_CONN_MATCH(connp, proto, src, dst, ports) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_ports == (ports) && \ - _IPCL_V4_MATCH((connp)->conn_remv6, (src)) && \ - 
_IPCL_V4_MATCH((connp)->conn_srcv6, (dst)) && \ + _IPCL_V4_MATCH((connp)->conn_faddr_v6, (src)) && \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (dst)) && \ !(connp)->conn_ipv6_v6only) #define IPCL_CONN_MATCH_V6(connp, proto, src, dst, ports) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_ports == (ports) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(src)) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(dst))) - -#define IPCL_CONN_INIT(connp, protocol, src, rem, ports) { \ - (connp)->conn_ulp = protocol; \ - IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \ - IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \ - (connp)->conn_ports = ports; \ -} - -#define IPCL_CONN_INIT_V6(connp, protocol, src, rem, ports) { \ - (connp)->conn_ulp = protocol; \ - (connp)->conn_srcv6 = src; \ - (connp)->conn_remv6 = rem; \ - (connp)->conn_ports = ports; \ -} + IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(src)) && \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(dst))) #define IPCL_PORT_HASH(port, size) \ ((((port) >> 8) ^ (port)) & ((size) - 1)) @@ -527,33 +589,45 @@ struct connf_s { (ipst)->ips_ipcl_bind_fanout_size) #define IPCL_BIND_MATCH(connp, proto, laddr, lport) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == (lport) && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr))) && \ + (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr))) && \ !(connp)->conn_ipv6_v6only) #define IPCL_BIND_MATCH_V6(connp, proto, laddr, lport) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == (lport) && \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) || \ - IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6))) + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) || \ + IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6))) +/* + * We compare conn_laddr 
since it captures both connected and a bind to + * a multicast or broadcast address. + * The caller needs to match the zoneid and also call conn_wantpacket + * for multicast, broadcast, or when conn_incoming_ifindex is set. + */ #define IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr) \ (((connp)->conn_lport == (lport)) && \ - ((_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_remv6) || \ - (_IPCL_V4_MATCH((connp)->conn_remv6, (faddr)) && \ + ((_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \ + (_IPCL_V4_MATCH_ANY((connp)->conn_faddr_v6) || \ + (_IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr)) && \ (connp)->conn_fport == (fport)))))) && \ !(connp)->conn_ipv6_v6only) +/* + * We compare conn_laddr since it captures both connected and a bind to + * a multicast or broadcast address. + * The caller needs to match the zoneid and also call conn_wantpacket_v6 + * for multicast or when conn_incoming_ifindex is set. 
+ */ #define IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr) \ (((connp)->conn_lport == (lport)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(faddr)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(faddr)) && \ (connp)->conn_fport == (fport)))))) #define IPCL_IPTUN_HASH(laddr, faddr) \ @@ -567,32 +641,12 @@ struct connf_s { (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) #define IPCL_IPTUN_MATCH(connp, laddr, faddr) \ - (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \ - _IPCL_V4_MATCH((connp)->conn_remv6, (faddr))) + (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \ + _IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr))) #define IPCL_IPTUN_MATCH_V6(connp, laddr, faddr) \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, (laddr)) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, (faddr))) - -#define IPCL_TCP_EAGER_INIT(connp, protocol, src, rem, ports) { \ - (connp)->conn_flags |= (IPCL_TCP4|IPCL_EAGER); \ - IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \ - IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \ - (connp)->conn_ports = ports; \ - (connp)->conn_send = ip_output; \ - (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ - (connp)->conn_initial_sqp = (connp)->conn_sqp; \ -} - -#define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \ - (connp)->conn_flags |= (IPCL_TCP6|IPCL_EAGER); \ - (connp)->conn_srcv6 = src; \ - (connp)->conn_remv6 = rem; \ - (connp)->conn_ports = ports; \ - (connp)->conn_send = ip_output_v6; \ - (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ - (connp)->conn_initial_sqp = (connp)->conn_sqp; \ -} + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, (laddr)) && \ + 
IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, (faddr))) #define IPCL_UDP_HASH(lport, ipst) \ IPCL_PORT_HASH(lport, (ipst)->ips_ipcl_udp_fanout_size) @@ -606,18 +660,20 @@ struct connf_s { /* * This is similar to IPCL_BIND_MATCH except that the local port check * is changed to a wildcard port check. + * We compare conn_laddr since it captures both connected and a bind to + * a multicast or broadcast address. */ #define IPCL_RAW_MATCH(connp, proto, laddr) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == 0 && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)))) + (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)))) #define IPCL_RAW_MATCH_V6(connp, proto, laddr) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == 0 && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)))) + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)))) /* Function prototypes */ extern void ipcl_g_init(void); @@ -631,28 +687,27 @@ void ipcl_hash_insert_wildcard(connf_t *, conn_t *); void ipcl_hash_remove(conn_t *); void ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp); -extern int ipcl_bind_insert(conn_t *, uint8_t, ipaddr_t, uint16_t); -extern int ipcl_bind_insert_v6(conn_t *, uint8_t, const in6_addr_t *, - uint16_t); -extern int ipcl_conn_insert(conn_t *, uint8_t, ipaddr_t, ipaddr_t, - uint32_t); -extern int ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *, - const in6_addr_t *, uint32_t, uint_t); +extern int ipcl_bind_insert(conn_t *); +extern int ipcl_bind_insert_v4(conn_t *); +extern int ipcl_bind_insert_v6(conn_t *); +extern int ipcl_conn_insert(conn_t *); +extern int ipcl_conn_insert_v4(conn_t *); +extern int ipcl_conn_insert_v6(conn_t *); extern conn_t 
*ipcl_get_next_conn(connf_t *, conn_t *, uint32_t); -void ipcl_proto_insert(conn_t *, uint8_t); -void ipcl_proto_insert_v6(conn_t *, uint8_t); -conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify(mblk_t *, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify_raw(mblk_t *, uint8_t, zoneid_t, uint32_t, ipha_t *, +conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *, + ip_stack_t *); +conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *, ip_stack_t *); +conn_t *ipcl_classify(mblk_t *, ip_recv_attr_t *, ip_stack_t *); +conn_t *ipcl_classify_raw(mblk_t *, uint8_t, uint32_t, ipha_t *, + ip6_t *, ip_recv_attr_t *, ip_stack_t *); conn_t *ipcl_iptun_classify_v4(ipaddr_t *, ipaddr_t *, ip_stack_t *); conn_t *ipcl_iptun_classify_v6(in6_addr_t *, in6_addr_t *, ip_stack_t *); void ipcl_globalhash_insert(conn_t *); void ipcl_globalhash_remove(conn_t *); void ipcl_walk(pfv_t, void *, ip_stack_t *); -conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack_t *); +conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack_t *); conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, ip_stack_t *); conn_t *ipcl_lookup_listener_v4(uint16_t, ipaddr_t, zoneid_t, ip_stack_t *); @@ -661,17 +716,19 @@ conn_t *ipcl_lookup_listener_v6(uint16_t, in6_addr_t *, uint_t, zoneid_t, int conn_trace_ref(conn_t *); int conn_untrace_ref(conn_t *); void ipcl_conn_cleanup(conn_t *); -conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcph_t *, +extern uint_t conn_recvancillary_size(conn_t *, crb_t, ip_recv_attr_t *, + mblk_t *, ip_pkt_t *); +extern void conn_recvancillary_add(conn_t *, crb_t, ip_recv_attr_t *, + ip_pkt_t *, uchar_t *, uint_t); +conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcpha_t *, ip_stack_t *); -conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, 
tcph_t *, +conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcpha_t *, ip_stack_t *); -extern int ip_create_helper_stream(conn_t *connp, ldi_ident_t li); -extern void ip_free_helper_stream(conn_t *connp); - -extern int ip_get_options(conn_t *, int, int, void *, t_uscalar_t *, cred_t *); -extern int ip_set_options(conn_t *, int, int, const void *, t_uscalar_t, - cred_t *); +extern int ip_create_helper_stream(conn_t *, ldi_ident_t); +extern void ip_free_helper_stream(conn_t *); +extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, + cred_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h index 153c9c1925..74fe8cfd94 100644 --- a/usr/src/uts/common/inet/ipdrop.h +++ b/usr/src/uts/common/inet/ipdrop.h @@ -41,8 +41,10 @@ typedef struct ipdropper_s { void ip_drop_register(ipdropper_t *, char *); void ip_drop_unregister(ipdropper_t *); -void ip_drop_packet(mblk_t *, boolean_t, ill_t *, ire_t *, struct kstat_named *, +void ip_drop_packet(mblk_t *, boolean_t, ill_t *, struct kstat_named *, ipdropper_t *); +void ip_drop_input(char *, mblk_t *, ill_t *); +void ip_drop_output(char *, mblk_t *, ill_t *); /* * ip_dropstats - When a protocol developer comes up with a new reason to diff --git a/usr/src/uts/common/inet/ipp_common.h b/usr/src/uts/common/inet/ipp_common.h index 9ac9837f66..d7380896b6 100644 --- a/usr/src/uts/common/inet/ipp_common.h +++ b/usr/src/uts/common/inet/ipp_common.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPP_COMMON_H #define _INET_IPP_COMMON_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -49,14 +47,6 @@ extern uint32_t ipp_action_count; #define IPP_ENABLED(proc, ipst) ((ipp_action_count != 0) && \ (~((ipst)->ips_ip_policy_mask) & (proc))) -/* Apply IPQoS policies for inbound traffic? */ -#define IP6_IN_IPP(flags, ipst) (IPP_ENABLED(IPP_LOCAL_IN, ipst) && \ - (!((flags) & IP6_NO_IPPOLICY))) - -/* Apply IPQoS policies for oubound traffic? */ -#define IP6_OUT_IPP(flags, ipst) \ - (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && (!((flags) & IP6_NO_IPPOLICY))) - /* Extracts 8 bit traffic class from IPV6 flow label field */ #ifdef _BIG_ENDIAN #define __IPV6_TCLASS_FROM_FLOW(n) (((n)>>20) & 0xff) @@ -78,7 +68,9 @@ typedef struct ip_priv { } ip_priv_t; /* The entry point for ip policy processing */ -extern void ip_process(ip_proc_t, mblk_t **, uint32_t); +#ifdef ILL_CONDEMNED +extern mblk_t *ip_process(ip_proc_t, mblk_t *, ill_t *, ill_t *); +#endif extern void ip_priv_free(void *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipsec_impl.h b/usr/src/uts/common/inet/ipsec_impl.h index c5fa9367fe..228e01008d 100644 --- a/usr/src/uts/common/inet/ipsec_impl.h +++ b/usr/src/uts/common/inet/ipsec_impl.h @@ -410,24 +410,25 @@ struct ipsec_policy_s uint32_t ipsp_refs; ipsec_sel_t *ipsp_sel; /* selector set (shared) */ ipsec_action_t *ipsp_act; /* action (may be shared) */ + netstack_t *ipsp_netstack; /* No netstack_hold */ }; #define IPPOL_REFHOLD(ipp) { \ atomic_add_32(&(ipp)->ipsp_refs, 1); \ ASSERT((ipp)->ipsp_refs != 0); \ } -#define IPPOL_REFRELE(ipp, ns) { \ +#define IPPOL_REFRELE(ipp) { \ ASSERT((ipp)->ipsp_refs != 0); \ membar_exit(); \ if (atomic_add_32_nv(&(ipp)->ipsp_refs, -1) == 0) \ - ipsec_policy_free(ipp, ns); \ + ipsec_policy_free(ipp); \ (ipp) = 0; \ } -#define IPPOL_UNCHAIN(php, ip, ns) \ - HASHLIST_UNCHAIN((ip), ipsp_hash); \ - avl_remove(&(php)->iph_rulebyid, (ip)); \ - IPPOL_REFRELE(ip, ns); +#define 
IPPOL_UNCHAIN(php, ip) \ + HASHLIST_UNCHAIN((ip), ipsp_hash); \ + avl_remove(&(php)->iph_rulebyid, (ip)); \ + IPPOL_REFRELE(ip); /* * Policy ruleset. One per (protocol * direction) for system policy. @@ -590,8 +591,6 @@ typedef struct ipsid_s atomic_add_32(&(ipsid)->ipsid_refcnt, -1); \ } -struct ipsec_out_s; - /* * Following are the estimates of what the maximum AH and ESP header size * would be. This is used to tell the upper layer the right value of MSS @@ -708,6 +707,17 @@ typedef struct ipsif_s kmutex_t ipsif_lock; } ipsif_t; +/* + * For call to the kernel crypto framework. State needed during + * the execution of a crypto request. + */ +typedef struct ipsec_crypto_s { + size_t ic_skip_len; /* len to skip for AH auth */ + crypto_data_t ic_crypto_data; /* single op crypto data */ + crypto_dual_data_t ic_crypto_dual_data; /* for dual ops */ + crypto_data_t ic_crypto_mac; /* to store the MAC */ + ipsa_cm_mech_t ic_cmm; +} ipsec_crypto_t; /* * IPsec stack instances @@ -826,45 +836,40 @@ extern boolean_t ipsec_loaded(ipsec_stack_t *); extern boolean_t ipsec_failed(ipsec_stack_t *); /* - * callback from ipsec_loader to ip - */ -extern void ip_ipsec_load_complete(ipsec_stack_t *); - -/* * ipsec policy entrypoints (spd.c) */ extern void ipsec_policy_g_destroy(void); extern void ipsec_policy_g_init(void); +extern mblk_t *ipsec_add_crypto_data(mblk_t *, ipsec_crypto_t **); +extern mblk_t *ipsec_remove_crypto_data(mblk_t *, ipsec_crypto_t **); +extern mblk_t *ipsec_free_crypto_data(mblk_t *); extern int ipsec_alloc_table(ipsec_policy_head_t *, int, int, boolean_t, netstack_t *); extern void ipsec_polhead_init(ipsec_policy_head_t *, int); extern void ipsec_polhead_destroy(ipsec_policy_head_t *); extern void ipsec_polhead_free_table(ipsec_policy_head_t *); extern mblk_t *ipsec_check_global_policy(mblk_t *, conn_t *, ipha_t *, - ip6_t *, boolean_t, netstack_t *); + ip6_t *, ip_recv_attr_t *, netstack_t *ns); extern mblk_t *ipsec_check_inbound_policy(mblk_t *, conn_t *, 
ipha_t *, ip6_t *, - boolean_t); + ip_recv_attr_t *); -extern boolean_t ipsec_in_to_out(mblk_t *, ipha_t *, ip6_t *, zoneid_t); +extern boolean_t ipsec_in_to_out(ip_recv_attr_t *, ip_xmit_attr_t *, + mblk_t *, ipha_t *, ip6_t *); +extern void ipsec_in_release_refs(ip_recv_attr_t *); +extern void ipsec_out_release_refs(ip_xmit_attr_t *); extern void ipsec_log_policy_failure(int, char *, ipha_t *, ip6_t *, boolean_t, - netstack_t *); + netstack_t *); extern boolean_t ipsec_inbound_accept_clear(mblk_t *, ipha_t *, ip6_t *); extern int ipsec_conn_cache_policy(conn_t *, boolean_t); -extern mblk_t *ipsec_alloc_ipsec_out(netstack_t *); -extern mblk_t *ipsec_attach_ipsec_out(mblk_t **, conn_t *, ipsec_policy_t *, - uint8_t, netstack_t *); -extern mblk_t *ipsec_init_ipsec_out(mblk_t *, mblk_t **, conn_t *, - ipsec_policy_t *, uint8_t, netstack_t *); -struct ipsec_in_s; -extern ipsec_action_t *ipsec_in_to_out_action(struct ipsec_in_s *); -extern boolean_t ipsec_check_ipsecin_latch(struct ipsec_in_s *, mblk_t *, - struct ipsec_latch_s *, ipha_t *, ip6_t *, const char **, kstat_named_t **, - conn_t *); -extern void ipsec_latch_inbound(ipsec_latch_t *ipl, struct ipsec_in_s *ii); - -extern void ipsec_policy_free(ipsec_policy_t *, netstack_t *); +extern void ipsec_cache_outbound_policy(const conn_t *, const in6_addr_t *, + const in6_addr_t *, in_port_t, ip_xmit_attr_t *); +extern boolean_t ipsec_outbound_policy_current(ip_xmit_attr_t *); +extern ipsec_action_t *ipsec_in_to_out_action(ip_recv_attr_t *); +extern void ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira); + +extern void ipsec_policy_free(ipsec_policy_t *); extern void ipsec_action_free(ipsec_action_t *); extern void ipsec_polhead_free(ipsec_policy_head_t *, netstack_t *); extern ipsec_policy_head_t *ipsec_polhead_split(ipsec_policy_head_t *, @@ -894,12 +899,8 @@ extern void ipsec_actvec_free(ipsec_act_t *, uint_t); extern int ipsec_req_from_head(ipsec_policy_head_t *, ipsec_req_t *, int); extern mblk_t 
*ipsec_construct_inverse_acquire(sadb_msg_t *, sadb_ext_t **, netstack_t *); -extern mblk_t *ip_wput_attach_policy(mblk_t *, ipha_t *, ip6_t *, ire_t *, - conn_t *, boolean_t, zoneid_t); -extern mblk_t *ip_wput_ire_parse_ipsec_out(mblk_t *, ipha_t *, ip6_t *, - ire_t *, conn_t *, boolean_t, zoneid_t); -extern ipsec_policy_t *ipsec_find_policy(int, conn_t *, - struct ipsec_out_s *, ipsec_selector_t *, netstack_t *); +extern ipsec_policy_t *ipsec_find_policy(int, const conn_t *, + ipsec_selector_t *, netstack_t *); extern ipsid_t *ipsid_lookup(int, char *, netstack_t *); extern boolean_t ipsid_equal(ipsid_t *, ipsid_t *); extern void ipsid_gc(netstack_t *); @@ -912,29 +913,29 @@ extern void ipsec_enter_policy(ipsec_policy_head_t *, ipsec_policy_t *, int, netstack_t *); extern boolean_t ipsec_check_action(ipsec_act_t *, int *, netstack_t *); -extern mblk_t *ipsec_out_tag(mblk_t *, mblk_t *, netstack_t *); -extern mblk_t *ipsec_in_tag(mblk_t *, mblk_t *, netstack_t *); -extern mblk_t *ip_copymsg(mblk_t *mp); - -extern void iplatch_free(ipsec_latch_t *, netstack_t *); +extern void iplatch_free(ipsec_latch_t *); extern ipsec_latch_t *iplatch_create(void); extern int ipsec_set_req(cred_t *, conn_t *, ipsec_req_t *); extern void ipsec_insert_always(avl_tree_t *tree, void *new_node); extern int32_t ipsec_act_ovhd(const ipsec_act_t *act); -extern int sadb_whack_label(mblk_t **, ipsa_t *); -extern int sadb_whack_label_v6(mblk_t **, ipsa_t *); +extern mblk_t *sadb_whack_label(mblk_t *, ipsa_t *, ip_xmit_attr_t *, + kstat_named_t *, ipdropper_t *); +extern mblk_t *sadb_whack_label_v4(mblk_t *, ipsa_t *, kstat_named_t *, + ipdropper_t *); +extern mblk_t *sadb_whack_label_v6(mblk_t *, ipsa_t *, kstat_named_t *, + ipdropper_t *); extern boolean_t update_iv(uint8_t *, queue_t *, ipsa_t *, ipsecesp_stack_t *); /* * Tunnel-support SPD functions and variables. */ struct iptun_s; /* Defined in inet/iptun/iptun_impl.h. 
*/ -extern boolean_t ipsec_tun_inbound(mblk_t *, mblk_t **, ipsec_tun_pol_t *, +extern mblk_t *ipsec_tun_inbound(ip_recv_attr_t *, mblk_t *, ipsec_tun_pol_t *, ipha_t *, ip6_t *, ipha_t *, ip6_t *, int, netstack_t *); extern mblk_t *ipsec_tun_outbound(mblk_t *, struct iptun_s *, ipha_t *, - ip6_t *, ipha_t *, ip6_t *, int); + ip6_t *, ipha_t *, ip6_t *, int, ip_xmit_attr_t *); extern void itp_free(ipsec_tun_pol_t *, netstack_t *); extern ipsec_tun_pol_t *create_tunnel_policy(char *, int *, uint64_t *, netstack_t *); @@ -951,9 +952,9 @@ extern ipsec_tun_pol_t *itp_get_byaddr(uint32_t *, uint32_t *, int, */ extern void ipsecah_in_assocfailure(mblk_t *, char, ushort_t, char *, - uint32_t, void *, int, ipsecah_stack_t *); + uint32_t, void *, int, ip_recv_attr_t *ira); extern void ipsecesp_in_assocfailure(mblk_t *, char, ushort_t, char *, - uint32_t, void *, int, ipsecesp_stack_t *); + uint32_t, void *, int, ip_recv_attr_t *ira); extern void ipsecesp_send_keepalive(ipsa_t *); /* @@ -987,13 +988,8 @@ extern void ipsecah_algs_changed(netstack_t *); extern void ipsecesp_algs_changed(netstack_t *); extern void ipsecesp_init_funcs(ipsa_t *); extern void ipsecah_init_funcs(ipsa_t *); -extern ipsec_status_t ipsecah_icmp_error(mblk_t *); -extern ipsec_status_t ipsecesp_icmp_error(mblk_t *); - -/* - * Wrapper for putnext() to ipsec accelerated interface. - */ -extern void ipsec_hw_putnext(queue_t *, mblk_t *); +extern mblk_t *ipsecah_icmp_error(mblk_t *, ip_recv_attr_t *); +extern mblk_t *ipsecesp_icmp_error(mblk_t *, ip_recv_attr_t *); /* * spdsock functions that are called directly by IP. @@ -1003,11 +999,11 @@ extern void spdsock_update_pending_algs(netstack_t *); /* * IP functions that are called from AH and ESP. 
*/ -extern boolean_t ipsec_outbound_sa(mblk_t *, uint_t); -extern esph_t *ipsec_inbound_esp_sa(mblk_t *, netstack_t *); -extern ah_t *ipsec_inbound_ah_sa(mblk_t *, netstack_t *); +extern boolean_t ipsec_outbound_sa(mblk_t *, ip_xmit_attr_t *, uint_t); +extern mblk_t *ipsec_inbound_esp_sa(mblk_t *, ip_recv_attr_t *, esph_t **); +extern mblk_t *ipsec_inbound_ah_sa(mblk_t *, ip_recv_attr_t *, ah_t **); extern ipsec_policy_t *ipsec_find_policy_head(ipsec_policy_t *, - ipsec_policy_head_t *, int, ipsec_selector_t *, netstack_t *); + ipsec_policy_head_t *, int, ipsec_selector_t *); /* * IP dropper init/destroy. @@ -1019,7 +1015,7 @@ void ip_drop_destroy(ipsec_stack_t *); * Common functions */ extern boolean_t ip_addr_match(uint8_t *, int, in6_addr_t *); -extern boolean_t ipsec_label_match(cred_t *, cred_t *); +extern boolean_t ipsec_label_match(ts_label_t *, ts_label_t *); /* * AH and ESP counters types. diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h index 3c7ede8405..c1bde9fcb7 100644 --- a/usr/src/uts/common/inet/ipsec_info.h +++ b/usr/src/uts/common/inet/ipsec_info.h @@ -34,22 +34,12 @@ extern "C" { /* * IPsec informational messages. These are M_CTL STREAMS messages, which - * convey IPsec information between various IP and related modules. The - * messages come in a few flavors: - * - * * IPSEC_{IN,OUT} - These show what IPsec action have been taken (for - * inbound datagrams), or need to be taken (for outbound datagrams). - * They flow between AH/ESP and IP. + * convey IPsec information between various IP and related modules. Most + * have been deprecated by the de-STREAMS-ing of TCP/IP. What remains is: * * * Keysock consumer interface - These messages are wrappers for * PF_KEY messages. They flow between AH/ESP and keysock. * - * Some of these messages include pointers such as a netstack_t pointer. 
- * We do not explicitly reference count those with netstack_hold/rele, - * since we depend on IP's ability to discard all of the IPSEC_{IN,OUT} - * messages in order to handle the ipsa pointers. - * We have special logic when doing asynch callouts to kEF for which we - * verify netstack_t pointer using the netstackid_t. */ /* @@ -69,223 +59,11 @@ extern "C" { * M_CTL types for IPsec messages. Remember, the values 0x40 - 0x4f and 0x60 * - 0x6f are not to be used because of potential little-endian confusion. * - * Offsets 1-25 (decimal) are in use, spread through this file. + * Offsets 3-7 (decimal) are in use, spread through this file. * Check for duplicates through the whole file before adding. */ /* - * IPSEC_{IN,OUT} policy expressors. - */ -#define IPSEC_IN (IPSEC_M_CTL + 1) -#define IPSEC_OUT (IPSEC_M_CTL + 2) -#define MAXSALTSIZE 8 - -/* - * For combined mode ciphers, store the crypto_mechanism_t in the - * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS - * and nonce values change for each packet. For non-combined mode - * ciphers, these values are constant for the life of the SA. 
- */ -typedef struct ipsa_cm_mech_s { - crypto_mechanism_t combined_mech; - union { - CK_AES_CCM_PARAMS paramu_ccm; - CK_AES_GCM_PARAMS paramu_gcm; - } paramu; - uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)]; -#define param_ulMACSize paramu.paramu_ccm.ulMACSize -#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize -#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize -#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize -#define param_nonce paramu.paramu_ccm.nonce -#define param_authData paramu.paramu_ccm.authData -#define param_pIv paramu.paramu_gcm.ipsa_pIv -#define param_ulIvLen paramu.paramu_gcm.ulIvLen -#define param_ulIvBits paramu.paramu_gcm.ulIvBits -#define param_pAAD paramu.paramu_gcm.pAAD -#define param_ulAADLen paramu.paramu_gcm.ulAADLen -#define param_ulTagBits paramu.paramu_gcm.ulTagBits -} ipsa_cm_mech_t; - -/* - * This is used for communication between IP and IPSEC (AH/ESP) - * for Inbound datagrams. IPSEC_IN is allocated by IP before IPSEC - * processing begins. On return spi fields are initialized so that - * IP can locate the security associations later on for doing policy - * checks. For loopback case, IPSEC processing is not done. But the - * attributes of the security are reflected in <foo>_done fields below. - * The code in policy check infers that it is a loopback case and - * would not try to get the associations. - * - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. 
- */ -typedef struct ipsec_in_s { - uint32_t ipsec_in_type; - uint32_t ipsec_in_len; - frtn_t ipsec_in_frtn; /* for esballoc() callback */ - struct ipsa_s *ipsec_in_ah_sa; /* SA for AH */ - struct ipsa_s *ipsec_in_esp_sa; /* SA for ESP */ - - struct ipsec_policy_head_s *ipsec_in_policy; - struct ipsec_action_s *ipsec_in_action; /* how we made it in.. */ - unsigned int - ipsec_in_secure : 1, /* Is the message attached secure ? */ - ipsec_in_v4 : 1, /* Is this an ipv4 packet ? */ - ipsec_in_loopback : 1, /* Is this a loopback request ? */ - ipsec_in_dont_check : 1, /* Used by TCP to avoid policy check */ - - ipsec_in_decaps : 1, /* Was this packet decapsulated from */ - /* a matching inner packet? */ - ipsec_in_accelerated : 1, /* hardware accelerated packet */ - - ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */ - /* all should trust this. */ - ipsec_in_pad_bits : 25; - - int ipsec_in_ill_index; /* interface on which ipha_dst was */ - /* configured when pkt was recv'd */ - int ipsec_in_rill_index; /* interface on which pkt was recv'd */ - uint32_t ipsec_in_esp_udp_ports; /* For an ESP-in-UDP packet. */ - mblk_t *ipsec_in_da; /* data attr. for accelerated pkts */ - - /* - * For call to the kernel crypto framework. State needed during - * the execution of a crypto request. Storing these here - * allow us to avoid a separate allocation before calling the - * crypto framework. - */ - size_t ipsec_in_skip_len; /* len to skip for AH auth */ - crypto_data_t ipsec_in_crypto_data; /* single op crypto data */ - crypto_dual_data_t ipsec_in_crypto_dual_data; /* for dual ops */ - crypto_data_t ipsec_in_crypto_mac; /* to store the MAC */ - - zoneid_t ipsec_in_zoneid; /* target zone for the datagram */ - netstack_t *ipsec_in_ns; /* Does not have a netstack_hold */ - ipsa_cm_mech_t ipsec_in_cmm; /* PARAMS for Combined mode mechs */ - netstackid_t ipsec_in_stackid; /* Used while waing for kEF callback */ -} ipsec_in_t; - -#define IPSECOUT_MAX_ADDRLEN 4 /* Max addr len. 
(in 32-bit words) */ -/* - * This is used for communication between IP and IPSEC (AH/ESP) - * for Outbound datagrams. IPSEC_OUT is allocated by IP before IPSEC - * processing begins. On return SA fields are initialized so that - * IP can locate the security associations later on for doing policy - * checks. The policy and the actions associated with this packet are - * stored in the ipsec_out_policy and ipsec_out_act fields respectively. - * IPSEC_OUT is also used to carry non-ipsec information when conn is - * absent or the conn information is lost across the calls to ARP. - * example: message from ARP or from ICMP error routines. - */ -typedef struct ipsec_out_s { - uint32_t ipsec_out_type; - uint32_t ipsec_out_len; - frtn_t ipsec_out_frtn; /* for esballoc() callback */ - struct ipsec_policy_head_s *ipsec_out_polhead; - ipsec_latch_t *ipsec_out_latch; - struct ipsec_policy_s *ipsec_out_policy; /* why are we here? */ - struct ipsec_action_s *ipsec_out_act; /* what do we want? */ - struct ipsa_s *ipsec_out_ah_sa; /* AH SA used for the packet */ - struct ipsa_s *ipsec_out_esp_sa; /* ESP SA used for the packet */ - /* - * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can - * be zero, and the protocol number is needed to make the ports - * significant. - */ - uint16_t ipsec_out_src_port; /* Source port number of d-gram. */ - uint16_t ipsec_out_dst_port; /* Destination port number of d-gram. */ - uint8_t ipsec_out_icmp_type; /* ICMP type of d-gram */ - uint8_t ipsec_out_icmp_code; /* ICMP code of d-gram */ - - sa_family_t ipsec_out_inaf; /* Inner address family */ - uint32_t ipsec_out_insrc[IPSECOUT_MAX_ADDRLEN]; /* Inner src address */ - uint32_t ipsec_out_indst[IPSECOUT_MAX_ADDRLEN]; /* Inner dest address */ - uint8_t ipsec_out_insrcpfx; /* Inner source prefix */ - uint8_t ipsec_out_indstpfx; /* Inner destination prefix */ - - uint_t ipsec_out_ill_index; /* ill index used for multicast etc. 
*/ - uint8_t ipsec_out_proto; /* IP protocol number for d-gram. */ - unsigned int - ipsec_out_tunnel : 1, /* Tunnel mode? */ - ipsec_out_use_global_policy : 1, /* Inherit global policy ? */ - ipsec_out_secure : 1, /* Is this secure ? */ - ipsec_out_proc_begin : 1, /* IPSEC processing begun */ - /* - * Following five values reflects the values stored - * in conn. - */ - ipsec_out_multicast_loop : 1, - ipsec_out_dontroute : 1, - ipsec_out_reserved : 1, - ipsec_out_v4 : 1, - - ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */ - ipsec_out_reachable : 1, /* NDP reachability info */ - ipsec_out_failed: 1, - ipsec_out_se_done: 1, - - ipsec_out_esp_done: 1, - ipsec_out_ah_done: 1, - ipsec_out_need_policy: 1, - - /* - * To indicate that packet must be accelerated, i.e. - * ICV or encryption performed, by Provider. - */ - ipsec_out_accelerated : 1, - /* - * Used by IP to tell IPsec that the outbound ill for this - * packet supports acceleration of the AH or ESP prototocol. - * If set, ipsec_out_capab_ill_index contains the - * index of the ill. - */ - ipsec_out_is_capab_ill : 1, - /* - * Indicates ICMP message destined for self. These - * messages are to be trusted by all receivers. - */ - ipsec_out_icmp_loopback: 1, - ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */ - ipsec_out_pad_bits : 13; - cred_t *ipsec_out_cred; - uint32_t ipsec_out_capab_ill_index; - - /* - * For call to the kernel crypto framework. State needed during - * the execution of a crypto request. Storing these here - * allow us to avoid a separate allocation before calling the - * crypto framework. 
- */ - size_t ipsec_out_skip_len; /* len to skip for AH auth */ - crypto_data_t ipsec_out_crypto_data; /* single op crypto data */ - crypto_dual_data_t ipsec_out_crypto_dual_data; /* for dual ops */ - crypto_data_t ipsec_out_crypto_mac; /* to store the MAC */ - - zoneid_t ipsec_out_zoneid; /* source zone for the datagram */ - in6_addr_t ipsec_out_nexthop_v6; /* nexthop IP address */ -#define ipsec_out_nexthop_addr V4_PART_OF_V6(ipsec_out_nexthop_v6) - netstack_t *ipsec_out_ns; /* Does not have a netstack_hold */ - netstackid_t ipsec_out_stackid; /* Used while waing for kEF callback */ - ipsa_cm_mech_t ipsec_out_cmm; /* PARAMS for Combined mode mechs */ -} ipsec_out_t; - -/* - * This is used to mark the ipsec_out_t *req* fields - * when the operation is done without affecting the - * requests. - */ -#define IPSEC_REQ_DONE 0x80000000 -/* - * Operation could not be performed by the AH/ESP - * module. - */ -#define IPSEC_REQ_FAILED 0x40000000 - -/* * Keysock consumer interface. * * The driver/module keysock (which is a driver to PF_KEY sockets, but is @@ -368,32 +146,6 @@ typedef struct keysock_out_err_s { } keysock_out_err_t; /* - * M_CTL message type for sending inbound pkt information between IP & ULP. - * These are _not_ related to IPsec in any way, but are here so that there is - * one place where all these values are defined which makes it easier to track. - * The choice of this value has the same rationale as explained above. - */ -#define IN_PKTINFO (IPSEC_M_CTL + 24) - - -/* - * IPSEC_CTL messages are used by IPsec to send control type requests - * to IP. Such a control message is currently used by IPsec to request - * that IP send the contents of an IPsec SA or the entire SADB to - * every IPsec hardware acceleration capable provider. 
- */ - -#define IPSEC_CTL (IPSEC_M_CTL + 25) - -typedef struct ipsec_ctl_s { - uint32_t ipsec_ctl_type; - uint32_t ipsec_ctl_len; - uint_t ipsec_ctl_sa_type; - void *ipsec_ctl_sa; -} ipsec_ctl_t; - - -/* * All IPsec informational messages are placed into the ipsec_info_t * union, so that allocation can be done once, and IPsec informational * messages can be recycled. @@ -403,13 +155,10 @@ typedef union ipsec_info_u { uint32_t ipsec_allu_type; uint32_t ipsec_allu_len; /* In bytes */ } ipsec_allu; - ipsec_in_t ipsec_in; - ipsec_out_t ipsec_out; keysock_hello_ack_t keysock_hello_ack; keysock_in_t keysock_in; keysock_out_t keysock_out; keysock_out_err_t keysock_out_err; - ipsec_ctl_t ipsec_ctl; } ipsec_info_t; #define ipsec_info_type ipsec_allu.ipsec_allu_type #define ipsec_info_len ipsec_allu.ipsec_allu_len diff --git a/usr/src/uts/common/inet/ipsecah.h b/usr/src/uts/common/inet/ipsecah.h index c389664164..cde745da88 100644 --- a/usr/src/uts/common/inet/ipsecah.h +++ b/usr/src/uts/common/inet/ipsecah.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPSECAH_H #define _INET_IPSECAH_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <inet/ip.h> #include <inet/ipdrop.h> @@ -62,9 +60,6 @@ typedef struct ah_kstats_s kstat_named_t ah_stat_acquire_requests; kstat_named_t ah_stat_bytes_expired; kstat_named_t ah_stat_out_discards; - kstat_named_t ah_stat_in_accelerated; - kstat_named_t ah_stat_out_accelerated; - kstat_named_t ah_stat_noaccel; kstat_named_t ah_stat_crypto_sync; kstat_named_t ah_stat_crypto_async; kstat_named_t ah_stat_crypto_failures; @@ -116,8 +111,6 @@ struct ipsecah_stack { */ queue_t *ah_pfkey_q; timeout_id_t ah_event; - - mblk_t *ah_ip_unbind; }; typedef struct ipsecah_stack ipsecah_stack_t; diff --git a/usr/src/uts/common/inet/ipsecesp.h b/usr/src/uts/common/inet/ipsecesp.h index 2dfb73c667..7be35276aa 100644 --- a/usr/src/uts/common/inet/ipsecesp.h +++ b/usr/src/uts/common/inet/ipsecesp.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPSECESP_H #define _INET_IPSECESP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <inet/ip.h> #include <inet/ipdrop.h> @@ -70,10 +68,7 @@ struct ipsecesp_stack { queue_t *esp_pfkey_q; timeout_id_t esp_event; - mblk_t *esp_ip_unbind; - sadbp_t esp_sadb; - }; typedef struct ipsecesp_stack ipsecesp_stack_t; diff --git a/usr/src/uts/common/inet/iptun/iptun.c b/usr/src/uts/common/inet/iptun/iptun.c index bc2f1d64d5..505aaccb31 100644 --- a/usr/src/uts/common/inet/iptun/iptun.c +++ b/usr/src/uts/common/inet/iptun/iptun.c @@ -76,6 +76,8 @@ #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ipsec_impl.h> +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> #include <inet/iptun.h> #include "iptun_impl.h" @@ -87,8 +89,6 @@ #define IPTUN_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key)) -#define IPTUNQ_DEV "/dev/iptunq" - #define IPTUN_MIN_IPV4_MTU 576 /* ip.h still uses 68 (!) */ #define IPTUN_MIN_IPV6_MTU IPV6_MIN_MTU #define IPTUN_MAX_IPV4_MTU (IP_MAXPACKET - sizeof (ipha_t)) @@ -113,15 +113,18 @@ static iptun_encaplim_t iptun_encaplim_init = { 0 }; -/* Table containing per-iptun-type information. */ +/* + * Table containing per-iptun-type information. + * Since IPv6 can run over all of these we have the IPv6 min as the min MTU. 
+ */ static iptun_typeinfo_t iptun_type_table[] = { - { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, ip_output, - IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE }, - { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, ip_output_v6, + { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, + IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE }, + { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV6_MTU, B_TRUE }, - { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, ip_output, - IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE }, - { IPTUN_TYPE_UNKNOWN, NULL, 0, NULL, 0, 0, B_FALSE } + { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, + IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE }, + { IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE } }; /* @@ -140,7 +143,6 @@ kmem_cache_t *iptun_cache; ddi_taskq_t *iptun_taskq; typedef enum { - IPTUN_TASK_PMTU_UPDATE, /* obtain new destination path-MTU */ IPTUN_TASK_MTU_UPDATE, /* tell mac about new tunnel link MTU */ IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */ IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */ @@ -158,13 +160,23 @@ static int iptun_enter(iptun_t *); static void iptun_exit(iptun_t *); static void iptun_headergen(iptun_t *, boolean_t); static void iptun_drop_pkt(mblk_t *, uint64_t *); -static void iptun_input(void *, mblk_t *, void *); +static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *); static void iptun_output(iptun_t *, mblk_t *); -static uint32_t iptun_get_maxmtu(iptun_t *, uint32_t); -static uint32_t iptun_update_mtu(iptun_t *, uint32_t); -static uint32_t iptun_get_dst_pmtu(iptun_t *); +static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t); +static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t); +static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *); +static void 
iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *); static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *); +static void iptun_output_6to4(iptun_t *, mblk_t *); +static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *); +static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); + +static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); + static mac_callbacks_t iptun_m_callbacks; static int @@ -295,13 +307,6 @@ iptun_m_tx(void *arg, mblk_t *mpchain) return (NULL); } - /* - * Request the destination's path MTU information regularly in case - * path MTU has increased. - */ - if (IPTUN_PMTU_TOO_OLD(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE); - for (mp = mpchain; mp != NULL; mp = nmp) { nmp = mp->b_next; mp->b_next = NULL; @@ -350,7 +355,7 @@ iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, } break; case MAC_PROP_MTU: { - uint32_t maxmtu = iptun_get_maxmtu(iptun, 0); + uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0); if (value < iptun->iptun_typeinfo->iti_minmtu || value > maxmtu) { @@ -434,7 +439,7 @@ iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, } break; case MAC_PROP_MTU: { - uint32_t maxmtu = iptun_get_maxmtu(iptun, 0); + uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0); if (is_possible) { range.range_uint32[0].mpur_min = @@ -516,20 +521,11 @@ iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun) } /* - * Handle tasks that were deferred through the iptun_taskq. These fall into - * two categories: - * - * 1. Tasks that were defered because we didn't want to spend time doing them - * while in the data path. Only IPTUN_TASK_PMTU_UPDATE falls into this - * category. - * - * 2. Tasks that were defered because they require calling up to the mac - * module, and we can't call up to the mac module while holding locks. 
+ * Handle tasks that were deferred through the iptun_taskq because they require + * calling up to the mac module, and we can't call up to the mac module while + * holding locks. * - * Handling 1 is easy; we just lookup the iptun_t, perform the task, exit the - * tunnel, and we're done. - * - * Handling 2 is tricky to get right without introducing race conditions and + * This is tricky to get right without introducing race conditions and * deadlocks with the mac module, as we cannot issue an upcall while in the * iptun_t. The reason is that upcalls may try and enter the mac perimeter, * while iptun callbacks (such as iptun_m_setprop()) called from the mac @@ -573,12 +569,6 @@ iptun_task_cb(void *arg) if (iptun_enter_by_linkid(linkid, &iptun) != 0) return; - if (task == IPTUN_TASK_PMTU_UPDATE) { - (void) iptun_update_mtu(iptun, 0); - iptun_exit(iptun); - return; - } - iptun->iptun_flags |= IPTUN_UPCALL_PENDING; switch (task) { @@ -742,53 +732,143 @@ iptun_canbind(iptun_t *iptun) !(iptun->iptun_typeinfo->iti_hasraddr))); } +/* + * Verify that the local address is valid, and insert in the fanout + */ static int iptun_bind(iptun_t *iptun) { - conn_t *connp = iptun->iptun_connp; - int err; + conn_t *connp = iptun->iptun_connp; + int error = 0; + ip_xmit_attr_t *ixa; + iulp_t uinfo; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + /* Get an exclusive ixa for this thread, and replace conn_ixa */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); + + /* We create PMTU state including for 6to4 */ + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; ASSERT(iptun_canbind(iptun)); + mutex_enter(&connp->conn_lock); + /* + * Note that conn_proto can't be set since the upper protocol + * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. + * ipcl_iptun_classify doesn't use conn_proto. 
+ */ + connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers; + switch (iptun->iptun_typeinfo->iti_type) { case IPTUN_TYPE_IPV4: - /* - * When we set a tunnel's destination address, we do not care - * if the destination is reachable. Transient routing issues - * should not inhibit the creation of a tunnel interface, for - * example. For that reason, we pass in B_FALSE for the - * verify_dst argument of ip_proto_bind_connected_v4() (and - * similarly for IPv6 tunnels below). - */ - err = ip_proto_bind_connected_v4(connp, NULL, IPPROTO_ENCAP, - &iptun->iptun_laddr4, 0, iptun->iptun_raddr4, 0, B_TRUE, - B_FALSE, iptun->iptun_cred); + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4, + &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4, + &connp->conn_faddr_v6); + ixa->ixa_flags |= IXAF_IS_IPV4; + if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp), + ipst, B_FALSE) != IPVL_UNICAST_UP) { + mutex_exit(&connp->conn_lock); + error = EADDRNOTAVAIL; + goto done; + } break; case IPTUN_TYPE_IPV6: - err = ip_proto_bind_connected_v6(connp, NULL, IPPROTO_IPV6, - &iptun->iptun_laddr6, 0, &iptun->iptun_raddr6, NULL, 0, - B_TRUE, B_FALSE, iptun->iptun_cred); + connp->conn_laddr_v6 = iptun->iptun_laddr6; + connp->conn_faddr_v6 = iptun->iptun_raddr6; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + /* We use a zero scopeid for now */ + if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp), + ipst, B_FALSE, 0) != IPVL_UNICAST_UP) { + mutex_exit(&connp->conn_lock); + error = EADDRNOTAVAIL; + goto done; + } break; case IPTUN_TYPE_6TO4: - err = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_IPV6, - iptun->iptun_laddr4, 0, B_TRUE); - break; + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4, + &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6); + ixa->ixa_flags |= IXAF_IS_IPV4; + mutex_exit(&connp->conn_lock); + + switch (ip_laddr_verify_v4(iptun->iptun_laddr4, + IPCL_ZONEID(connp), ipst, B_FALSE)) { + case IPVL_UNICAST_UP: + case 
IPVL_UNICAST_DOWN: + break; + default: + error = EADDRNOTAVAIL; + goto done; + } + goto insert; } - if (err == 0) { - iptun->iptun_flags |= IPTUN_BOUND; + /* In case previous destination was multirt */ + ip_attr_newdst(ixa); - /* - * Now that we're bound with ip below us, this is a good time - * to initialize the destination path MTU and to re-calculate - * the tunnel's link MTU. - */ - (void) iptun_update_mtu(iptun, 0); + /* + * When we set a tunnel's destination address, we do not + * care if the destination is reachable. Transient routing + * issues should not inhibit the creation of a tunnel + * interface, for example. Thus we pass B_FALSE here. + */ + connp->conn_saddr_v6 = connp->conn_laddr_v6; + mutex_exit(&connp->conn_lock); - if (IS_IPTUN_RUNNING(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE); - } - return (err); + /* As long as the MTU is large we avoid fragmentation */ + ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF; + + /* We handle IPsec in iptun_output_common */ + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0, + &connp->conn_saddr_v6, &uinfo, 0); + + if (error != 0) + goto done; + + /* saddr shouldn't change since it was already set */ + ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &connp->conn_saddr_v6)); + + /* We set IXAF_VERIFY_PMTU to catch PMTU increases */ + ixa->ixa_flags |= IXAF_VERIFY_PMTU; + ASSERT(uinfo.iulp_mtu != 0); + + /* + * Allow setting new policies. + * The addresses/ports are already set, thus the IPsec policy calls + * can handle their passed-in conn's. 
+ */ + connp->conn_policy_cached = B_FALSE; + +insert: + error = ipcl_conn_insert(connp); + if (error != 0) + goto done; + + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + + iptun->iptun_flags |= IPTUN_BOUND; + /* + * Now that we're bound with ip below us, this is a good + * time to initialize the destination path MTU and to + * re-calculate the tunnel's link MTU. + */ + (void) iptun_update_mtu(iptun, ixa, 0); + + if (IS_IPTUN_RUNNING(iptun)) + iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE); + +done: + ixa_refrele(ixa); + return (error); } static void @@ -986,7 +1066,7 @@ iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr) * Adjust MTU and make sure the DL side knows what's up. */ itp->itp_flags = ITPF_P_ACTIVE; - (void) iptun_update_mtu(iptun, 0); + (void) iptun_update_mtu(iptun, NULL, 0); old_policy = B_FALSE; /* Blank out inactive - we succeeded */ } else { rw_exit(&itp->itp_policy->iph_lock); @@ -1170,8 +1250,16 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp) connp->conn_flags |= IPCL_IPTUN; connp->conn_iptun = iptun; connp->conn_recv = iptun_input; - connp->conn_rq = ns->netstack_iptun->iptuns_g_q; - connp->conn_wq = WR(connp->conn_rq); + connp->conn_recvicmp = iptun_input_icmp; + connp->conn_verifyicmp = iptun_verifyicmp; + + /* + * Register iptun_notify to listen to capability changes detected by IP. + * This upcall is made in the context of the call to conn_ip_output. + */ + connp->conn_ixa->ixa_notify = iptun_notify; + connp->conn_ixa->ixa_notify_cookie = iptun; + /* * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done * for all other conn_t's. 
@@ -1187,11 +1275,32 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp) connp->conn_cred = credp; /* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */ crhold(connp->conn_cred); + connp->conn_cpid = NOPID; - connp->conn_send = iptun->iptun_typeinfo->iti_txfunc; - connp->conn_af_isv6 = iptun->iptun_typeinfo->iti_ipvers == IPV6_VERSION; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = connp->conn_zoneid; ASSERT(connp->conn_ref == 1); + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); + + /* + * Have conn_ip_output drop packets should our outer source + * go invalid + */ + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + + switch (iptun->iptun_typeinfo->iti_ipvers) { + case IPV4_VERSION: + connp->conn_family = AF_INET6; + break; + case IPV6_VERSION: + connp->conn_family = AF_INET; + break; + } mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); @@ -1207,26 +1316,6 @@ iptun_conn_destroy(conn_t *connp) CONN_DEC_REF(connp); } -static int -iptun_create_g_q(iptun_stack_t *iptuns, cred_t *credp) -{ - int err; - conn_t *connp; - - ASSERT(iptuns->iptuns_g_q == NULL); - /* - * The global queue for this stack is set when iptunq_open() calls - * iptun_set_g_q(). 
- */ - err = ldi_open_by_name(IPTUNQ_DEV, FWRITE|FREAD, credp, - &iptuns->iptuns_g_q_lh, iptun_ldi_ident); - if (err == 0) { - connp = iptuns->iptuns_g_q->q_ptr; - connp->conn_recv = iptun_input; - } - return (err); -} - static iptun_t * iptun_alloc(void) { @@ -1289,11 +1378,6 @@ iptun_free(iptun_t *iptun) iptun->iptun_connp = NULL; } - netstack_rele(iptun->iptun_ns); - iptun->iptun_ns = NULL; - crfree(iptun->iptun_cred); - iptun->iptun_cred = NULL; - kmem_cache_free(iptun_cache, iptun); atomic_dec_32(&iptun_tunnelcount); } @@ -1340,19 +1424,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp) ns = netstack_find_by_cred(credp); iptuns = ns->netstack_iptun; - /* - * Before we create any tunnel, we need to ensure that the default - * STREAMS queue (used to satisfy the ip module's requirement for one) - * is created. We only do this once per stack. The stream is closed - * when the stack is destroyed in iptun_stack_fni(). - */ - mutex_enter(&iptuns->iptuns_lock); - if (iptuns->iptuns_g_q == NULL) - err = iptun_create_g_q(iptuns, zone_kcred()); - mutex_exit(&iptuns->iptuns_lock); - if (err != 0) - goto done; - if ((iptun = iptun_alloc()) == NULL) { err = ENOMEM; goto done; @@ -1360,8 +1431,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp) iptun->iptun_linkid = ik->iptun_kparam_linkid; iptun->iptun_zoneid = zoneid; - crhold(credp); - iptun->iptun_cred = credp; iptun->iptun_ns = ns; iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type); @@ -1668,49 +1737,142 @@ iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp) ITP_REFHOLD(itp); iptun->iptun_itp = itp; /* IPsec policy means IPsec overhead, which means lower MTU. */ - (void) iptun_update_mtu(iptun, 0); + (void) iptun_update_mtu(iptun, NULL, 0); } iptun_exit(iptun); } /* * Obtain the path MTU to the tunnel destination. + * Can return zero in some cases. 
*/ static uint32_t -iptun_get_dst_pmtu(iptun_t *iptun) +iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa) { - ire_t *ire = NULL; - ip_stack_t *ipst = iptun->iptun_ns->netstack_ip; uint32_t pmtu = 0; + conn_t *connp = iptun->iptun_connp; + boolean_t need_rele = B_FALSE; /* - * We only obtain the destination IRE for tunnels that have a remote - * tunnel address. + * We only obtain the pmtu for tunnels that have a remote tunnel + * address. */ if (!(iptun->iptun_flags & IPTUN_RADDR)) return (0); - switch (iptun->iptun_typeinfo->iti_ipvers) { - case IPV4_VERSION: - ire = ire_route_lookup(iptun->iptun_raddr4, INADDR_ANY, - INADDR_ANY, 0, NULL, NULL, iptun->iptun_connp->conn_zoneid, - NULL, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - break; - case IPV6_VERSION: - ire = ire_route_lookup_v6(&iptun->iptun_raddr6, NULL, NULL, 0, - NULL, NULL, iptun->iptun_connp->conn_zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - break; + if (ixa == NULL) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) + return (0); + need_rele = B_TRUE; } + /* + * Guard against ICMP errors before we have sent, as well as against + * and a thread which held conn_ixa. + */ + if (ixa->ixa_ire != NULL) { + pmtu = ip_get_pmtu(ixa); - if (ire != NULL) { - pmtu = ire->ire_max_frag; - ire_refrele(ire); + /* + * For both IPv4 and IPv6 we can have indication that the outer + * header needs fragmentation. + */ + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) { + /* Must allow fragmentation in ip_output */ + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } else { + /* ip_get_pmtu might have set this - we don't want it */ + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } } + + if (need_rele) + ixa_refrele(ixa); return (pmtu); } /* + * Update the ip_xmit_attr_t to capture the current lower path mtu as known + * by ip. 
+ */ +static void +iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa) +{ + uint32_t pmtu; + conn_t *connp = iptun->iptun_connp; + boolean_t need_rele = B_FALSE; + + /* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */ + if (!(iptun->iptun_flags & IPTUN_RADDR)) + return; + + if (ixa == NULL) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) + return; + need_rele = B_TRUE; + } + /* + * Guard against ICMP errors before we have sent, as well as against + * and a thread which held conn_ixa. + */ + if (ixa->ixa_ire != NULL) { + pmtu = ip_get_pmtu(ixa); + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * For both IPv4 and IPv6 we can have indication that the outer + * header needs fragmentation. + */ + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) { + /* Must allow fragmentation in ip_output */ + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } else { + /* ip_get_pmtu might have set this - we don't want it */ + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } + } + + if (need_rele) + ixa_refrele(ixa); +} + +/* + * There is nothing that iptun can verify in addition to IP having + * verified the IP addresses in the fanout. + */ +/* ARGSUSED */ +static boolean_t +iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + return (B_TRUE); +} + +/* + * Notify function registered with ip_xmit_attr_t. + */ +static void +iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + iptun_t *iptun = (iptun_t *)arg; + + switch (ntype) { + case IXAN_PMTU: + (void) iptun_update_mtu(iptun, ixa, narg); + break; + } +} + +/* * Returns the max of old_ovhd and the overhead associated with pol. */ static uint32_t @@ -1765,18 +1927,18 @@ iptun_get_ipsec_overhead(iptun_t *iptun) /* Check for both IPv4 and IPv6. 
*/ sel.ips_protocol = IPPROTO_ENCAP; pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND, - &sel, ns); + &sel); if (pol != NULL) { ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act); - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); } sel.ips_protocol = IPPROTO_IPV6; pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND, - &sel, ns); + &sel); if (pol != NULL) { ipsec_ovhd = max(ipsec_ovhd, ipsec_act_ovhd(&pol->ipsp_act->ipa_act)); - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); } IPPH_REFRELE(iph, ns); } else { @@ -1802,10 +1964,14 @@ iptun_get_ipsec_overhead(iptun_t *iptun) } /* - * Calculate and return the maximum possible MTU for the given tunnel. + * Calculate and return the maximum possible upper MTU for the given tunnel. + * + * If new_pmtu is set then we also need to update the lower path MTU information + * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that + * we are notified by conn_ip_output() when the path MTU increases. */ static uint32_t -iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) +iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu) { size_t header_size, ipsec_overhead; uint32_t maxmtu, pmtu; @@ -1816,13 +1982,11 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) * iptun_get_dst_pmtu(). */ if (new_pmtu != 0) { - if (iptun->iptun_flags & IPTUN_RADDR) { + if (iptun->iptun_flags & IPTUN_RADDR) iptun->iptun_dpmtu = new_pmtu; - iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt(); - } pmtu = new_pmtu; } else if (iptun->iptun_flags & IPTUN_RADDR) { - if ((pmtu = iptun_get_dst_pmtu(iptun)) == 0) { + if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) { /* * We weren't able to obtain the path-MTU of the * destination. Use the previous value. 
@@ -1830,7 +1994,6 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) pmtu = iptun->iptun_dpmtu; } else { iptun->iptun_dpmtu = pmtu; - iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt(); } } else { /* @@ -1866,19 +2029,23 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) } /* - * Re-calculate the tunnel's MTU and notify the MAC layer of any change in - * MTU. The new_pmtu argument is the new path MTU to the tunnel destination - * to be used in the tunnel MTU calculation. Passing in 0 for new_pmtu causes - * the path MTU to be dynamically updated using iptun_update_pmtu(). + * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer + * of any change in MTU. The new_pmtu argument is the new lower path MTU to + * the tunnel destination to be used in the tunnel MTU calculation. Passing + * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using + * ip_get_pmtu(). * * If the calculated tunnel MTU is different than its previous value, then we * notify the MAC layer above us of this change using mac_maxsdu_update(). */ static uint32_t -iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu) +iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu) { uint32_t newmtu; + /* We always update the ixa since we might have set IXAF_VERIFY_PMTU */ + iptun_update_dst_pmtu(iptun, ixa); + /* * We return the current MTU without updating it if it was pegged to a * static value using the MAC_PROP_MTU link property. @@ -1887,8 +2054,7 @@ iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu) return (iptun->iptun_mtu); /* If the MTU isn't fixed, then use the maximum possible value. 
*/ - newmtu = iptun_get_maxmtu(iptun, new_pmtu); - + newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu); /* * We only dynamically adjust the tunnel MTU for tunnels with * destinations because dynamic MTU calculations are based on the @@ -1929,7 +2095,7 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt) { mblk_t *icmperr_mp; - if ((icmperr_mp = allocb_tmpl(hdrs_size, orig_pkt)) != NULL) { + if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) { icmperr_mp->b_wptr += hdrs_size; /* tack on the offending packet */ icmperr_mp->b_cont = orig_pkt; @@ -1942,12 +2108,15 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt) * the ICMP error. */ static void -iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp) +iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp, + ts_label_t *tsl) { size_t orig_pktsize, hdrs_size; mblk_t *icmperr_mp; ipha_t *new_ipha; icmph_t *new_icmp; + ip_xmit_attr_t ixas; + conn_t *connp = iptun->iptun_connp; orig_pktsize = msgdsize(mp); hdrs_size = sizeof (ipha_t) + sizeof (icmph_t); @@ -1974,17 +2143,35 @@ iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp) new_icmp->icmph_checksum = 0; new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0); - ip_output(iptun->iptun_connp, icmperr_mp, iptun->iptun_connp->conn_wq, - IP_WPUT); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + if (new_ipha->ipha_src == INADDR_ANY) + ixas.ixa_flags |= IXAF_SET_SOURCE; + + ixas.ixa_zoneid = IPCL_ZONEID(connp); + ixas.ixa_ipst = connp->conn_netstack->netstack_ip; + ixas.ixa_cred = connp->conn_cred; + ixas.ixa_cpid = NOPID; + if (is_system_labeled()) + ixas.ixa_tsl = tsl; + + ixas.ixa_ifindex = 0; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(icmperr_mp, &ixas); + ixa_cleanup(&ixas); } static void -iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp) +iptun_sendicmp_v6(iptun_t *iptun, 
icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp, + ts_label_t *tsl) { size_t orig_pktsize, hdrs_size; mblk_t *icmp6err_mp; ip6_t *new_ip6h; icmp6_t *new_icmp6; + ip_xmit_attr_t ixas; + conn_t *connp = iptun->iptun_connp; orig_pktsize = msgdsize(mp); hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t); @@ -2004,16 +2191,31 @@ iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp) new_ip6h->ip6_dst = orig_ip6h->ip6_src; *new_icmp6 = *icmp6; - /* The checksum is calculated in ip_wput_ire_v6(). */ + /* The checksum is calculated in ip_output_simple and friends. */ new_icmp6->icmp6_cksum = new_ip6h->ip6_plen; - ip_output_v6(iptun->iptun_connp, icmp6err_mp, - iptun->iptun_connp->conn_wq, IP_WPUT); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src)) + ixas.ixa_flags |= IXAF_SET_SOURCE; + + ixas.ixa_zoneid = IPCL_ZONEID(connp); + ixas.ixa_ipst = connp->conn_netstack->netstack_ip; + ixas.ixa_cred = connp->conn_cred; + ixas.ixa_cpid = NOPID; + if (is_system_labeled()) + ixas.ixa_tsl = tsl; + + ixas.ixa_ifindex = 0; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(icmp6err_mp, &ixas); + ixa_cleanup(&ixas); } static void iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp, - uint8_t type, uint8_t code) + uint8_t type, uint8_t code, ts_label_t *tsl) { icmph_t icmp; @@ -2021,12 +2223,12 @@ iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp, icmp.icmph_type = type; icmp.icmph_code = code; - iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp); + iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl); } static void iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha, - mblk_t *mp) + mblk_t *mp, ts_label_t *tsl) { icmph_t icmp; @@ -2035,12 +2237,12 @@ iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha, icmp.icmph_du_zero = 0; icmp.icmph_du_mtu = htons(newmtu); - iptun_sendicmp_v4(iptun, &icmp, 
orig_ipha, mp); + iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl); } static void iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp, - uint8_t type, uint8_t code, uint32_t offset) + uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl) { icmp6_t icmp6; @@ -2050,12 +2252,12 @@ iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp, if (type == ICMP6_PARAM_PROB) icmp6.icmp6_pptr = htonl(offset); - iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp); + iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl); } static void iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h, - mblk_t *mp) + mblk_t *mp, ts_label_t *tsl) { icmp6_t icmp6; @@ -2063,7 +2265,7 @@ iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h, icmp6.icmp6_code = 0; icmp6.icmp6_mtu = htonl(newmtu); - iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp); + iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl); } /* @@ -2105,13 +2307,15 @@ is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h) /* * Find inner and outer IP headers from a tunneled packet as setup for calls * into ipsec_tun_{in,out}bound(). + * Note that we need to allow the outer header to be in a separate mblk from + * the inner header. + * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero. 
*/ static size_t -iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, - ip6_t **inner6) +iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4, + ipha_t **inner4, ip6_t **outer6, ip6_t **inner6) { ipha_t *ipha; - size_t outer_hlen; size_t first_mblkl = MBLKL(mp); mblk_t *inner_mp; @@ -2128,12 +2332,14 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, case IPV4_VERSION: *outer4 = ipha; *outer6 = NULL; - outer_hlen = IPH_HDR_LENGTH(ipha); + if (outer_hlen == 0) + outer_hlen = IPH_HDR_LENGTH(ipha); break; case IPV6_VERSION: *outer4 = NULL; *outer6 = (ip6_t *)ipha; - outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha); + if (outer_hlen == 0) + outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha); break; default: return (0); @@ -2192,8 +2398,8 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, * whatever the very-inner packet is (IPv4(2) or IPv6). */ static void -iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, - icmph_t *icmph) +iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph, + ip_recv_attr_t *ira) { uint8_t *orig; ipha_t *outer4, *inner4; @@ -2201,12 +2407,6 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, int outer_hlen; uint8_t type, code; - /* - * Change the db_type to M_DATA because subsequent operations assume - * the ICMP packet is M_DATA again (i.e. calls to msgdsize()). - */ - data_mp->b_datap->db_type = M_DATA; - ASSERT(data_mp->b_cont == NULL); /* * Temporarily move b_rptr forward so that iptun_find_headers() can @@ -2220,13 +2420,12 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * here). 
*/ ASSERT(MBLKL(data_mp) >= 0); - outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, + outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6, &inner6); ASSERT(outer6 == NULL); data_mp->b_rptr = orig; if (outer_hlen == 0) { - iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp), - &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); return; } @@ -2234,10 +2433,9 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP || outer4->ipha_protocol == IPPROTO_IPV6); - /* ipsec_tun_inbound() always frees ipsec_mp. */ - if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp, - inner4, inner6, outer4, outer6, -outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ atomic_inc_64(&iptun->iptun_ierrors); return; @@ -2269,15 +2467,15 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * also have IPsec policy by letting iptun_update_mtu * take care of it. 
*/ - newmtu = - iptun_update_mtu(iptun, ntohs(icmph->icmph_du_mtu)); + newmtu = iptun_update_mtu(iptun, NULL, + ntohs(icmph->icmph_du_mtu)); if (inner4 != NULL) { iptun_icmp_fragneeded_v4(iptun, newmtu, inner4, - data_mp); + data_mp, ira->ira_tsl); } else { iptun_icmp_toobig_v6(iptun, newmtu, inner6, - data_mp); + data_mp, ira->ira_tsl); } return; } @@ -2310,10 +2508,13 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, return; } - if (inner4 != NULL) - iptun_icmp_error_v4(iptun, inner4, data_mp, type, code); - else - iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0); + if (inner4 != NULL) { + iptun_icmp_error_v4(iptun, inner4, data_mp, type, code, + ira->ira_tsl); + } else { + iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0, + ira->ira_tsl); + } } /* @@ -2324,17 +2525,17 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, static boolean_t iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr) { - ip6_pkt_t pkt; + ip_pkt_t pkt; uint8_t *endptr; ip6_dest_t *destp; struct ip6_opt *optp; pkt.ipp_fields = 0; /* must be initialized */ - (void) ip_find_hdr_v6(mp, ip6h, &pkt, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL); if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) { destp = pkt.ipp_dstopts; - } else if ((pkt.ipp_fields & IPPF_RTDSTOPTS) != 0) { - destp = pkt.ipp_rtdstopts; + } else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) { + destp = pkt.ipp_rthdrdstopts; } else { return (B_FALSE); } @@ -2370,8 +2571,8 @@ iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr) * whatever the very-inner packet is (IPv4 or IPv6(2)). 
*/ static void -iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, - icmp6_t *icmp6h) +iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h, + ip_recv_attr_t *ira) { uint8_t *orig; ipha_t *outer4, *inner4; @@ -2379,12 +2580,6 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, int outer_hlen; uint8_t type, code; - /* - * Change the db_type to M_DATA because subsequent operations assume - * the ICMP packet is M_DATA again (i.e. calls to msgdsize().) - */ - data_mp->b_datap->db_type = M_DATA; - ASSERT(data_mp->b_cont == NULL); /* @@ -2399,19 +2594,18 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * here). */ ASSERT(MBLKL(data_mp) >= 0); - outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, + outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6, &inner6); ASSERT(outer4 == NULL); data_mp->b_rptr = orig; /* Restore r_ptr */ if (outer_hlen == 0) { - iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp), - &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); return; } - if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp, - inner4, inner6, outer4, outer6, -outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ atomic_inc_64(&iptun->iptun_ierrors); return; @@ -2466,13 +2660,15 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * have IPsec policy by letting iptun_update_mtu take care of * it. 
*/ - newmtu = iptun_update_mtu(iptun, ntohl(icmp6h->icmp6_mtu)); + newmtu = iptun_update_mtu(iptun, NULL, + ntohl(icmp6h->icmp6_mtu)); if (inner4 != NULL) { iptun_icmp_fragneeded_v4(iptun, newmtu, inner4, - data_mp); + data_mp, ira->ira_tsl); } else { - iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp); + iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp, + ira->ira_tsl); } return; } @@ -2481,51 +2677,57 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, return; } - if (inner4 != NULL) - iptun_icmp_error_v4(iptun, inner4, data_mp, type, code); - else - iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0); + if (inner4 != NULL) { + iptun_icmp_error_v4(iptun, inner4, data_mp, type, code, + ira->ira_tsl); + } else { + iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0, + ira->ira_tsl); + } } +/* + * Called as conn_recvicmp from IP for ICMP errors. + */ +/* ARGSUSED2 */ static void -iptun_input_icmp(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp) +iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - mblk_t *tmpmp; - size_t hlen; + conn_t *connp = arg; + iptun_t *iptun = connp->conn_iptun; + mblk_t *tmpmp; + size_t hlen; - if (data_mp->b_cont != NULL) { + ASSERT(IPCL_IS_IPTUN(connp)); + + if (mp->b_cont != NULL) { /* * Since ICMP error processing necessitates access to bits * that are within the ICMP error payload (the original packet * that caused the error), pull everything up into a single * block for convenience. */ - data_mp->b_datap->db_type = M_DATA; - if ((tmpmp = msgpullup(data_mp, -1)) == NULL) { - iptun_drop_pkt((ipsec_mp != NULL ? 
ipsec_mp : data_mp), - &iptun->iptun_norcvbuf); + if ((tmpmp = msgpullup(mp, -1)) == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_norcvbuf); return; } - freemsg(data_mp); - data_mp = tmpmp; - if (ipsec_mp != NULL) - ipsec_mp->b_cont = data_mp; + freemsg(mp); + mp = tmpmp; } + hlen = ira->ira_ip_hdr_length; switch (iptun->iptun_typeinfo->iti_ipvers) { case IPV4_VERSION: /* * The outer IP header coming up from IP is always ipha_t * alligned (otherwise, we would have crashed in ip). */ - hlen = IPH_HDR_LENGTH((ipha_t *)data_mp->b_rptr); - iptun_input_icmp_v4(iptun, ipsec_mp, data_mp, - (icmph_t *)(data_mp->b_rptr + hlen)); + iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen), + ira); break; case IPV6_VERSION: - hlen = ip_hdr_length_v6(data_mp, (ip6_t *)data_mp->b_rptr); - iptun_input_icmp_v6(iptun, ipsec_mp, data_mp, - (icmp6_t *)(data_mp->b_rptr + hlen)); + iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen), + ira); break; } } @@ -2578,63 +2780,24 @@ iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) * Input function for everything that comes up from the ip module below us. * This is called directly from the ip module via connp->conn_recv(). * - * There are two kinds of packets that can arrive here: (1) IP-in-IP tunneled - * packets and (2) ICMP errors containing IP-in-IP packets transmitted by us. - * They have the following structure: - * - * 1) M_DATA - * 2) M_CTL[->M_DATA] - * - * (2) Is an M_CTL optionally followed by M_DATA, where the M_CTL block is the - * start of the actual ICMP packet (it doesn't contain any special control - * information). - * - * Either (1) or (2) can be IPsec-protected, in which case an M_CTL block - * containing an ipsec_in_t will have been prepended to either (1) or (2), - * making a total of four combinations of possible mblk chains: - * - * A) (1) - * B) (2) - * C) M_CTL(ipsec_in_t)->(1) - * D) M_CTL(ipsec_in_t)->(2) + * We receive M_DATA messages with IP-in-IP tunneled packets. 
*/ -/* ARGSUSED */ +/* ARGSUSED2 */ static void -iptun_input(void *arg, mblk_t *mp, void *arg2) +iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = arg; iptun_t *iptun = connp->conn_iptun; int outer_hlen; ipha_t *outer4, *inner4; ip6_t *outer6, *inner6; - mblk_t *data_mp = mp; ASSERT(IPCL_IS_IPTUN(connp)); - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); - - if (DB_TYPE(mp) == M_CTL) { - if (((ipsec_in_t *)(mp->b_rptr))->ipsec_in_type != IPSEC_IN) { - iptun_input_icmp(iptun, NULL, mp); - return; - } - - data_mp = mp->b_cont; - if (DB_TYPE(data_mp) == M_CTL) { - /* Protected ICMP packet. */ - iptun_input_icmp(iptun, mp, data_mp); - return; - } - } - - /* - * Request the destination's path MTU information regularly in case - * path MTU has increased. - */ - if (IPTUN_PMTU_TOO_OLD(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE); + ASSERT(DB_TYPE(data_mp) == M_DATA); - if ((outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, - &inner6)) == 0) + outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length, + &outer4, &inner4, &outer6, &inner6); + if (outer_hlen == 0) goto drop; /* @@ -2644,25 +2807,22 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) * the more involved tsol_receive_local() since the tunnel link itself * cannot be assigned to shared-stack non-global zones. */ - if (is_system_labeled()) { - cred_t *msg_cred; - - if ((msg_cred = msg_getcred(data_mp, NULL)) == NULL) + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (ira->ira_tsl == NULL) goto drop; - if (tsol_check_dest(msg_cred, (outer4 != NULL ? + if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ? (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst), (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION), - CONN_MAC_DEFAULT, NULL) != 0) + CONN_MAC_DEFAULT, B_FALSE, NULL) != 0) goto drop; } - if (!ipsec_tun_inbound((mp == data_mp ? 
NULL : mp), &data_mp, - iptun->iptun_itp, inner4, inner6, outer4, outer6, outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ return; } - mp = data_mp; if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 && !iptun_in_6to4_ok(iptun, outer4, inner6)) @@ -2673,6 +2833,8 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) * we might as well split up any b_next chains here. */ do { + mblk_t *mp; + mp = data_mp->b_next; data_mp->b_next = NULL; @@ -2684,7 +2846,7 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) } while (data_mp != NULL); return; drop: - iptun_drop_pkt(mp, &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); } /* @@ -2744,6 +2906,10 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) /* destination is a 6to4 router */ IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&outer4->ipha_dst); + + /* Reject attempts to send to INADDR_ANY */ + if (outer4->ipha_dst == INADDR_ANY) + return (B_FALSE); } else { /* * The destination is a native IPv6 address. If output to a @@ -2770,12 +2936,11 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) */ static mblk_t * iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, - ipha_t *inner4, ip6_t *inner6) + ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa) { uint8_t *innerptr = (inner4 != NULL ? (uint8_t *)inner4 : (uint8_t *)inner6); - size_t minmtu = (inner4 != NULL ? 
- IPTUN_MIN_IPV4_MTU : IPTUN_MIN_IPV6_MTU); + size_t minmtu = iptun->iptun_typeinfo->iti_minmtu; if (inner4 != NULL) { ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP); @@ -2791,13 +2956,11 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, } else { ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 && inner6 != NULL); - - if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 && - !iptun_out_process_6to4(iptun, outer4, inner6)) { - iptun_drop_pkt(mp, &iptun->iptun_oerrors); - return (NULL); - } } + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; /* * As described in section 3.2.2 of RFC4213, if the packet payload is @@ -2807,11 +2970,19 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, * won't be allowed to drop its MTU as a result, since the packet was * already smaller than the smallest allowable MTU for that interface. */ - if (mp->b_wptr - innerptr <= minmtu) + if (mp->b_wptr - innerptr <= minmtu) { outer4->ipha_fragment_offset_and_flags = 0; + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) && + (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } - outer4->ipha_length = htons(msgdsize(mp)); + ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4); + ixa->ixa_pktlen = msgdsize(mp); + ixa->ixa_protocol = outer4->ipha_protocol; + outer4->ipha_length = htons(ixa->ixa_pktlen); return (mp); } @@ -2830,7 +3001,7 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ASSERT(mp->b_cont == NULL); mp->b_rptr += sizeof (ip6_t); - newmp = allocb_tmpl(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), mp); + newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED); if (newmp == NULL) { iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf); return (NULL); @@ -2861,8 +3032,12 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, * on error. 
*/ static mblk_t * -iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) +iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, + ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa) { + uint8_t *innerptr = (inner4 != NULL ? + (uint8_t *)inner4 : (uint8_t *)inner6); + size_t minmtu = iptun->iptun_typeinfo->iti_minmtu; uint8_t *limit, *configlimit; uint32_t offset; iptun_ipv6hdrs_t *v6hdrs; @@ -2887,7 +3062,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) mp->b_rptr = (uint8_t *)inner6; offset = limit - mp->b_rptr; iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB, - 0, offset); + 0, offset, ixa->ixa_tsl); atomic_inc_64(&iptun->iptun_noxmtbuf); return (NULL); } @@ -2900,6 +3075,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) if ((mp = iptun_insert_encaplimit(iptun, mp, outer6, (*limit - 1))) == NULL) return (NULL); + v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr; } else { /* * There is an existing encapsulation limit option in @@ -2914,9 +3090,23 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) if ((*limit - 1) < *configlimit) *configlimit = (*limit - 1); } + ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t); + ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt; + } else { + ixa->ixa_ip_hdr_length = sizeof (ip6_t); + ixa->ixa_protocol = outer6->ip6_nxt; } + /* + * See iptun_output_process_ipv4() why we allow fragmentation for + * small packets + */ + if (mp->b_wptr - innerptr <= minmtu) + ixa->ixa_flags &= ~IXAF_DONTFRAG; + else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL)) + ixa->ixa_flags |= IXAF_DONTFRAG; - outer6->ip6_plen = htons(msgdsize(mp) - sizeof (ip6_t)); + ixa->ixa_pktlen = msgdsize(mp); + outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t)); return (mp); } @@ -2929,11 +3119,9 @@ static void iptun_output(iptun_t *iptun, mblk_t *mp) { conn_t *connp = iptun->iptun_connp; - int outer_hlen; 
mblk_t *newmp; - ipha_t *outer4, *inner4; - ip6_t *outer6, *inner6; - ipsec_tun_pol_t *itp = iptun->iptun_itp; + int error; + ip_xmit_attr_t *ixa; ASSERT(mp->b_datap->db_type == M_DATA); @@ -2946,17 +3134,262 @@ iptun_output(iptun_t *iptun, mblk_t *mp) mp = newmp; } - outer_hlen = iptun_find_headers(mp, &outer4, &inner4, &outer6, &inner6); + if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) { + iptun_output_6to4(iptun, mp); + return; + } + + if (is_system_labeled()) { + /* + * Since the label can be different meaning a potentially + * different IRE,we always use a unique ip_xmit_attr_t. + */ + ixa = conn_get_ixa_exclusive(connp); + } else { + /* + * If no other thread is using conn_ixa this just gets a + * reference to conn_ixa. Otherwise we get a safe copy of + * conn_ixa. + */ + ixa = conn_get_ixa(connp, B_FALSE); + } + if (ixa == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + /* + * In case we got a safe copy of conn_ixa, then we need + * to fill in any pointers in it. + */ + if (ixa->ixa_ire == NULL) { + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0, + NULL, NULL, 0); + if (error != 0) { + if (ixa->ixa_ire != NULL && + (error == EHOSTUNREACH || error == ENETUNREACH)) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + } else { + ixa_refrele(ixa); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + } + + iptun_output_common(iptun, ixa, mp); + ixa_refrele(ixa); +} + +/* + * We use an ixa based on the last destination. 
+ */ +static void +iptun_output_6to4(iptun_t *iptun, mblk_t *mp) +{ + conn_t *connp = iptun->iptun_connp; + ipha_t *outer4, *inner4; + ip6_t *outer6, *inner6; + ip_xmit_attr_t *ixa; + ip_xmit_attr_t *oldixa; + int error; + boolean_t need_connect; + in6_addr_t v6dst; + + ASSERT(mp->b_cont == NULL); /* Verified by iptun_output */ + + /* Make sure we set ipha_dst before we look at ipha_dst */ + + (void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6); + ASSERT(outer4 != NULL); + if (!iptun_out_process_6to4(iptun, outer4, inner6)) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + if (is_system_labeled()) { + /* + * Since the label can be different meaning a potentially + * different IRE,we always use a unique ip_xmit_attr_t. + */ + ixa = conn_get_ixa_exclusive(connp); + } else { + /* + * If no other thread is using conn_ixa this just gets a + * reference to conn_ixa. Otherwise we get a safe copy of + * conn_ixa. + */ + ixa = conn_get_ixa(connp, B_FALSE); + } + if (ixa == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + mutex_enter(&connp->conn_lock); + if (connp->conn_v4lastdst == outer4->ipha_dst) { + need_connect = (ixa->ixa_ire == NULL); + } else { + /* In case previous destination was multirt */ + ip_attr_newdst(ixa); + + /* + * We later update conn_ixa when we update conn_v4lastdst + * which enables subsequent packets to avoid redoing + * ip_attr_connect + */ + need_connect = B_TRUE; + } + mutex_exit(&connp->conn_lock); + + /* + * In case we got a safe copy of conn_ixa, or otherwise we don't + * have a current ixa_ire, then we need to fill in any pointers in + * the ixa. 
+ */ + if (need_connect) { + IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst); + + /* We handle IPsec in iptun_output_common */ + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &v6dst, &v6dst, 0, NULL, NULL, 0); + if (error != 0) { + if (ixa->ixa_ire != NULL && + (error == EHOSTUNREACH || error == ENETUNREACH)) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + } else { + ixa_refrele(ixa); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + } + + iptun_output_common(iptun, ixa, mp); + + /* Atomically replace conn_ixa and conn_v4lastdst */ + mutex_enter(&connp->conn_lock); + if (connp->conn_v4lastdst != outer4->ipha_dst) { + /* Remember the dst which corresponds to conn_ixa */ + connp->conn_v6lastdst = v6dst; + oldixa = conn_replace_ixa(connp, ixa); + } else { + oldixa = NULL; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + if (oldixa != NULL) + ixa_refrele(oldixa); +} + +/* + * Check the destination/label. Modifies *mpp by adding/removing CIPSO. + * + * We get the label from the message in order to honor the + * ULPs/IPs choice of label. This will be NULL for forwarded + * packets, neighbor discovery packets and some others. + */ +static int +iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa) +{ + cred_t *cr; + int adjust; + int iplen; + int err; + ts_label_t *effective_tsl = NULL; + + + ASSERT(is_system_labeled()); + + cr = msg_getcred(*mpp, NULL); + if (cr == NULL) + return (0); + + /* + * We need to start with a label based on the IP/ULP above us + */ + ip_xmit_attr_restore_tsl(ixa, cr); + + /* + * Need to update packet with any CIPSO option since + * conn_ip_output doesn't do that. 
+ */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)(*mpp)->b_rptr; + iplen = ntohs(ipha->ipha_length); + err = tsol_check_label_v4(ixa->ixa_tsl, + ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE, + ixa->ixa_ipst, &effective_tsl); + if (err != 0) + return (err); + + ipha = (ipha_t *)(*mpp)->b_rptr; + adjust = (int)ntohs(ipha->ipha_length) - iplen; + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)(*mpp)->b_rptr; + iplen = ntohs(ip6h->ip6_plen); + + err = tsol_check_label_v6(ixa->ixa_tsl, + ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE, + ixa->ixa_ipst, &effective_tsl); + if (err != 0) + return (err); + + ip6h = (ip6_t *)(*mpp)->b_rptr; + adjust = (int)ntohs(ip6h->ip6_plen) - iplen; + } + + if (effective_tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } + ixa->ixa_pktlen += adjust; + ixa->ixa_ip_hdr_length += adjust; + return (0); +} + + +static void +iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp) +{ + ipsec_tun_pol_t *itp = iptun->iptun_itp; + int outer_hlen; + mblk_t *newmp; + ipha_t *outer4, *inner4; + ip6_t *outer6, *inner6; + int error; + boolean_t update_pktlen; + + ASSERT(ixa->ixa_ire != NULL); + + outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, + &inner6); if (outer_hlen == 0) { iptun_drop_pkt(mp, &iptun->iptun_oerrors); return; } /* Perform header processing. */ - if (outer4 != NULL) - mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6); - else - mp = iptun_out_process_ipv6(iptun, mp, outer6, inner6); + if (outer4 != NULL) { + mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6, + ixa); + } else { + mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6, + ixa); + } if (mp == NULL) return; @@ -2964,27 +3397,57 @@ iptun_output(iptun_t *iptun, mblk_t *mp) * Let's hope the compiler optimizes this with "branch taken". 
*/ if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) { - if ((mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4, - outer6, outer_hlen)) == NULL) { - /* ipsec_tun_outbound() frees mp on error. */ + /* This updates the ip_xmit_attr_t */ + mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4, + outer6, outer_hlen, ixa); + if (mp == NULL) { atomic_inc_64(&iptun->iptun_oerrors); return; } + if (is_system_labeled()) { + /* + * Might change the packet by adding/removing CIPSO. + * After this caller inner* and outer* and outer_hlen + * might be invalid. + */ + error = iptun_output_check_label(&mp, ixa); + if (error != 0) { + ip2dbg(("label check failed (%d)\n", error)); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + /* * ipsec_tun_outbound() returns a chain of tunneled IP * fragments linked with b_next (or a single message if the - * tunneled packet wasn't a fragment). Each message in the - * chain is prepended by an IPSEC_OUT M_CTL block with + * tunneled packet wasn't a fragment). + * If fragcache returned a list then we need to update + * ixa_pktlen for all packets in the list. + */ + update_pktlen = (mp->b_next != NULL); + + /* + * Otherwise, we're good to go. The ixa has been updated with * instructions for outbound IPsec processing. 
*/ for (newmp = mp; newmp != NULL; newmp = mp) { - ASSERT(newmp->b_datap->db_type == M_CTL); atomic_inc_64(&iptun->iptun_opackets); - atomic_add_64(&iptun->iptun_obytes, - msgdsize(newmp->b_cont)); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); mp = mp->b_next; newmp->b_next = NULL; - connp->conn_send(connp, newmp, connp->conn_wq, IP_WPUT); + + if (update_pktlen) + ixa->ixa_pktlen = msgdsize(mp); + + atomic_inc_64(&iptun->iptun_opackets); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); + + error = conn_ip_output(newmp, ixa); + if (error == EMSGSIZE) { + /* IPsec policy might have changed */ + (void) iptun_update_mtu(iptun, ixa, 0); + } } } else { /* @@ -2992,30 +3455,37 @@ iptun_output(iptun_t *iptun, mblk_t *mp) * packet in its output path if there's no active tunnel * policy. */ - atomic_inc_64(&iptun->iptun_opackets); - atomic_add_64(&iptun->iptun_obytes, msgdsize(mp)); - connp->conn_send(connp, mp, connp->conn_wq, IP_WPUT); - } -} + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa); + if (mp == NULL) { + atomic_inc_64(&iptun->iptun_oerrors); + return; + } + if (is_system_labeled()) { + /* + * Might change the packet by adding/removing CIPSO. + * After this caller inner* and outer* and outer_hlen + * might be invalid. + */ + error = iptun_output_check_label(&mp, ixa); + if (error != 0) { + ip2dbg(("label check failed (%d)\n", error)); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } -/* - * Note that the setting or clearing iptun_{set,get}_g_q() is serialized via - * iptuns_lock and iptunq_open(), so we must never be in a situation where - * iptun_set_g_q() is called if the queue has already been set or vice versa - * (hence the ASSERT()s.) 
- */ -void -iptun_set_g_q(netstack_t *ns, queue_t *q) -{ - ASSERT(ns->netstack_iptun->iptuns_g_q == NULL); - ns->netstack_iptun->iptuns_g_q = q; -} + atomic_inc_64(&iptun->iptun_opackets); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); -void -iptun_clear_g_q(netstack_t *ns) -{ - ASSERT(ns->netstack_iptun->iptuns_g_q != NULL); - ns->netstack_iptun->iptuns_g_q = NULL; + error = conn_ip_output(mp, ixa); + if (error == EMSGSIZE) { + /* IPsec policy might have changed */ + (void) iptun_update_mtu(iptun, ixa, 0); + } + } + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) + ipsec_out_release_refs(ixa); } static mac_callbacks_t iptun_m_callbacks = { diff --git a/usr/src/uts/common/inet/iptun/iptun_dev.c b/usr/src/uts/common/inet/iptun/iptun_dev.c index 52218bdc18..5043063690 100644 --- a/usr/src/uts/common/inet/iptun/iptun_dev.c +++ b/usr/src/uts/common/inet/iptun/iptun_dev.c @@ -91,11 +91,9 @@ iptun_stack_shutdown(netstackid_t stackid, void *arg) /* note that iptun_delete() removes iptun from the list */ while ((iptun = list_head(&iptuns->iptuns_iptunlist)) != NULL) { linkid = iptun->iptun_linkid; - (void) iptun_delete(linkid, iptun->iptun_cred); + (void) iptun_delete(linkid, iptun->iptun_connp->conn_cred); (void) dls_mgmt_destroy(linkid, B_FALSE); } - if (iptuns->iptuns_g_q != NULL) - (void) ldi_close(iptuns->iptuns_g_q_lh, FWRITE|FREAD, CRED()); } /* diff --git a/usr/src/uts/common/inet/iptun/iptun_impl.h b/usr/src/uts/common/inet/iptun/iptun_impl.h index 593adb7d9c..07e168a423 100644 --- a/usr/src/uts/common/inet/iptun/iptun_impl.h +++ b/usr/src/uts/common/inet/iptun/iptun_impl.h @@ -80,7 +80,6 @@ typedef struct iptun_typeinfo { iptun_type_t iti_type; const char *iti_ident; /* MAC-Type plugin identifier */ uint_t iti_ipvers; /* outer header IP version */ - edesc_spf iti_txfunc; /* function used to transmit to ip */ uint32_t iti_minmtu; /* minimum possible tunnel MTU */ uint32_t iti_maxmtu; /* maximum possible tunnel MTU */ boolean_t iti_hasraddr; /* has a remote adress 
*/ @@ -95,13 +94,6 @@ typedef struct iptun_typeinfo { * * The datapath reads certain fields without locks for performance reasons. * - * - IPTUN_PMTU_TOO_OLD() is used without a lock to determine if the - * destination path-MTU should be queried. This reads iptun_flags - * IPTUN_RADDR, IPTUN_FIXED_MTU, and iptun_dpmtu_lastupdate. All of these - * can change without adversely affecting the tunnel, as the worst case - * scenario is that we launch a task that will ultimately either do nothing - * or needlessly query the destination path-MTU. - * * - IPTUN_IS_RUNNING() is used (read access to iptun_flags IPTUN_BOUND and * IPTUN_MAC_STARTED) to drop packets if they're sent while the tunnel is * not running. This is harmless as the worst case scenario is that a @@ -119,12 +111,10 @@ typedef struct iptun_s { conn_t *iptun_connp; zoneid_t iptun_zoneid; netstack_t *iptun_ns; - cred_t *iptun_cred; struct ipsec_tun_pol_s *iptun_itp; iptun_typeinfo_t *iptun_typeinfo; uint32_t iptun_mtu; uint32_t iptun_dpmtu; /* destination path MTU */ - clock_t iptun_dpmtu_lastupdate; uint8_t iptun_hoplimit; uint8_t iptun_encaplimit; iptun_addr_t iptun_laddr; /* local address */ @@ -172,37 +162,12 @@ typedef struct iptun_s { (IPTUN_BOUND | IPTUN_MAC_STARTED)) /* - * We request ire information for the tunnel destination in order to obtain - * its path MTU information. We use that to calculate the initial link MTU of - * a tunnel. - * - * After that, if the path MTU of the tunnel destination becomes smaller - * than the link MTU of the tunnel, then we will receive a packet too big - * (aka fragmentation needed) ICMP error when we transmit a packet larger - * than the path MTU, and we will adjust the tunne's MTU based on the ICMP - * error's MTU information. 
- * - * In addition to that, we also need to request the ire information - * periodically to make sure the link MTU of a tunnel doesn't become stale - * if the path MTU of the tunnel destination becomes larger than the link - * MTU of the tunnel. The period for the requests is ten minutes in - * accordance with rfc1191. - */ -#define IPTUN_PMTU_AGE SEC_TO_TICK(600) -#define IPTUN_PMTU_TOO_OLD(ipt) \ - (((ipt)->iptun_flags & IPTUN_RADDR) && \ - !((ipt)->iptun_flags & IPTUN_FIXED_MTU) && \ - (ddi_get_lbolt() - (ipt)->iptun_dpmtu_lastupdate) > IPTUN_PMTU_AGE) - -/* - * iptuns_lock protects iptuns_iptunlist and iptuns_g_q. + * iptuns_lock protects iptuns_iptunlist. */ typedef struct iptun_stack { netstack_t *iptuns_netstack; /* Common netstack */ kmutex_t iptuns_lock; list_t iptuns_iptunlist; /* list of tunnels in this stack. */ - queue_t *iptuns_g_q; /* read-side IP queue */ - ldi_handle_t iptuns_g_q_lh; ipaddr_t iptuns_relay_rtr_addr; } iptun_stack_t; @@ -222,8 +187,6 @@ extern int iptun_info(iptun_kparams_t *, cred_t *); extern int iptun_set_6to4relay(netstack_t *, ipaddr_t); extern void iptun_get_6to4relay(netstack_t *, ipaddr_t *); extern void iptun_set_policy(datalink_id_t, ipsec_tun_pol_t *); -extern void iptun_set_g_q(netstack_t *, queue_t *); -extern void iptun_clear_g_q(netstack_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/keysock.h b/usr/src/uts/common/inet/keysock.h index 50189666c7..cb618cedaf 100644 --- a/usr/src/uts/common/inet/keysock.h +++ b/usr/src/uts/common/inet/keysock.h @@ -19,22 +19,20 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_KEYSOCK_H #define _INET_KEYSOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif extern int keysock_opt_get(queue_t *, int, int, uchar_t *); extern int keysock_opt_set(queue_t *, uint_t, int, int, uint_t, - uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr, mblk_t *mblk); + uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr); /* * Object to represent database of options to search passed to diff --git a/usr/src/uts/common/inet/kssl/ksslrec.c b/usr/src/uts/common/inet/kssl/ksslrec.c index 14a285b4ab..6b7ce0ad42 100644 --- a/usr/src/uts/common/inet/kssl/ksslrec.c +++ b/usr/src/uts/common/inet/kssl/ksslrec.c @@ -239,7 +239,7 @@ kssl_compute_record_mac( * context when called from strsock_kssl_input(). During the * SSL handshake, we are called for client_finished message * handling from a squeue worker thread that gets scheduled - * by an squeue_fill() call. This thread is not in interrupt + * by an SQ_FILL call. This thread is not in interrupt * context and so can block. */ rv = crypto_mac(&spec->hmac_mech, &dd, &spec->hmac_key, diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c index f88fe3709b..9fe77e88c4 100644 --- a/usr/src/uts/common/inet/mi.c +++ b/usr/src/uts/common/inet/mi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -1359,7 +1359,7 @@ mi_tpi_addr_and_opt(MBLKP mp, char *addr, t_scalar_t addr_length, * This code is used more than just for unitdata ind * (also for T_CONN_IND and T_CONN_CON) and * relies on correct functioning on the happy - * coincidence that the the address and option buffers + * coincidence that the address and option buffers * represented by length/offset in all these primitives * are isomorphic in terms of offset from start of data * structure diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 16bed4ec2c..06db81ea74 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -66,8 +66,8 @@ extern "C" { * "get all" is supported, so all modules get a copy of the request to * return everything it knows. In general, we use MIB2_IP. There is * one exception: in general, IP will not report information related to - * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table). - * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause + * ire_testhidden and IRE_IF_CLONE routes (e.g., in the MIB2_IP_ROUTE + * table). However, using the special value EXPER_IP_AND_ALL_IRES will cause * all information to be reported. This special value should only be * used by IPMP-aware low-level utilities (e.g. in.mpathd). * @@ -109,7 +109,7 @@ extern "C" { #define EXPER_IGMP (EXPER+1) #define EXPER_DVMRP (EXPER+2) #define EXPER_RAWIP (EXPER+3) -#define EXPER_IP_AND_TESTHIDDEN (EXPER+4) +#define EXPER_IP_AND_ALL_IRES (EXPER+4) /* * Define range of levels for experimental use @@ -170,6 +170,7 @@ typedef uint32_t DeviceIndex; /* Interface index */ #define EXPER_IP_GROUP_SOURCES 102 #define EXPER_IP6_GROUP_SOURCES 103 #define EXPER_IP_RTATTR 104 +#define EXPER_IP_DCE 105 /* * There can be one of each of these tables per transport (MIB2_* above). 
@@ -267,15 +268,13 @@ typedef struct mib2_ip { int ipMemberEntrySize; /* Size of ip_member_t */ int ipGroupSourceEntrySize; /* Size of ip_grpsrc_t */ - /* # of IPv6 packets received by IPv4 and dropped */ - Counter ipInIPv6; - /* # of IPv6 packets transmitted by ip_wput */ - Counter ipOutIPv6; - /* # of times ip_wput has switched to become ip_wput_v6 */ - Counter ipOutSwitchIPv6; + Counter ipInIPv6; /* # of IPv6 packets received by IPv4 and dropped */ + Counter ipOutIPv6; /* No longer used */ + Counter ipOutSwitchIPv6; /* No longer used */ int ipRouteAttributeSize; /* Size of mib2_ipAttributeEntry_t */ int transportMLPSize; /* Size of mib2_transportMLPEntry_t */ + int ipDestEntrySize; /* Size of dest_cache_entry_t */ } mib2_ip_t; /* @@ -503,14 +502,11 @@ typedef struct mib2_ipIfStatsEntry { */ Counter ipIfStatsInWrongIPVersion; /* - * Depending on the value of ipIfStatsIPVersion, this counter tracks - * v4: # of IPv6 packets transmitted by ip_wput or, - * v6: # of IPv4 packets transmitted by ip_wput_v6. + * This counter is no longer used */ Counter ipIfStatsOutWrongIPVersion; /* - * Depending on the value of ipIfStatsIPVersion, this counter tracks - * # of times ip_wput has switched to become ip_wput_v6, or vice versa. + * This counter is no longer used */ Counter ipIfStatsOutSwitchIPVersion; @@ -981,6 +977,21 @@ typedef struct ipv6_grpsrc { /* + * List of destination cache entries + */ +typedef struct dest_cache_entry { + /* IP Multicast address */ + IpAddress DestIpv4Address; + Ip6Address DestIpv6Address; + uint_t DestFlags; /* DCEF_* */ + uint32_t DestPmtu; /* Path MTU if DCEF_PMTU */ + uint32_t DestIdent; /* Per destination IP ident. 
*/ + DeviceIndex DestIfindex; /* For IPv6 link-locals */ + uint32_t DestAge; /* Age of MTU info in seconds */ +} dest_cache_entry_t; + + +/* * ICMP Group */ typedef struct mib2_icmp { diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c index e35b7f6af5..e4d1abff4c 100644 --- a/usr/src/uts/common/inet/optcom.c +++ b/usr/src/uts/common/inet/optcom.c @@ -58,21 +58,21 @@ * Function prototypes */ static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *, - boolean_t *, size_t *); + size_t *); static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp, - mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp); + t_uscalar_t *worst_statusp); static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t); static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **, t_uscalar_t *, cred_t *, optdb_obj_t *); static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **, t_uscalar_t *, cred_t *cr, optdb_obj_t *); -static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, +static void do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, - cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp); + cred_t *, optdb_obj_t *dbobjp); static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t); static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t); -static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *); +static boolean_t opt_length_ok(opdes_t *, t_uscalar_t optlen); static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t); static boolean_t opt_bloated_maxsize(opdes_t *); @@ -176,35 +176,15 @@ optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) * job requested. * XXX Code below needs some restructuring after we have some more * macros to support 'struct opthdr' in the headers. 
- * - * IP-MT notes: The option management framework functions svr4_optcom_req() and - * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual - * T_optmgmt_req mblk and pass the chain as an additional parameter to the - * protocol set functions. If a protocol set function (such as ip_opt_set) - * cannot process the option immediately it can return EINPROGRESS. ip_opt_set - * enqueues the message in the appropriate sq and returns EINPROGRESS. Later - * the sq framework arranges to restart this operation and passes control to - * the restart function ip_restart_optmgmt() which in turn calls - * svr4_optcom_req() or tpi_optcom_req() to restart the option processing. - * - * XXX Remove the asynchronous behavior of svr_optcom_req() and - * tpi_optcom_req(). */ -int -svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t pass_to_ip) +void +svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp) { pfi_t deffn = dbobjp->odb_deffn; pfi_t getfn = dbobjp->odb_getfn; opt_set_fn setfn = dbobjp->odb_setfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - opt_restart_t *or; - struct opthdr *restart_opt; - boolean_t is_restart = B_FALSE; - mblk_t *first_mp; - t_uscalar_t max_optbuf_len; int len; mblk_t *mp1 = NULL; @@ -214,33 +194,10 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, struct opthdr *opt_end; struct opthdr *opt_start; opdes_t *optd; - boolean_t pass_to_next = B_FALSE; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor; int error; - /* - * Allocate M_CTL and prepend to the packet for restarting this - * option if needed. IP may need to queue and restart the option - * if it cannot obtain exclusive conditions immediately. 
Please see - * IP-MT notes before the start of svr4_optcom_req - */ - if (mp->b_datap->db_type == M_CTL) { - is_restart = B_TRUE; - first_mp = mp; - mp = mp->b_cont; - ASSERT(mp->b_wptr - mp->b_rptr >= - sizeof (struct T_optmgmt_req)); - tor = (struct T_optmgmt_req *)mp->b_rptr; - ASSERT(tor->MGMT_flags == T_NEGOTIATE); - - or = (opt_restart_t *)first_mp->b_rptr; - opt_start = or->or_start; - opt_end = or->or_end; - restart_opt = or->or_ropt; - goto restart; - } - tor = (struct T_optmgmt_req *)mp->b_rptr; /* Verify message integrity. */ if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req)) @@ -255,7 +212,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } if (tor->MGMT_flags == T_DEFAULT) { /* Is it a request for default option settings? */ @@ -278,7 +235,6 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * ----historical comment end ------- */ /* T_DEFAULT not passed down */ - ASSERT(topmost_tpiprovider == B_TRUE); freemsg(mp); max_optbuf_len = optcom_max_optbuf_len(opt_arr, opt_arr_cnt); @@ -286,7 +242,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, if (!mp) { no_mem:; optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); + return; } /* Initialize the T_optmgmt_ack header. */ @@ -362,7 +318,7 @@ no_mem:; mp->b_datap->db_type = M_PCPROTO; /* Ship it back. 
*/ qreply(q, mp); - return (0); + return; } /* T_DEFAULT processing complete - no more T_DEFAULT */ @@ -414,15 +370,15 @@ no_mem:; goto bad_opt; error = proto_opt_check(opt->level, opt->name, opt->len, NULL, - opt_arr, opt_arr_cnt, topmost_tpiprovider, + opt_arr, opt_arr_cnt, tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK, cr); if (error < 0) { optcom_err_ack(q, mp, -error, 0); - return (0); + return; } else if (error > 0) { optcom_err_ack(q, mp, TSYSERR, error); - return (0); + return; } } /* end for loop scanning option buffer */ @@ -491,24 +447,9 @@ no_mem:; /* Ditch the input buffer. */ freemsg(mp); mp = mp1; - /* Always let the next module look at the option. */ - pass_to_next = B_TRUE; break; case T_NEGOTIATE: - first_mp = allocb(sizeof (opt_restart_t), BPRI_LO); - if (first_mp == NULL) { - optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); - } - first_mp->b_datap->db_type = M_CTL; - or = (opt_restart_t *)first_mp->b_rptr; - or->or_start = opt_start; - or->or_end = opt_end; - or->or_type = T_SVR4_OPTMGMT_REQ; - or->or_private = 0; - first_mp->b_cont = mp; -restart: /* * Here we are expecting that the response buffer is exactly * the same size as the input buffer. We pass each opthdr @@ -523,22 +464,16 @@ restart: */ toa = (struct T_optmgmt_ack *)tor; - for (opt = is_restart ? restart_opt: opt_start; opt < opt_end; - opt = next_opt) { + for (opt = opt_start; opt < opt_end; opt = next_opt) { int error; - /* - * Point to the current option in or, in case this - * option has to be restarted later on - */ - or->or_ropt = opt; next_opt = (struct opthdr *)((uchar_t *)&opt[1] + _TPI_ALIGN_OPT(opt->len)); error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE, opt->level, opt->name, opt->len, (uchar_t *)&opt[1], - &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp); + &opt->len, (uchar_t *)&opt[1], NULL, cr); /* * Treat positive "errors" as real. 
* Note: negative errors are to be treated as @@ -549,99 +484,48 @@ restart: * it is valid but was either handled upstream * or will be handled downstream. */ - if (error == EINPROGRESS) { - /* - * The message is queued and will be - * reprocessed later. Typically ip queued - * the message to get some exclusive conditions - * and later on calls this func again. - */ - return (EINPROGRESS); - } else if (error > 0) { + if (error > 0) { optcom_err_ack(q, mp, TSYSERR, error); - freeb(first_mp); - return (0); + return; } /* * error < 0 means option is not recognized. - * But with OP_PASSNEXT the next module - * might recognize it. */ } - /* Done with the restart control mp. */ - freeb(first_mp); - pass_to_next = B_TRUE; break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } - if (pass_to_next && (q->q_next != NULL || pass_to_ip)) { - /* Send it down to the next module and let it reply */ - toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */ - if (q->q_next != NULL) - putnext(q, mp); - else - ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); - } else { - /* Set common fields in the header. */ - toa->MGMT_flags = T_SUCCESS; - mp->b_datap->db_type = M_PCPROTO; - toa->PRIM_type = T_OPTMGMT_ACK; - qreply(q, mp); - } - return (0); + /* Set common fields in the header. 
*/ + toa->MGMT_flags = T_SUCCESS; + mp->b_datap->db_type = M_PCPROTO; + toa->PRIM_type = T_OPTMGMT_ACK; + qreply(q, mp); + return; bad_opt:; optcom_err_ack(q, mp, TBADOPT, 0); - return (0); } /* * New optcom_req inspired by TPI/XTI semantics */ -int -tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t pass_to_ip) +void +tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp) { t_scalar_t t_error; mblk_t *toa_mp; - boolean_t pass_to_next; size_t toa_len; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; - - opt_restart_t *or; - boolean_t is_restart = B_FALSE; - mblk_t *first_mp = NULL; t_uscalar_t worst_status; - boolean_t queued_status; - - /* - * Allocate M_CTL and prepend to the packet for restarting this - * option if needed. IP may need to queue and restart the option - * if it cannot obtain exclusive conditions immediately. Please see - * IP-MT notes before the start of svr4_optcom_req - */ - if (mp->b_datap->db_type == M_CTL) { - is_restart = B_TRUE; - first_mp = mp; - toa_mp = mp->b_cont; - mp = toa_mp->b_cont; - ASSERT(mp->b_wptr - mp->b_rptr >= - sizeof (struct T_optmgmt_req)); - tor = (struct T_optmgmt_req *)mp->b_rptr; - ASSERT(tor->MGMT_flags == T_NEGOTIATE); - - or = (opt_restart_t *)first_mp->b_rptr; - goto restart; - } /* Verify message integrity. */ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) { optcom_err_ack(q, mp, TBADOPT, 0); - return (0); + return; } /* Verify MGMT_flags legal */ @@ -654,7 +538,7 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } /* @@ -669,7 +553,6 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * T_ALLOPT mean that length can be different for output buffer). 
*/ - pass_to_next = B_FALSE; /* initial value */ toa_len = 0; /* initial value */ /* @@ -677,13 +560,11 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * - estimate cumulative length needed for results * - set "status" field based on permissions, option header check * etc. - * - determine "pass_to_next" whether we need to send request to - * downstream module/driver. */ if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp, - &pass_to_next, &toa_len)) != 0) { + &toa_len)) != 0) { optcom_err_ack(q, mp, t_error, 0); - return (0); + return; } /* @@ -697,26 +578,14 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, toa_mp = allocb_tmpl(toa_len, mp); if (!toa_mp) { optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); + return; } - first_mp = allocb(sizeof (opt_restart_t), BPRI_LO); - if (first_mp == NULL) { - freeb(toa_mp); - optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); - } - first_mp->b_datap->db_type = M_CTL; - or = (opt_restart_t *)first_mp->b_rptr; /* * Set initial values for generating output. */ - or->or_worst_status = T_SUCCESS; - or->or_type = T_OPTMGMT_REQ; - or->or_private = 0; - /* remaining fields fileed in do_options_second_pass */ + worst_status = T_SUCCESS; /* initial value */ -restart: /* * This routine makes another pass through the option buffer this * time acting on the request based on "status" result in the @@ -724,19 +593,11 @@ restart: * all options of a certain level and acts on each for this request. 
*/ if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp, - first_mp, is_restart, &queued_status)) != 0) { + &worst_status)) != 0) { freemsg(toa_mp); optcom_err_ack(q, mp, t_error, 0); - return (0); - } - if (queued_status) { - /* Option will be restarted */ - return (EINPROGRESS); + return; } - worst_status = or->or_worst_status; - /* Done with the first mp */ - freeb(first_mp); - toa_mp->b_cont = NULL; /* * Following code relies on the coincidence that T_optmgmt_req @@ -749,34 +610,12 @@ restart: toa->MGMT_flags = tor->MGMT_flags; - freemsg(mp); /* free input mblk */ - /* - * If there is atleast one option that requires a downstream - * forwarding and if it is possible, we forward the message - * downstream. Else we ack it. - */ - if (pass_to_next && (q->q_next != NULL || pass_to_ip)) { - /* - * We pass it down as T_OPTMGMT_REQ. This code relies - * on the happy coincidence that T_optmgmt_req and - * T_optmgmt_ack are identical data structures - * at the binary representation level. - */ - toa_mp->b_datap->db_type = M_PROTO; - toa->PRIM_type = T_OPTMGMT_REQ; - if (q->q_next != NULL) - putnext(q, toa_mp); - else - ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT); - } else { - toa->PRIM_type = T_OPTMGMT_ACK; - toa_mp->b_datap->db_type = M_PCPROTO; - toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */ - qreply(q, toa_mp); - } - return (0); + toa->PRIM_type = T_OPTMGMT_ACK; + toa_mp->b_datap->db_type = M_PCPROTO; + toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */ + qreply(q, toa_mp); } @@ -786,17 +625,14 @@ restart: * - estimate cumulative length needed for results * - set "status" field based on permissions, option header check * etc. - * - determine "pass_to_next" whether we need to send request to - * downstream module/driver. 
*/ static t_scalar_t process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t *pass_to_nextp, size_t *toa_lenp) + size_t *toa_lenp) { opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr; uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt; struct T_opthdr *opt; @@ -843,18 +679,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * unchanged if they do not understand an * option. */ - if (topmost_tpiprovider) { - if (!opt_level_valid(opt->level, - valid_level_arr, - valid_level_arr_cnt)) - return (TBADOPT); - /* - * level is valid - initialize - * option as not supported - */ - opt->status = T_NOTSUPPORT; - } - + if (!opt_level_valid(opt->level, + valid_level_arr, valid_level_arr_cnt)) + return (TBADOPT); + /* + * level is valid - initialize + * option as not supported + */ + opt->status = T_NOTSUPPORT; *toa_lenp += _TPI_ALIGN_TOPT(opt->len); continue; } @@ -866,7 +698,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, */ allopt_len = 0; if (tor->MGMT_flags == T_CHECK || - !topmost_tpiprovider || ((allopt_len = opt_level_allopts_lengths(opt->level, opt_arr, opt_arr_cnt)) == 0)) { /* @@ -874,11 +705,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * It is not valid to to use T_ALLOPT with * T_CHECK flag. * - * T_ALLOPT is assumed "expanded" at the - * topmost_tpiprovider level so it should not - * be there as an "option name" if this is not - * a topmost_tpiprovider call and we fail it. - * * opt_level_allopts_lengths() is used to verify * that "level" associated with the T_ALLOPT is * supported. 
@@ -892,15 +718,8 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, *toa_lenp += allopt_len; opt->status = T_SUCCESS; - /* XXX - always set T_ALLOPT 'pass_to_next' for now */ - *pass_to_nextp = B_TRUE; continue; } - /* - * Check if option wants to flow downstream - */ - if (optd->opdes_props & OP_PASSNEXT) - *pass_to_nextp = B_TRUE; /* Additional checks dependent on operation. */ switch (tor->MGMT_flags) { @@ -972,7 +791,9 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * Note: This can override anything about this * option request done at a higher level. */ - if (!opt_length_ok(optd, opt)) { + if (opt->len < sizeof (struct T_opthdr) || + !opt_length_ok(optd, + opt->len - sizeof (struct T_opthdr))) { /* bad size */ *toa_lenp += _TPI_ALIGN_TOPT(opt->len); opt->status = T_FAILURE; @@ -1034,23 +855,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, */ static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, - optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart, - boolean_t *queued_statusp) + optdb_obj_t *dbobjp, t_uscalar_t *worst_statusp) { - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; int failed_option; struct T_opthdr *opt; - struct T_opthdr *opt_start, *opt_end, *restart_opt; + struct T_opthdr *opt_start, *opt_end; uchar_t *optr; uint_t optset_context; struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr; - opt_restart_t *or; - t_uscalar_t *worst_statusp; - int err; - - *queued_statusp = B_FALSE; - or = (opt_restart_t *)first_mp->b_rptr; - worst_statusp = &or->or_worst_status; optr = (uchar_t *)ack_mp->b_rptr + sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */ @@ -1058,32 +870,16 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, /* * Set initial values for scanning input */ - if (is_restart) { - opt_start = (struct T_opthdr *)or->or_start; - opt_end = (struct 
T_opthdr *)or->or_end; - restart_opt = (struct T_opthdr *)or->or_ropt; - } else { - opt_start = (struct T_opthdr *)mi_offset_param(reqmp, - tor->OPT_offset, tor->OPT_length); - if (opt_start == NULL) - return (TBADOPT); - opt_end = (struct T_opthdr *)((uchar_t *)opt_start + - tor->OPT_length); - or->or_start = (struct opthdr *)opt_start; - or->or_end = (struct opthdr *)opt_end; - /* - * construct the mp chain, in case the setfn needs to - * queue this and restart option processing later on. - */ - first_mp->b_cont = ack_mp; - ack_mp->b_cont = reqmp; - } + opt_start = (struct T_opthdr *)mi_offset_param(reqmp, + tor->OPT_offset, tor->OPT_length); + if (opt_start == NULL) + return (TBADOPT); + opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length); ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */ - for (opt = is_restart ? restart_opt : opt_start; - opt && (opt < opt_end); + for (opt = opt_start; opt && (opt < opt_end); opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) { - or->or_ropt = (struct opthdr *)opt; + /* verified in first pass */ ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end)); @@ -1144,9 +940,7 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, */ if (do_opt_default(q, opt, &optr, worst_statusp, cr, dbobjp) < 0) { - /* fail or pass transparently */ - if (topmost_tpiprovider) - opt->status = T_FAILURE; + opt->status = T_FAILURE; bcopy(opt, optr, opt->len); optr += _TPI_ALIGN_TOPT(opt->len); *worst_statusp = get_worst_status(opt->status, @@ -1166,12 +960,8 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, optset_context = SETFN_OPTCOM_CHECKONLY; else /* T_NEGOTIATE */ optset_context = SETFN_OPTCOM_NEGOTIATE; - err = do_opt_check_or_negotiate(q, opt, optset_context, - &optr, worst_statusp, cr, dbobjp, first_mp); - if (err == EINPROGRESS) { - *queued_statusp = B_TRUE; - return (0); - } + do_opt_check_or_negotiate(q, opt, optset_context, + &optr, 
worst_statusp, cr, dbobjp); break; default: return (TBADFLAG); @@ -1236,7 +1026,6 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, pfi_t deffn = dbobjp->odb_deffn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; struct T_opthdr *topth; opdes_t *optd; @@ -1248,15 +1037,8 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, optd = proto_opt_lookup(reqopt->level, reqopt->name, opt_arr, opt_arr_cnt); - if (optd == NULL) { - /* - * not found - fail this one. Should not happen - * for topmost_tpiprovider as calling routine - * should have verified it. - */ - ASSERT(!topmost_tpiprovider); - return (-1); - } + /* Calling routine should have verified it it exists */ + ASSERT(optd != NULL); topth = (struct T_opthdr *)(*resptrp); topth->level = reqopt->level; @@ -1333,10 +1115,7 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, * * lookup and stuff default values of all the options of the * level specified - * Note: This expansion of T_ALLOPT should happen in - * a topmost_tpiprovider. 
*/ - ASSERT(topmost_tpiprovider); for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { if (reqopt->level != optd->opdes_level) continue; @@ -1453,8 +1232,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, pfi_t getfn = dbobjp->odb_getfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - struct T_opthdr *topth; opdes_t *optd; int optlen; @@ -1484,7 +1261,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, *resptrp -= sizeof (struct T_opthdr); } } else { /* T_ALLOPT processing */ - ASSERT(topmost_tpiprovider == B_TRUE); /* scan and get all options */ for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { /* skip other levels */ @@ -1530,14 +1306,9 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, } if (*resptrp == initptr) { /* - * getfn failed and does not want to handle this option. Maybe - * something downstream will or something upstream did. (If - * topmost_tpiprovider, initialize "status" to failure which - * can possibly change downstream). Copy the input "as is" from - * input option buffer if any to maintain transparency. + * getfn failed and does not want to handle this option. 
*/ - if (topmost_tpiprovider) - reqopt->status = T_FAILURE; + reqopt->status = T_FAILURE; bcopy(reqopt, *resptrp, reqopt->len); *resptrp += _TPI_ALIGN_TOPT(reqopt->len); *worst_statusp = get_worst_status(reqopt->status, @@ -1545,18 +1316,15 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, } } -/* ARGSUSED */ -static int +static void do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, - cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp) + cred_t *cr, optdb_obj_t *dbobjp) { pfi_t deffn = dbobjp->odb_deffn; opt_set_fn setfn = dbobjp->odb_setfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - struct T_opthdr *topth; opdes_t *optd; int error; @@ -1572,12 +1340,10 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, error = (*setfn)(q, optset_context, reqopt->level, reqopt->name, reqopt->len - sizeof (struct T_opthdr), _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth), - NULL, cr, first_mp); + NULL, cr); if (error) { /* failed - reset "*resptrp" */ *resptrp -= sizeof (struct T_opthdr); - if (error == EINPROGRESS) - return (error); } else { /* * success - "value" already filled in setfn() @@ -1594,7 +1360,6 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, } else { /* T_ALLOPT processing */ /* only for T_NEGOTIATE case */ ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE); - ASSERT(topmost_tpiprovider == B_TRUE); /* scan and set all options to default value */ for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { @@ -1670,7 +1435,7 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE, reqopt->level, optd->opdes_name, optsize, (uchar_t *)optd->opdes_defbuf, &optlen, - _TPI_TOPT_DATA(topth), NULL, cr, NULL); + _TPI_TOPT_DATA(topth), NULL, cr); if (error) { /* * 
failed, return as T_FAILURE and null value @@ -1693,20 +1458,14 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, if (*resptrp == initptr) { /* - * setfn failed and does not want to handle this option. Maybe - * something downstream will or something upstream - * did. Copy the input as is from input option buffer if any to - * maintain transparency (maybe something at a level above - * did something. + * setfn failed and does not want to handle this option. */ - if (topmost_tpiprovider) - reqopt->status = T_FAILURE; + reqopt->status = T_FAILURE; bcopy(reqopt, *resptrp, reqopt->len); *resptrp += _TPI_ALIGN_TOPT(reqopt->len); *worst_statusp = get_worst_status(reqopt->status, *worst_statusp); } - return (0); } /* @@ -1886,7 +1645,8 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, */ /* verify length */ - if (!opt_length_ok(optd, opt)) { + if (opt->len < (t_uscalar_t)sizeof (struct T_opthdr) || + !opt_length_ok(optd, opt->len - sizeof (struct T_opthdr))) { /* bad size */ if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) { /* option is absolute requirement */ @@ -1914,7 +1674,7 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, error = (*setfn)(q, optset_context, opt->level, opt->name, opt->len - (t_uscalar_t)sizeof (struct T_opthdr), _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt), - thisdg_attrs, cr, NULL); + thisdg_attrs, cr); if (olen > (int)(opt->len - sizeof (struct T_opthdr))) { /* @@ -2113,8 +1873,12 @@ opt_bloated_maxsize(opdes_t *optd) return (B_FALSE); } +/* + * optlen is the length of the option content + * Caller should check the optlen is at least sizeof (struct T_opthdr) + */ static boolean_t -opt_length_ok(opdes_t *optd, struct T_opthdr *opt) +opt_length_ok(opdes_t *optd, t_uscalar_t optlen) { /* * Verify length. @@ -2122,95 +1886,60 @@ opt_length_ok(opdes_t *optd, struct T_opthdr *opt) * less than maxlen of variable length option. 
*/ if (optd->opdes_props & OP_VARLEN) { - if (opt->len <= optd->opdes_size + - (t_uscalar_t)sizeof (struct T_opthdr)) + if (optlen <= optd->opdes_size) return (B_TRUE); } else { /* fixed length option */ - if (opt->len == optd->opdes_size + - (t_uscalar_t)sizeof (struct T_opthdr)) + if (optlen == optd->opdes_size) return (B_TRUE); } return (B_FALSE); } /* - * This routine appends a pssed in hop-by-hop option to the existing - * option (in this case a cipso label encoded in HOPOPT option). The - * passed in option is always padded. The 'reservelen' is the - * length of reserved data (label). New memory will be allocated if - * the current buffer is not large enough. Return failure if memory + * This routine manages the allocation and free of the space for + * an extension header or option. Returns failure if memory * can not be allocated. */ int -optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky, - uchar_t **optbufp, uint_t *optlenp, uint_t reservelen) +optcom_pkt_set(uchar_t *invalp, uint_t inlen, + uchar_t **optbufp, uint_t *optlenp) { uchar_t *optbuf; uchar_t *optp; - if (!sticky) { - *optbufp = invalp; - *optlenp = inlen; - return (0); - } - - if (inlen == *optlenp - reservelen) { + if (inlen == *optlenp) { /* Unchanged length - no need to reallocate */ - optp = *optbufp + reservelen; + optp = *optbufp; bcopy(invalp, optp, inlen); - if (reservelen != 0) { - /* - * Convert the NextHeader and Length of the - * passed in hop-by-hop header to pads - */ - optp[0] = IP6OPT_PADN; - optp[1] = 0; - } return (0); } - if (inlen + reservelen > 0) { + if (inlen > 0) { /* Allocate new buffer before free */ - optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP); + optbuf = kmem_alloc(inlen, KM_NOSLEEP); if (optbuf == NULL) return (ENOMEM); } else { optbuf = NULL; } - /* Copy out old reserved data (label) */ - if (reservelen > 0) - bcopy(*optbufp, optbuf, reservelen); - /* Free old buffer */ if (*optlenp != 0) kmem_free(*optbufp, *optlenp); if (inlen > 0) - 
bcopy(invalp, optbuf + reservelen, inlen); + bcopy(invalp, optbuf, inlen); - if (reservelen != 0) { - /* - * Convert the NextHeader and Length of the - * passed in hop-by-hop header to pads - */ - optbuf[reservelen] = IP6OPT_PADN; - optbuf[reservelen + 1] = 0; - /* - * Set the Length of the hop-by-hop header, number of 8 - * byte-words following the 1st 8 bytes - */ - optbuf[1] = (reservelen + inlen - 1) >> 3; - } *optbufp = optbuf; - *optlenp = inlen + reservelen; + *optlenp = inlen; return (0); } int process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen, - void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int, - int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *cr) + void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, + uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), + cred_t *cr) { struct cmsghdr *cmsg; opdes_t *optd; @@ -2254,7 +1983,7 @@ process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen, } error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level, optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg), - &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf, cr); + &outlen, (uchar_t *)CMSG_CONTENT(cmsg), optbuf, cr); if (error > 0) { return (error); } else if (outlen > len) { diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h index df4f227e95..01ca52a759 100644 --- a/usr/src/uts/common/inet/optcom.h +++ b/usr/src/uts/common/inet/optcom.h @@ -34,6 +34,7 @@ extern "C" { #if defined(_KERNEL) && defined(__STDC__) #include <inet/ipclassifier.h> + /* Options Description Structure */ typedef struct opdes_s { t_uscalar_t opdes_name; /* option name */ @@ -138,20 +139,15 @@ typedef struct opdes_s { #define OA_NO_PERMISSION(x, c) (OA_MATCHED_PRIV((x), (c)) ? 
\ ((x)->opdes_access_priv == 0) : ((x)->opdes_access_nopriv == 0)) -#define PASS_OPT_TO_IP(connp) \ - if (IPCL_IS_NONSTR(connp)) \ - return (-EINVAL) - /* * Other properties set in opdes_props field. */ -#define OP_PASSNEXT 0x1 /* to pass option to next module or not */ -#define OP_VARLEN 0x2 /* option is varible length */ -#define OP_NOT_ABSREQ 0x4 /* option is not a "absolute requirement" */ +#define OP_VARLEN 0x1 /* option is varible length */ +#define OP_NOT_ABSREQ 0x2 /* option is not a "absolute requirement" */ /* i.e. failure to negotiate does not */ /* abort primitive ("ignore" semantics ok) */ -#define OP_NODEFAULT 0x8 /* no concept of "default value" */ -#define OP_DEF_FN 0x10 /* call a "default function" to get default */ +#define OP_NODEFAULT 0x4 /* no concept of "default value" */ +#define OP_DEF_FN 0x8 /* call a "default function" to get default */ /* value, not from static table */ @@ -165,13 +161,12 @@ typedef t_uscalar_t optlevel_t; typedef int (*opt_def_fn)(queue_t *, int, int, uchar_t *); typedef int (*opt_get_fn)(queue_t *, int, int, uchar_t *); typedef int (*opt_set_fn)(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); typedef struct optdb_obj { opt_def_fn odb_deffn; /* default value function */ opt_get_fn odb_getfn; /* get function */ opt_set_fn odb_setfn; /* set function */ - boolean_t odb_topmost_tpiprovider; /* whether topmost tpi */ /* provider or downstream */ uint_t odb_opt_arr_cnt; /* count of number of options in db */ opdes_t *odb_opt_des_arr; /* option descriptors in db */ @@ -182,22 +177,6 @@ typedef struct optdb_obj { } optdb_obj_t; /* - * This is used to restart option processing. This goes inside an M_CTL - * which is prepended to the packet. IP may need to become exclusive on - * an ill for setting some options. For dg. IP_ADD_MEMBERSHIP. 
Since - * there can be more than 1 option packed in an option buffer, we need to - * remember where to restart option processing after resuming from a wait - * for exclusive condition in IP. - */ -typedef struct opt_restart_s { - struct opthdr *or_start; /* start of option buffer */ - struct opthdr *or_end; /* end of option buffer */ - struct opthdr *or_ropt; /* restart option here */ - t_uscalar_t or_worst_status; /* Used by tpi_optcom_req */ - t_uscalar_t or_type; /* svr4 or tpi optcom variant */ - int or_private; /* currently used by CGTP */ -} opt_restart_t; -/* * Values for "optset_context" parameter passed to * transport specific "setfn()" routines */ @@ -210,16 +189,12 @@ typedef struct opt_restart_s { * Function prototypes */ extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int); -extern int svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *, - boolean_t); -extern int tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *, - boolean_t); +extern void svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *); +extern void tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *); extern int tpi_optcom_buf(queue_t *, mblk_t *, t_scalar_t *, t_scalar_t, cred_t *, optdb_obj_t *, void *, int *); extern t_uscalar_t optcom_max_optsize(opdes_t *, uint_t); -extern int optcom_pkt_set(uchar_t *, uint_t, boolean_t, uchar_t **, uint_t *, - uint_t); - +extern int optcom_pkt_set(uchar_t *, uint_t, uchar_t **, uint_t *); extern int process_auxiliary_options(conn_t *, void *, t_uscalar_t, void *, optdb_obj_t *, int (*)(conn_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *); diff --git a/usr/src/uts/common/inet/proto_set.c b/usr/src/uts/common/inet/proto_set.c index 45f07d2ed3..499f046f6d 100644 --- a/usr/src/uts/common/inet/proto_set.c +++ b/usr/src/uts/common/inet/proto_set.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,27 +348,21 @@ proto_opt_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr, /* * Do a lookup of the options in the array and do permission and length checking * Returns zero if there is no error (note: for non-tpi-providers not being able - * to find the option is not an error). TPI errors are returned as -ve. + * to find the option is not an error). TPI errors are returned as negative + * numbers and errnos as positive numbers. + * If max_len is set we update it based on the max length of the option. */ int proto_opt_check(int level, int name, int len, t_uscalar_t *max_len, - opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t topmost_tpiprovider, - boolean_t negotiate, boolean_t check, cred_t *cr) + opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t negotiate, boolean_t check, + cred_t *cr) { opdes_t *optd; /* Find the option in the opt_arr. */ - if ((optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt)) == - NULL) { - /* - * Not found, that is a bad thing if - * the caller is a tpi provider - */ - if (topmost_tpiprovider) - return (-TBADOPT); - else - return (0); /* skip unmodified */ - } + optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt); + if (optd == NULL) + return (-TBADOPT); /* Additional checks dependent on operation. */ if (negotiate) { @@ -409,15 +403,12 @@ proto_opt_check(int level, int name, int len, t_uscalar_t *max_len, return (-TBADOPT); } /* - * XXX Change the comments. - * * XXX Since T_CURRENT was not there in TLI and the * official TLI inspired TPI standard, getsockopt() * API uses T_CHECK (for T_CURRENT semantics) - * The following fallthru makes sense because of its - * historical use as semantic equivalent to T_CURRENT. + * Thus T_CHECK includes the T_CURRENT semantics due to that + * historical use. 
*/ - /* FALLTHRU */ if (!OA_READ_PERMISSION(optd, cr)) { /* can't read option value */ if (!(OA_MATCHED_PRIV(optd, cr)) && diff --git a/usr/src/uts/common/inet/proto_set.h b/usr/src/uts/common/inet/proto_set.h index 8e714c7c05..488cf4d478 100644 --- a/usr/src/uts/common/inet/proto_set.h +++ b/usr/src/uts/common/inet/proto_set.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,7 +48,7 @@ extern int proto_tlitosyserr(int); extern int proto_verify_ip_addr(int, const struct sockaddr *, socklen_t); extern int proto_opt_check(int, int, int, t_uscalar_t *, opdes_t *, - uint_t, boolean_t, boolean_t, boolean_t, cred_t *); + uint_t, boolean_t, boolean_t, cred_t *); extern opdes_t *proto_opt_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t); #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 5635bb0f01..348c4f5239 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -69,87 +69,25 @@ typedef struct icmp_stack icmp_stack_t; /* Internal icmp control structure, one per open stream */ typedef struct icmp_s { - krwlock_t icmp_rwlock; /* Protects most of icmp_t */ - t_scalar_t icmp_pending_op; /* The current TPI operation */ /* - * Following fields up to icmp_ipversion protected by conn_lock. + * The addresses and ports in the conn_t and icmp_state are protected by + * conn_lock. conn_lock also protects the content of icmp_t. */ uint_t icmp_state; /* TPI state */ - in6_addr_t icmp_v6src; /* Source address of this stream */ - in6_addr_t icmp_bound_v6src; /* Explicitely bound to address */ - sin6_t icmp_v6dst; /* Connected destination */ - /* - * IP format that packets transmitted from this struct should use. - * Value can be IP4_VERSION or IPV6_VERSION. 
- */ - uchar_t icmp_ipversion; - - /* Written to only once at the time of opening the endpoint */ - sa_family_t icmp_family; /* Family from socket() call */ - - /* Following protected by icmp_rwlock */ - uint32_t icmp_max_hdr_len; /* For write offset in stream head */ - uint_t icmp_proto; - uint_t icmp_ip_snd_options_len; /* Len of IPv4 options */ - uint8_t *icmp_ip_snd_options; /* Ptr to IPv4 options */ - uint8_t icmp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */ - ipaddr_t icmp_multicast_if_addr; /* IP_MULTICAST_IF option */ - uint_t icmp_multicast_if_index; /* IPV6_MULTICAST_IF option */ - int icmp_bound_if; /* IP*_BOUND_IF option */ /* Written to only once at the time of opening the endpoint */ conn_t *icmp_connp; - /* Following protected by icmp_rwlock */ uint_t - icmp_debug : 1, /* SO_DEBUG "socket" option. */ - icmp_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - icmp_broadcast : 1, /* SO_BROADCAST "socket" option. */ - icmp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - - icmp_useloopback : 1, /* SO_USELOOPBACK "socket" option. 
*/ icmp_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */ - icmp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - icmp_unspec_source : 1, /* IP*_UNSPEC_SRC option */ - icmp_raw_checksum : 1, /* raw checksum per IPV6_CHECKSUM */ - icmp_no_tp_cksum : 1, /* icmp_proto is UDP or TCP */ - icmp_ip_recvpktinfo : 1, /* IPV[4,6]_RECVPKTINFO option */ - icmp_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ + icmp_pad_to_bit_31: 31; - icmp_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - icmp_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - icmp_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - icmp_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU option */ - - icmp_recvif:1, /* IP_RECVIF for raw sockets option */ - icmp_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS option */ - icmp_ipv6_recvrtdstopts : 1, /* Obsolete IPV6_RECVRTHDRDSTOPTS */ - icmp_old_ipv6_recvdstopts : 1, /* Old ver of IPV6_RECVDSTOPTS */ - - icmp_timestamp : 1, /* SO_TIMESTAMP "socket" option */ - - icmp_pad_to_bit_31: 11; - - uint8_t icmp_type_of_service; - uint8_t icmp_ttl; /* TTL or hoplimit */ - uint32_t icmp_checksum_off; /* user supplied checksum offset */ icmp6_filter_t *icmp_filter; /* ICMP6_FILTER option */ - ip6_pkt_t icmp_sticky_ipp; /* Sticky options */ - uint8_t *icmp_sticky_hdrs; /* Prebuilt IPv6 hdrs */ - uint_t icmp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */ - zoneid_t icmp_zoneid; /* ID of owning zone */ - uint_t icmp_label_len; /* length of security label */ - uint_t icmp_label_len_v6; /* sec. 
part of sticky opt */ - in6_addr_t icmp_v6lastdst; /* most recent destination */ - cred_t *icmp_last_cred; /* most recent credentials */ - cred_t *icmp_effective_cred; /* cred with effective label */ + /* Set at open time and never changed */ icmp_stack_t *icmp_is; /* Stack instance */ - size_t icmp_xmit_hiwat; - size_t icmp_xmit_lowat; - size_t icmp_recv_hiwat; - size_t icmp_recv_lowat; + int icmp_delayed_error; kmutex_t icmp_recv_lock; mblk_t *icmp_fallback_queue_head; @@ -165,6 +103,10 @@ typedef struct icmp_s { extern optdb_obj_t icmp_opt_obj; extern uint_t icmp_max_optsize; +extern int icmp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int icmp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *); extern mblk_t *icmp_snmp_get(queue_t *q, mblk_t *mpctl); extern void icmp_ddi_g_init(void); diff --git a/usr/src/uts/common/inet/rts_impl.h b/usr/src/uts/common/inet/rts_impl.h index de7cd8970b..b2b9080e9e 100644 --- a/usr/src/uts/common/inet/rts_impl.h +++ b/usr/src/uts/common/inet/rts_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -71,13 +71,7 @@ typedef struct rts_s { uint_t rts_state; /* Provider interface state */ uint_t rts_error; /* Routing socket error code */ uint_t rts_flag; /* Pending I/O state */ - uint_t rts_proto; /* SO_PROTOTYPE "socket" option. */ - uint_t rts_debug : 1, /* SO_DEBUG "socket" option. */ - rts_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - rts_broadcast : 1, /* SO_BROADCAST "socket" option. */ - rts_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - rts_useloopback : 1, /* SO_USELOOPBACK "socket" option. 
*/ - rts_multicast_loop : 1, /* IP_MULTICAST_LOOP option */ + uint_t rts_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */ : 0; @@ -86,30 +80,16 @@ typedef struct rts_s { /* Written to only once at the time of opening the endpoint */ conn_t *rts_connp; - /* Outbound flow control */ - size_t rts_xmit_hiwat; - size_t rts_xmit_lowat; - - /* Inbound flow control */ - size_t rts_recv_hiwat; - size_t rts_recv_lowat; - - kmutex_t rts_send_mutex; - kmutex_t rts_recv_mutex; - kcondvar_t rts_send_cv; - kcondvar_t rts_io_cv; + kmutex_t rts_recv_mutex; /* For recv flow control */ } rts_t; #define RTS_WPUT_PENDING 0x1 /* Waiting for write-side to complete */ -#define RTS_REQ_PENDING 0x1 /* For direct sockets */ #define RTS_WRW_PENDING 0x2 /* Routing socket write in progress */ -#define RTS_REQ_INPROG 0x2 /* For direct sockets */ /* * Object to represent database of options to search passed to * {sock,tpi}optcom_req() interface routine to take care of option * management and associated methods. - * XXX. These and other externs should really move to a rts header. */ extern optdb_obj_t rts_opt_obj; extern uint_t rts_max_optsize; @@ -119,7 +99,7 @@ extern void rts_ddi_g_destroy(void); extern int rts_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int rts_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr); diff --git a/usr/src/uts/common/inet/sadb.h b/usr/src/uts/common/inet/sadb.h index 6d3b9b5b27..7a45a41b85 100644 --- a/usr/src/uts/common/inet/sadb.h +++ b/usr/src/uts/common/inet/sadb.h @@ -37,14 +37,34 @@ extern "C" { #define IPSA_MAX_ADDRLEN 4 /* Max address len. (in 32-bits) for an SA. */ -/* - * Return codes of IPsec processing functions. 
- */ -typedef enum { - IPSEC_STATUS_SUCCESS = 1, - IPSEC_STATUS_FAILED = 2, - IPSEC_STATUS_PENDING = 3 -} ipsec_status_t; +#define MAXSALTSIZE 8 + +/* + * For combined mode ciphers, store the crypto_mechanism_t in the + * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS + * and nonce values change for each packet. For non-combined mode + * ciphers, these values are constant for the life of the SA. + */ +typedef struct ipsa_cm_mech_s { + crypto_mechanism_t combined_mech; + union { + CK_AES_CCM_PARAMS paramu_ccm; + CK_AES_GCM_PARAMS paramu_gcm; + } paramu; + uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)]; +#define param_ulMACSize paramu.paramu_ccm.ulMACSize +#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize +#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize +#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize +#define param_nonce paramu.paramu_ccm.nonce +#define param_authData paramu.paramu_ccm.authData +#define param_pIv paramu.paramu_gcm.ipsa_pIv +#define param_ulIvLen paramu.paramu_gcm.ulIvLen +#define param_ulIvBits paramu.paramu_gcm.ulIvBits +#define param_pAAD paramu.paramu_gcm.pAAD +#define param_ulAADLen paramu.paramu_gcm.ulAADLen +#define param_ulTagBits paramu.paramu_gcm.ulTagBits +} ipsa_cm_mech_t; /* * The Initialization Vector (also known as IV or Nonce) used to @@ -280,9 +300,13 @@ typedef struct ipsa_s { /* * Input and output processing functions called from IP. + * The mblk_t is the data; the IPsec information is in the attributes + * Returns NULL if the mblk is consumed which it is if there was + * a failure or if pending. If failure then + * the ipIfInDiscards/OutDiscards counters are increased. 
*/ - ipsec_status_t (*ipsa_output_func)(mblk_t *); - ipsec_status_t (*ipsa_input_func)(mblk_t *, void *); + mblk_t *(*ipsa_output_func)(mblk_t *, ip_xmit_attr_t *); + mblk_t *(*ipsa_input_func)(mblk_t *, void *, ip_recv_attr_t *); /* * Soft reference to paired SA @@ -290,8 +314,8 @@ typedef struct ipsa_s { uint32_t ipsa_otherspi; netstack_t *ipsa_netstack; /* Does not have a netstack_hold */ - cred_t *ipsa_cred; /* MLS: cred_t attributes */ - cred_t *ipsa_ocred; /* MLS: outer label */ + ts_label_t *ipsa_tsl; /* MLS: label attributes */ + ts_label_t *ipsa_otsl; /* MLS: outer label */ uint8_t ipsa_mac_exempt; /* MLS: mac exempt flag */ uchar_t ipsa_opt_storage[IP_MAX_OPT_LENGTH]; } ipsa_t; @@ -382,7 +406,7 @@ typedef struct ipsa_s { #define IPSA_F_EALG1 SADB_X_SAFLAGS_EALG1 /* Encrypt alg flag 1 */ #define IPSA_F_EALG2 SADB_X_SAFLAGS_EALG2 /* Encrypt alg flag 2 */ -#define IPSA_F_HW 0x200000 /* hwaccel capable SA */ +#define IPSA_F_ASYNC 0x200000 /* Call KCF asynchronously? */ #define IPSA_F_NATT_LOC SADB_X_SAFLAGS_NATT_LOC #define IPSA_F_NATT_REM SADB_X_SAFLAGS_NATT_REM #define IPSA_F_BEHIND_NAT SADB_X_SAFLAGS_NATTED @@ -503,8 +527,8 @@ typedef struct ipsacq_s { uint8_t ipsacq_icmp_type; uint8_t ipsacq_icmp_code; - /* credentials associated with triggering packet */ - cred_t *ipsacq_cred; + /* label associated with triggering packet */ + ts_label_t *ipsacq_tsl; } ipsacq_t; /* @@ -529,7 +553,7 @@ typedef struct iacqf_s { * A (network protocol, ipsec protocol) specific SADB. * (i.e., one each for {ah, esp} and {v4, v6}. * - * Keep outbound assocs about the same as ire_cache entries for now. + * Keep outbound assocs in a simple hash table for now. * One danger point, multiple SAs for a single dest will clog a bucket. * For the future, consider two-level hashing (2nd hash on IPC?), then probe. 
*/ @@ -550,7 +574,6 @@ typedef struct sadb_s typedef struct sadbp_s { uint32_t s_satype; - queue_t *s_ip_q; uint32_t *s_acquire_timeout; void (*s_acqfn)(ipsacq_t *, mblk_t *, netstack_t *); sadb_t s_v4; @@ -583,14 +606,16 @@ typedef struct templist_s #define ALL_ZEROES_PTR ((uint32_t *)&ipv6_all_zeros) /* - * Form unique id from ipsec_out_t + * Form unique id from ip_xmit_attr_t. */ - -#define SA_FORM_UNIQUE_ID(io) \ - SA_UNIQUE_ID((io)->ipsec_out_src_port, (io)->ipsec_out_dst_port, \ - ((io)->ipsec_out_tunnel ? ((io)->ipsec_out_inaf == AF_INET6 ? \ - IPPROTO_IPV6 : IPPROTO_ENCAP) : (io)->ipsec_out_proto), \ - ((io)->ipsec_out_tunnel ? (io)->ipsec_out_proto : 0)) +#define SA_FORM_UNIQUE_ID(ixa) \ + SA_UNIQUE_ID((ixa)->ixa_ipsec_src_port, (ixa)->ixa_ipsec_dst_port, \ + (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \ + ((ixa)->ixa_ipsec_inaf == AF_INET6 ? \ + IPPROTO_IPV6 : IPPROTO_ENCAP) : \ + (ixa)->ixa_ipsec_proto), \ + (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \ + (ixa)->ixa_ipsec_proto : 0)) /* * This macro is used to generate unique ids (along with the addresses, both @@ -698,8 +723,8 @@ boolean_t sadb_match_query(ipsa_query_t *q, ipsa_t *sa); /* SA retrieval (inbound and outbound) */ ipsa_t *ipsec_getassocbyspi(isaf_t *, uint32_t, uint32_t *, uint32_t *, sa_family_t); -ipsa_t *ipsec_getassocbyconn(isaf_t *, ipsec_out_t *, uint32_t *, uint32_t *, - sa_family_t, uint8_t, cred_t *); +ipsa_t *ipsec_getassocbyconn(isaf_t *, ip_xmit_attr_t *, uint32_t *, uint32_t *, + sa_family_t, uint8_t, ts_label_t *); /* SA insertion. 
*/ int sadb_insertassoc(ipsa_t *, isaf_t *); @@ -727,9 +752,9 @@ boolean_t sadb_addrfix(keysock_in_t *, queue_t *, mblk_t *, netstack_t *); int sadb_addrset(ire_t *); int sadb_delget_sa(mblk_t *, keysock_in_t *, sadbp_t *, int *, queue_t *, uint8_t); -int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *, - queue_t *); -int sadb_common_add(queue_t *, queue_t *, mblk_t *, sadb_msg_t *, + +int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *); +int sadb_common_add(queue_t *, mblk_t *, sadb_msg_t *, keysock_in_t *, isaf_t *, isaf_t *, ipsa_t *, boolean_t, boolean_t, int *, netstack_t *, sadbp_t *); void sadb_set_usetime(ipsa_t *); @@ -737,7 +762,7 @@ boolean_t sadb_age_bytes(queue_t *, ipsa_t *, uint64_t, boolean_t); int sadb_update_sa(mblk_t *, keysock_in_t *, mblk_t **, sadbp_t *, int *, queue_t *, int (*)(mblk_t *, keysock_in_t *, int *, netstack_t *), netstack_t *, uint8_t); -void sadb_acquire(mblk_t *, ipsec_out_t *, boolean_t, boolean_t); +void sadb_acquire(mblk_t *, ip_xmit_attr_t *, boolean_t, boolean_t); void gcm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *, crypto_data_t *); void ccm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *, @@ -754,16 +779,17 @@ boolean_t sadb_replay_check(ipsa_t *, uint32_t); boolean_t sadb_replay_peek(ipsa_t *, uint32_t); int sadb_dump(queue_t *, mblk_t *, keysock_in_t *, sadb_t *); void sadb_replay_delete(ipsa_t *); -void sadb_ager(sadb_t *, queue_t *, queue_t *, int, netstack_t *); +void sadb_ager(sadb_t *, queue_t *, int, netstack_t *); timeout_id_t sadb_retimeout(hrtime_t, queue_t *, void (*)(void *), void *, uint_t *, uint_t, short); void sadb_sa_refrele(void *target); -boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, netstack_t *); +boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, ip_recv_attr_t *); mblk_t *sadb_clear_lpkt(ipsa_t *); -void sadb_buf_pkt(ipsa_t *, mblk_t *, netstack_t *); +void sadb_buf_pkt(ipsa_t *, mblk_t *, ip_recv_attr_t *); void 
sadb_clear_buf_pkt(void *ipkt); +/* Note that buf_pkt is the product of ip_recv_attr_to_mblk() */ #define HANDLE_BUF_PKT(taskq, stack, dropper, buf_pkt) \ { \ if (buf_pkt != NULL) { \ @@ -774,8 +800,9 @@ void sadb_clear_buf_pkt(void *ipkt); while (buf_pkt != NULL) { \ tmp = buf_pkt->b_next; \ buf_pkt->b_next = NULL; \ + buf_pkt = ip_recv_attr_free_mblk(buf_pkt); \ ip_drop_packet(buf_pkt, B_TRUE, NULL, \ - NULL, DROPPER(stack, \ + DROPPER(stack, \ ipds_sadb_inidle_timeout), \ &dropper); \ buf_pkt = tmp; \ @@ -785,24 +812,8 @@ void sadb_clear_buf_pkt(void *ipkt); } \ /* - * Hw accel-related calls (downloading sadb to driver) - */ -void sadb_ill_download(ill_t *, uint_t); -mblk_t *sadb_fmt_sa_req(uint_t, uint_t, ipsa_t *, boolean_t); -/* - * Sub-set of the IPsec hardware acceleration capabilities functions - * implemented by ip_if.c - */ -extern boolean_t ipsec_capab_match(ill_t *, uint_t, boolean_t, ipsa_t *, - netstack_t *); -extern void ill_ipsec_capab_send_all(uint_t, mblk_t *, ipsa_t *, - netstack_t *); - - -/* - * One IPsec -> IP linking routine, and two IPsec rate-limiting routines. + * Two IPsec rate-limiting routines. */ -extern boolean_t sadb_t_bind_req(queue_t *, int); /*PRINTFLIKE6*/ extern void ipsec_rl_strlog(netstack_t *, short, short, char, ushort_t, char *, ...) 
@@ -818,7 +829,8 @@ extern void ipsec_assocfailure(short, short, char, ushort_t, char *, uint32_t, typedef enum ipsec_algtype { IPSEC_ALG_AUTH = 0, - IPSEC_ALG_ENCR = 1 + IPSEC_ALG_ENCR = 1, + IPSEC_ALG_ALL = 2 } ipsec_algtype_t; /* @@ -886,11 +898,10 @@ extern void ipsec_alg_fix_min_max(ipsec_alginfo_t *, ipsec_algtype_t, extern void alg_flag_check(ipsec_alginfo_t *); extern void ipsec_alg_free(ipsec_alginfo_t *); extern void ipsec_register_prov_update(void); -extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t, - netstack_t *); +extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t, netstack_t *); -extern int sadb_sens_len_from_cred(cred_t *); -extern void sadb_sens_from_cred(sadb_sens_t *, int, cred_t *, int); +extern int sadb_sens_len_from_label(ts_label_t *); +extern void sadb_sens_from_label(sadb_sens_t *, int, ts_label_t *, int); /* * Context templates management. diff --git a/usr/src/uts/common/inet/sctp/sctp.c b/usr/src/uts/common/inet/sctp/sctp.c index 00fc6cda42..d444e1f10e 100644 --- a/usr/src/uts/common/inet/sctp/sctp.c +++ b/usr/src/uts/common/inet/sctp/sctp.c @@ -56,6 +56,8 @@ #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/mi.h> #include <inet/mib2.h> @@ -74,12 +76,6 @@ int sctpdebug; sin6_t sctp_sin6_null; /* Zero address for quick clears */ -/* - * Have to ensure that sctp_g_q_close is not done by an - * interrupt thread. 
- */ -static taskq_t *sctp_taskq; - static void sctp_closei_local(sctp_t *sctp); static int sctp_init_values(sctp_t *, sctp_t *, int); static void sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp); @@ -91,12 +87,10 @@ static void sctp_conn_cache_fini(); static int sctp_conn_cache_constructor(); static void sctp_conn_cache_destructor(); static void sctp_conn_clear(conn_t *); -void sctp_g_q_setup(sctp_stack_t *); -void sctp_g_q_create(sctp_stack_t *); -void sctp_g_q_destroy(sctp_stack_t *); +static void sctp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); static void *sctp_stack_init(netstackid_t stackid, netstack_t *ns); -static void sctp_stack_shutdown(netstackid_t stackid, void *arg); static void sctp_stack_fini(netstackid_t stackid, void *arg); /* @@ -178,8 +172,8 @@ sctp_create_eager(sctp_t *psctp) { sctp_t *sctp; mblk_t *ack_mp, *hb_mp; - conn_t *connp, *pconnp; - cred_t *credp; + conn_t *connp; + cred_t *credp; sctp_stack_t *sctps = psctp->sctp_sctps; if ((connp = ipcl_conn_create(IPCL_SCTPCONN, KM_NOSLEEP, @@ -187,8 +181,6 @@ sctp_create_eager(sctp_t *psctp) return (NULL); } - connp->conn_ulp_labeled = is_system_labeled(); - sctp = CONN2SCTP(connp); sctp->sctp_sctps = sctps; @@ -200,7 +192,6 @@ sctp_create_eager(sctp_t *psctp) freeb(ack_mp); sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); kmem_cache_free(sctp_conn_cache, connp); return (NULL); } @@ -208,43 +199,20 @@ sctp_create_eager(sctp_t *psctp) sctp->sctp_ack_mp = ack_mp; sctp->sctp_heartbeat_mp = hb_mp; - /* Inherit information from the "parent" */ - sctp->sctp_ipversion = psctp->sctp_ipversion; - sctp->sctp_family = psctp->sctp_family; - pconnp = psctp->sctp_connp; - connp->conn_af_isv6 = pconnp->conn_af_isv6; - connp->conn_pkt_isv6 = pconnp->conn_pkt_isv6; - connp->conn_ipv6_v6only = pconnp->conn_ipv6_v6only; if (sctp_init_values(sctp, psctp, KM_NOSLEEP) != 0) { freeb(ack_mp); freeb(hb_mp); sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - 
SCTP_G_Q_REFRELE(sctps); kmem_cache_free(sctp_conn_cache, connp); return (NULL); } - /* - * If the parent is multilevel, then we'll fix up the remote cred - * when we do sctp_accept_comm. - */ - if ((credp = pconnp->conn_cred) != NULL) { + if ((credp = psctp->sctp_connp->conn_cred) != NULL) { connp->conn_cred = credp; crhold(credp); - /* - * If the caller has the process-wide flag set, then default to - * MAC exempt mode. This allows read-down to unlabeled hosts. - */ - if (getpflags(NET_MAC_AWARE, credp) != 0) - connp->conn_mac_mode = CONN_MAC_AWARE; } - connp->conn_allzones = pconnp->conn_allzones; - connp->conn_zoneid = pconnp->conn_zoneid; - sctp->sctp_cpid = psctp->sctp_cpid; - sctp->sctp_open_time = lbolt64; - sctp->sctp_mss = psctp->sctp_mss; sctp->sctp_detached = B_TRUE; /* @@ -263,11 +231,6 @@ void sctp_clean_death(sctp_t *sctp, int err) { ASSERT(sctp != NULL); - ASSERT((sctp->sctp_family == AF_INET && - sctp->sctp_ipversion == IPV4_VERSION) || - (sctp->sctp_family == AF_INET6 && - (sctp->sctp_ipversion == IPV4_VERSION || - sctp->sctp_ipversion == IPV6_VERSION))); dprint(3, ("sctp_clean_death %p, state %d\n", (void *)sctp, sctp->sctp_state)); @@ -328,7 +291,8 @@ sctp_clean_death(sctp_t *sctp, int err) int sctp_disconnect(sctp_t *sctp) { - int error = 0; + int error = 0; + conn_t *connp = sctp->sctp_connp; dprint(3, ("sctp_disconnect %p, state %d\n", (void *)sctp, sctp->sctp_state)); @@ -358,7 +322,7 @@ sctp_disconnect(sctp_t *sctp) * If SO_LINGER has set a zero linger time, terminate the * association and send an ABORT. 
*/ - if (sctp->sctp_linger && sctp->sctp_lingertime == 0) { + if (connp->conn_linger && connp->conn_lingertime == 0) { sctp_user_abort(sctp, NULL); WAKE_SCTP(sctp); return (error); @@ -382,7 +346,7 @@ sctp_disconnect(sctp_t *sctp) sctp_send_shutdown(sctp, 0); /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); /* * If lingering on close then wait until the shutdown @@ -391,21 +355,15 @@ sctp_disconnect(sctp_t *sctp) * can be called more than once. Make sure that only * one thread waits. */ - if (sctp->sctp_linger && sctp->sctp_lingertime > 0 && + if (connp->conn_linger && connp->conn_lingertime > 0 && sctp->sctp_state >= SCTPS_ESTABLISHED && !sctp->sctp_lingering) { clock_t stoptime; /* in ticks */ clock_t ret; - /* - * Process the sendq to send the SHUTDOWN out - * before waiting. - */ - sctp_process_sendq(sctp); - sctp->sctp_lingering = 1; sctp->sctp_client_errno = 0; - stoptime = lbolt + sctp->sctp_lingertime; + stoptime = lbolt + connp->conn_lingertime * hz; mutex_enter(&sctp->sctp_lock); sctp->sctp_running = B_FALSE; @@ -429,7 +387,6 @@ sctp_disconnect(sctp_t *sctp) } WAKE_SCTP(sctp); - sctp_process_sendq(sctp); return (error); } @@ -493,7 +450,6 @@ static void sctp_closei_local(sctp_t *sctp) { mblk_t *mp; - ire_t *ire = NULL; conn_t *connp = sctp->sctp_connp; /* Sanity check, don't do the same thing twice. */ @@ -516,11 +472,7 @@ sctp_closei_local(sctp_t *sctp) /* Set the CONN_CLOSING flag so that IP will not cache IRE again. */ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CLOSING; - ire = connp->conn_ire_cache; - connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); - if (ire != NULL) - IRE_REFRELE_NOTR(ire); /* Remove from all hashes. 
*/ sctp_bind_hash_remove(sctp); @@ -534,14 +486,12 @@ sctp_closei_local(sctp_t *sctp) */ mutex_enter(&sctp->sctp_recvq_lock); while ((mp = sctp->sctp_recvq) != NULL) { - mblk_t *ipsec_mp; - sctp->sctp_recvq = mp->b_next; mp->b_next = NULL; - if ((ipsec_mp = mp->b_prev) != NULL) { - freeb(ipsec_mp); - mp->b_prev = NULL; - } + + if (ip_recv_attr_is_mblk(mp)) + mp = ip_recv_attr_free_mblk(mp); + freemsg(mp); } mutex_exit(&sctp->sctp_recvq_lock); @@ -668,7 +618,7 @@ sctp_free(conn_t *connp) SCTP_UNLINK(sctp, sctps); ASSERT(connp->conn_ref == 0); - ASSERT(connp->conn_ulp == IPPROTO_SCTP); + ASSERT(connp->conn_proto == IPPROTO_SCTP); ASSERT(!MUTEX_HELD(&sctp->sctp_reflock)); ASSERT(sctp->sctp_refcnt == 0); @@ -723,8 +673,6 @@ sctp_free(conn_t *connp) list_destroy(&sctp->sctp_saddrs[cnt].sctp_ipif_list); } - ip6_pkt_free(&sctp->sctp_sticky_ipp); - if (sctp->sctp_hopopts != NULL) { mi_free(sctp->sctp_hopopts); sctp->sctp_hopopts = NULL; @@ -737,12 +685,12 @@ sctp_free(conn_t *connp) sctp->sctp_dstoptslen = 0; } ASSERT(sctp->sctp_dstoptslen == 0); - if (sctp->sctp_rtdstopts != NULL) { - mi_free(sctp->sctp_rtdstopts); - sctp->sctp_rtdstopts = NULL; - sctp->sctp_rtdstoptslen = 0; + if (sctp->sctp_rthdrdstopts != NULL) { + mi_free(sctp->sctp_rthdrdstopts); + sctp->sctp_rthdrdstopts = NULL; + sctp->sctp_rthdrdstoptslen = 0; } - ASSERT(sctp->sctp_rtdstoptslen == 0); + ASSERT(sctp->sctp_rthdrdstoptslen == 0); if (sctp->sctp_rthdr != NULL) { mi_free(sctp->sctp_rthdr); sctp->sctp_rthdr = NULL; @@ -806,9 +754,7 @@ sctp_free(conn_t *connp) sctp->sctp_v6label_len = 0; sctp->sctp_v4label_len = 0; - /* Every sctp_t holds one reference on the default queue */ sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); sctp_conn_clear(connp); kmem_cache_free(sctp_conn_cache, connp); @@ -822,10 +768,12 @@ sctp_display(sctp_t *sctp, char *sup_buf) char buf1[30]; static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; char *cp; + conn_t *connp; if (sctp == NULL) return ("NULL_SCTP"); + connp = 
sctp->sctp_connp; buf = (sup_buf != NULL) ? sup_buf : priv_buf; switch (sctp->sctp_state) { @@ -865,7 +813,7 @@ sctp_display(sctp_t *sctp, char *sup_buf) break; } (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(sctp->sctp_lport), ntohs(sctp->sctp_fport), cp); + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); return (buf); } @@ -880,13 +828,9 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) int err; int cnt; sctp_stack_t *sctps = sctp->sctp_sctps; - conn_t *connp, *pconnp; + conn_t *connp; - ASSERT((sctp->sctp_family == AF_INET && - sctp->sctp_ipversion == IPV4_VERSION) || - (sctp->sctp_family == AF_INET6 && - (sctp->sctp_ipversion == IPV4_VERSION || - sctp->sctp_ipversion == IPV6_VERSION))); + connp = sctp->sctp_connp; sctp->sctp_nsaddrs = 0; for (cnt = 0; cnt < SCTP_IPIF_HASH; cnt++) { @@ -895,7 +839,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sizeof (sctp_saddr_ipif_t), offsetof(sctp_saddr_ipif_t, saddr_ipif)); } - sctp->sctp_ports = 0; + connp->conn_ports = 0; sctp->sctp_running = B_FALSE; sctp->sctp_state = SCTPS_IDLE; @@ -925,51 +869,16 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) if (psctp != NULL) { /* * Inherit from parent + * + * Start by inheriting from the conn_t, including conn_ixa and + * conn_xmit_ipp. 
*/ - sctp->sctp_iphc = kmem_zalloc(psctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc == NULL) { - sctp->sctp_iphc_len = 0; - err = ENOMEM; - goto failure; - } - sctp->sctp_iphc_len = psctp->sctp_iphc_len; - sctp->sctp_hdr_len = psctp->sctp_hdr_len; - - sctp->sctp_iphc6 = kmem_zalloc(psctp->sctp_iphc6_len, sleep); - if (sctp->sctp_iphc6 == NULL) { - sctp->sctp_iphc6_len = 0; - err = ENOMEM; + err = conn_inherit_parent(psctp->sctp_connp, connp); + if (err != 0) goto failure; - } - sctp->sctp_iphc6_len = psctp->sctp_iphc6_len; - sctp->sctp_hdr6_len = psctp->sctp_hdr6_len; - - sctp->sctp_ip_hdr_len = psctp->sctp_ip_hdr_len; - sctp->sctp_ip_hdr6_len = psctp->sctp_ip_hdr6_len; - - /* - * Copy the IP+SCTP header templates from listener - */ - bcopy(psctp->sctp_iphc, sctp->sctp_iphc, - psctp->sctp_hdr_len); - sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc; - sctp->sctp_sctph = (sctp_hdr_t *)(sctp->sctp_iphc + - sctp->sctp_ip_hdr_len); - - bcopy(psctp->sctp_iphc6, sctp->sctp_iphc6, - psctp->sctp_hdr6_len); - if (((ip6i_t *)(sctp->sctp_iphc6))->ip6i_nxt == IPPROTO_RAW) { - sctp->sctp_ip6h = (ip6_t *)(sctp->sctp_iphc6 + - sizeof (ip6i_t)); - } else { - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; - } - sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 + - sctp->sctp_ip_hdr6_len); sctp->sctp_cookie_lifetime = psctp->sctp_cookie_lifetime; - sctp->sctp_xmit_lowater = psctp->sctp_xmit_lowater; - sctp->sctp_xmit_hiwater = psctp->sctp_xmit_hiwater; + sctp->sctp_cwnd_max = psctp->sctp_cwnd_max; sctp->sctp_rwnd = psctp->sctp_rwnd; sctp->sctp_irwnd = psctp->sctp_rwnd; @@ -996,43 +905,23 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sctp->sctp_tx_adaptation_code = psctp->sctp_tx_adaptation_code; /* xxx should be a better way to copy these flags xxx */ - sctp->sctp_debug = psctp->sctp_debug; sctp->sctp_bound_to_all = psctp->sctp_bound_to_all; sctp->sctp_cansleep = psctp->sctp_cansleep; sctp->sctp_send_adaptation = psctp->sctp_send_adaptation; sctp->sctp_ndelay = 
psctp->sctp_ndelay; sctp->sctp_events = psctp->sctp_events; - sctp->sctp_ipv6_recvancillary = psctp->sctp_ipv6_recvancillary; - - /* Copy IP-layer options */ - connp = sctp->sctp_connp; - pconnp = psctp->sctp_connp; - - connp->conn_broadcast = pconnp->conn_broadcast; - connp->conn_loopback = pconnp->conn_loopback; - connp->conn_dontroute = pconnp->conn_dontroute; - connp->conn_reuseaddr = pconnp->conn_reuseaddr; - } else { /* - * Initialize the header template - */ - if ((err = sctp_header_init_ipv4(sctp, sleep)) != 0) { - goto failure; - } - if ((err = sctp_header_init_ipv6(sctp, sleep)) != 0) { - goto failure; - } - - /* * Set to system defaults */ sctp->sctp_cookie_lifetime = MSEC_TO_TICK(sctps->sctps_cookie_life); - sctp->sctp_xmit_lowater = sctps->sctps_xmit_lowat; - sctp->sctp_xmit_hiwater = sctps->sctps_xmit_hiwat; + connp->conn_sndlowat = sctps->sctps_xmit_lowat; + connp->conn_sndbuf = sctps->sctps_xmit_hiwat; + connp->conn_rcvbuf = sctps->sctps_recv_hiwat; + sctp->sctp_cwnd_max = sctps->sctps_cwnd_max_; - sctp->sctp_rwnd = sctps->sctps_recv_hiwat; + sctp->sctp_rwnd = connp->conn_rcvbuf; sctp->sctp_irwnd = sctp->sctp_rwnd; sctp->sctp_pd_point = sctp->sctp_rwnd; sctp->sctp_rto_max = MSEC_TO_TICK(sctps->sctps_rto_maxg); @@ -1049,13 +938,28 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sctp->sctp_hb_interval = MSEC_TO_TICK(sctps->sctps_heartbeat_interval); + + if (connp->conn_family == AF_INET) + connp->conn_default_ttl = sctps->sctps_ipv4_ttl; + else + connp->conn_default_ttl = sctps->sctps_ipv6_hoplimit; + + connp->conn_xmit_ipp.ipp_unicast_hops = + connp->conn_default_ttl; + + /* + * Initialize the header template + */ + if ((err = sctp_build_hdrs(sctp, sleep)) != 0) { + goto failure; + } } + sctp->sctp_understands_asconf = B_TRUE; sctp->sctp_understands_addip = B_TRUE; sctp->sctp_prsctp_aware = B_FALSE; sctp->sctp_connp->conn_ref = 1; - sctp->sctp_connp->conn_fully_bound = B_FALSE; sctp->sctp_prsctpdrop = 0; sctp->sctp_msgcount = 0; @@ 
-1063,14 +967,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) return (0); failure: - if (sctp->sctp_iphc != NULL) { - kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len); - sctp->sctp_iphc = NULL; - } - if (sctp->sctp_iphc6 != NULL) { - kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len); - sctp->sctp_iphc6 = NULL; - } + sctp_headers_free(sctp); return (err); } @@ -1102,8 +999,122 @@ sctp_icmp_verf(sctp_t *sctp, sctp_hdr_t *sh, mblk_t *mp) } /* + * Update the SCTP state according to change of PMTU. + * + * Path MTU might have changed by either increase or decrease, so need to + * adjust the MSS based on the value of ixa_pmtu. + */ +static void +sctp_update_pmtu(sctp_t *sctp, sctp_faddr_t *fp, boolean_t decrease_only) +{ + uint32_t pmtu; + int32_t mss; + ip_xmit_attr_t *ixa = fp->ixa; + + if (sctp->sctp_state < SCTPS_ESTABLISHED) + return; + + /* + * Always call ip_get_pmtu() to make sure that IP has updated + * ixa_flags properly. + */ + pmtu = ip_get_pmtu(ixa); + + /* + * Calculate the MSS by decreasing the PMTU by sctp_hdr_len and + * IPsec overhead if applied. Make sure to use the most recent + * IPsec information. + */ + mss = pmtu - conn_ipsec_length(sctp->sctp_connp); + if (ixa->ixa_flags & IXAF_IS_IPV4) + mss -= sctp->sctp_hdr_len; + else + mss -= sctp->sctp_hdr6_len; + + /* + * Nothing to change, so just return. + */ + if (mss == fp->sfa_pmss) + return; + + /* + * Currently, for ICMP errors, only PMTU decrease is handled. + */ + if (mss > fp->sfa_pmss && decrease_only) + return; + +#ifdef DEBUG + (void) printf("sctp_update_pmtu mss from %d to %d\n", + fp->sfa_pmss, mss); +#endif + DTRACE_PROBE2(sctp_update_pmtu, int32_t, fp->sfa_pmss, uint32_t, mss); + + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * Make sure that sfa_pmss is a multiple of + * SCTP_ALIGN. 
+ */ + fp->sfa_pmss = mss & ~(SCTP_ALIGN - 1); + fp->pmtu_discovered = 1; + +#ifdef notyet + if (mss < sctp->sctp_sctps->sctps_mss_min) + ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL; +#endif + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + + /* + * If below the min size then ip_get_pmtu cleared IXAF_PMTU_IPV4_DF. + * Make sure to clear IXAF_DONTFRAG, which is used by IP to decide + * whether to fragment the packet. + */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (!(ixa->ixa_flags & IXAF_PMTU_IPV4_DF)) { + fp->df = B_FALSE; + if (fp == sctp->sctp_current) { + sctp->sctp_ipha-> + ipha_fragment_offset_and_flags = 0; + } + } + } +} + +/* + * Notify function registered with ip_xmit_attr_t. It's called in the context + * of conn_ip_output so it's safe to update the SCTP state. + * Currently only used for pmtu changes. + */ +/* ARGSUSED1 */ +static void +sctp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + sctp_t *sctp = (sctp_t *)arg; + sctp_faddr_t *fp; + + switch (ntype) { + case IXAN_PMTU: + /* Find the faddr based on the ip_xmit_attr_t pointer */ + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { + if (fp->ixa == ixa) + break; + } + if (fp != NULL) + sctp_update_pmtu(sctp, fp, B_FALSE); + break; + default: + break; + } +} + +/* * sctp_icmp_error is called by sctp_input() to process ICMP error messages - * passed up by IP. The queue is the default queue. We need to find a sctp_t + * passed up by IP. We need to find a sctp_t * that corresponds to the returned datagram. Passes the message back in on * the correct queue once it has located the connection. 
* Assumes that IP has pulled up everything up to and including @@ -1116,8 +1127,6 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) ipha_t *ipha; int iph_hdr_length; sctp_hdr_t *sctph; - mblk_t *first_mp; - uint32_t new_mtu; in6_addr_t dst; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -1125,12 +1134,10 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) dprint(1, ("sctp_icmp_error: sctp=%p, mp=%p\n", (void *)sctp, (void *)mp)); - first_mp = mp; - ipha = (ipha_t *)mp->b_rptr; if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - sctp_icmp_error_ipv6(sctp, first_mp); + sctp_icmp_error_ipv6(sctp, mp); return; } @@ -1144,7 +1151,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) /* first_mp must expose the full sctp header. */ if ((uchar_t *)(sctph + 1) >= mp->b_wptr) { /* not enough data for SCTP header */ - freemsg(first_mp); + freemsg(mp); return; } @@ -1175,19 +1182,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) if (fp == NULL) { break; } - - new_mtu = ntohs(icmph->icmph_du_mtu); - - if (new_mtu - sctp->sctp_hdr_len >= fp->sfa_pmss) - break; - - /* - * Make sure that sfa_pmss is a multiple of - * SCTP_ALIGN. - */ - fp->sfa_pmss = (new_mtu - sctp->sctp_hdr_len) & - ~(SCTP_ALIGN - 1); - fp->pmtu_discovered = 1; + sctp_update_pmtu(sctp, fp, B_TRUE); /* * It is possible, even likely that a fast retransmit * attempt has been dropped by ip as a result of this @@ -1229,7 +1224,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) break; } } - freemsg(first_mp); + freemsg(mp); } /* @@ -1246,7 +1241,6 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) uint16_t iph_hdr_length; sctp_hdr_t *sctpha; uint8_t *nexthdrp; - uint32_t new_mtu; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -1294,16 +1288,16 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) break; } - new_mtu = ntohs(icmp6->icmp6_mtu); - - if (new_mtu - sctp->sctp_hdr6_len >= fp->sfa_pmss) - break; - - /* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. 
*/ - fp->sfa_pmss = (new_mtu - sctp->sctp_hdr6_len) & - ~(SCTP_ALIGN - 1); - fp->pmtu_discovered = 1; - + sctp_update_pmtu(sctp, fp, B_TRUE); + /* + * It is possible, even likely that a fast retransmit + * attempt has been dropped by ip as a result of this + * error, retransmission bundles as much as possible. + * A retransmit here prevents significant delays waiting + * on the timer. Analogous to behaviour of TCP after + * ICMP too big. + */ + sctp_rexmit(sctp, fp); break; case ICMP6_DST_UNREACH: @@ -1366,12 +1360,12 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) * If parent pointer is passed in, inherit settings from it. */ sctp_t * -sctp_create(void *ulpd, sctp_t *parent, int family, int flags, +sctp_create(void *ulpd, sctp_t *parent, int family, int type, int flags, sock_upcalls_t *upcalls, sctp_sockbuf_limits_t *sbl, cred_t *credp) { sctp_t *sctp, *psctp; - conn_t *sctp_connp; + conn_t *connp; mblk_t *ack_mp, *hb_mp; int sleep = flags & SCTP_CAN_BLOCK ? KM_SLEEP : KM_NOSLEEP; zoneid_t zoneid; @@ -1403,18 +1397,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, zoneid = GLOBAL_ZONEID; else zoneid = crgetzoneid(credp); - - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (sctps->sctps_g_q == NULL && - sctps->sctps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - sctp_g_q_setup(sctps); - } } - if ((sctp_connp = ipcl_conn_create(IPCL_SCTPCONN, sleep, + if ((connp = ipcl_conn_create(IPCL_SCTPCONN, sleep, sctps->sctps_netstack)) == NULL) { netstack_rele(sctps->sctps_netstack); SCTP_KSTAT(sctps, sctp_conn_create); @@ -1425,49 +1409,38 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, * done at top of sctp_create. 
*/ netstack_rele(sctps->sctps_netstack); - sctp = CONN2SCTP(sctp_connp); + sctp = CONN2SCTP(connp); sctp->sctp_sctps = sctps; - sctp_connp->conn_ulp_labeled = is_system_labeled(); if ((ack_mp = sctp_timer_alloc(sctp, sctp_ack_timer, sleep)) == NULL || (hb_mp = sctp_timer_alloc(sctp, sctp_heartbeat_timer, sleep)) == NULL) { if (ack_mp != NULL) freeb(ack_mp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } sctp->sctp_ack_mp = ack_mp; sctp->sctp_heartbeat_mp = hb_mp; - switch (family) { - case AF_INET6: - sctp_connp->conn_af_isv6 = B_TRUE; - sctp->sctp_ipversion = IPV6_VERSION; - sctp->sctp_family = AF_INET6; - break; + /* + * Have conn_ip_output drop packets should our outer source + * go invalid, and tell us about mtu changes. + */ + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU; + connp->conn_family = family; + connp->conn_so_type = type; - case AF_INET: - sctp_connp->conn_af_isv6 = B_FALSE; - sctp_connp->conn_pkt_isv6 = B_FALSE; - sctp->sctp_ipversion = IPV4_VERSION; - sctp->sctp_family = AF_INET; - break; - default: - ASSERT(0); - break; - } if (sctp_init_values(sctp, psctp, sleep) != 0) { freeb(ack_mp); freeb(hb_mp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } sctp->sctp_cansleep = ((flags & SCTP_CAN_BLOCK) == SCTP_CAN_BLOCK); @@ -1476,6 +1449,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, sctp->sctp_hdr6_len : sctp->sctp_hdr_len); if (psctp != NULL) { + conn_t *pconnp = psctp->sctp_connp; + RUN_SCTP(psctp); /* * Inherit local address list, local port. 
Parent is either @@ -1488,10 +1463,9 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, freeb(ack_mp); freeb(hb_mp); sctp_headers_free(sctp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } @@ -1500,28 +1474,32 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, * followed by sctp_connect(). So don't add this guy to * bind hash. */ - sctp->sctp_lport = psctp->sctp_lport; + connp->conn_lport = pconnp->conn_lport; sctp->sctp_state = SCTPS_BOUND; - sctp->sctp_allzones = psctp->sctp_allzones; - sctp->sctp_zoneid = psctp->sctp_zoneid; WAKE_SCTP(psctp); } else { - sctp->sctp_zoneid = zoneid; - } - - sctp->sctp_cpid = curproc->p_pid; - sctp->sctp_open_time = lbolt64; + ASSERT(connp->conn_cred == NULL); + connp->conn_zoneid = zoneid; + /* + * conn_allzones can not be set this early, hence + * no IPCL_ZONEID + */ + connp->conn_ixa->ixa_zoneid = zoneid; + connp->conn_open_time = lbolt64; + connp->conn_cred = credp; + crhold(credp); + connp->conn_cpid = curproc->p_pid; - ASSERT(sctp_connp->conn_cred == NULL); - sctp_connp->conn_cred = credp; - crhold(credp); + /* + * If the caller has the process-wide flag set, then default to + * MAC exempt mode. This allows read-down to unlabeled hosts. + */ + if (getpflags(NET_MAC_AWARE, credp) != 0) + connp->conn_mac_mode = CONN_MAC_AWARE; - /* - * If the caller has the process-wide flag set, then default to MAC - * exempt mode. This allows read-down to unlabeled hosts. 
- */ - if (getpflags(NET_MAC_AWARE, credp) != 0) - sctp_connp->conn_mac_mode = CONN_MAC_AWARE; + connp->conn_zone_is_global = + (crgetzoneid(credp) == GLOBAL_ZONEID); + } /* Initialize SCTP instance values, our verf tag must never be 0 */ (void) random_get_pseudo_bytes((uint8_t *)&sctp->sctp_lvtag, @@ -1536,20 +1514,17 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, sctp->sctp_adv_pap = sctp->sctp_lastack_rxd; /* Information required by upper layer */ - if (ulpd != NULL) { - sctp->sctp_ulpd = ulpd; - - ASSERT(upcalls != NULL); - sctp->sctp_upcalls = upcalls; - ASSERT(sbl != NULL); - /* Fill in the socket buffer limits for sctpsockfs */ - sbl->sbl_txlowat = sctp->sctp_xmit_lowater; - sbl->sbl_txbuf = sctp->sctp_xmit_hiwater; - sbl->sbl_rxbuf = sctp->sctp_rwnd; - sbl->sbl_rxlowat = SCTP_RECV_LOWATER; - } - /* If no ulpd, must be creating the default sctp */ - ASSERT(ulpd != NULL || sctps->sctps_gsctp == NULL); + ASSERT(ulpd != NULL); + sctp->sctp_ulpd = ulpd; + + ASSERT(upcalls != NULL); + sctp->sctp_upcalls = upcalls; + ASSERT(sbl != NULL); + /* Fill in the socket buffer limits for sctpsockfs */ + sbl->sbl_txlowat = connp->conn_sndlowat; + sbl->sbl_txbuf = connp->conn_sndbuf; + sbl->sbl_rxbuf = sctp->sctp_rwnd; + sbl->sbl_rxlowat = SCTP_RECV_LOWATER; /* Insert this in the global list. */ SCTP_LINK(sctp, sctps); @@ -1557,232 +1532,6 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, return (sctp); } -/* - * Make sure we wait until the default queue is setup, yet allow - * sctp_g_q_create() to open a SCTP stream. - * We need to allow sctp_g_q_create() do do an open - * of sctp, hence we compare curhread. - * All others have to wait until the sctps_g_q has been - * setup. 
- */ -void -sctp_g_q_setup(sctp_stack_t *sctps) -{ - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_g_q != NULL) { - mutex_exit(&sctps->sctps_g_q_lock); - return; - } - if (sctps->sctps_g_q_creator == NULL) { - /* This thread will set it up */ - sctps->sctps_g_q_creator = curthread; - mutex_exit(&sctps->sctps_g_q_lock); - sctp_g_q_create(sctps); - mutex_enter(&sctps->sctps_g_q_lock); - ASSERT(sctps->sctps_g_q_creator == curthread); - sctps->sctps_g_q_creator = NULL; - cv_signal(&sctps->sctps_g_q_cv); - ASSERT(sctps->sctps_g_q != NULL); - mutex_exit(&sctps->sctps_g_q_lock); - return; - } - /* Everybody but the creator has to wait */ - if (sctps->sctps_g_q_creator != curthread) { - while (sctps->sctps_g_q == NULL) - cv_wait(&sctps->sctps_g_q_cv, &sctps->sctps_g_q_lock); - } - mutex_exit(&sctps->sctps_g_q_lock); -} - -#define IP "ip" - -#define SCTP6DEV "/devices/pseudo/sctp6@0:sctp6" - -/* - * Create a default sctp queue here instead of in strplumb - */ -void -sctp_g_q_create(sctp_stack_t *sctps) -{ - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - int rval; - cred_t *cr; - major_t IP_MAJ; - -#ifdef NS_DEBUG - (void) printf("sctp_g_q_create()for stack %d\n", - sctps->sctps_netstack->netstack_stackid); -#endif - - IP_MAJ = ddi_name_to_major(IP); - - ASSERT(sctps->sctps_g_q_creator == curthread); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = zone_get_kcred(netstackid_to_zoneid( - sctps->sctps_netstack->netstack_stackid)); - ASSERT(cr != NULL); - /* - * We set the sctp default queue to IPv6 because IPv4 falls - * back to IPv6 when it can't find a client, but - * IPv6 does not fall back to IPv4. 
- */ - error = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, cr, &lh, li); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: open of SCTP6DEV failed error %d\n", - error); -#endif - goto out; - } - - /* - * This ioctl causes the sctp framework to cache a pointer to - * this stream, so we don't want to close the stream after - * this operation. - * Use the kernel credentials that are for the zone we're in. - */ - error = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q, - (intptr_t)0, FKIOCTL, cr, &rval); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: ioctl SCTP_IOC_DEFAULT_Q failed " - "error %d\n", error); -#endif - goto out; - } - sctps->sctps_g_q_lh = lh; /* For sctp_g_q_inactive */ - lh = NULL; -out: - /* Close layered handles */ - if (li) - ldi_ident_release(li); - /* Keep cred around until _inactive needs it */ - sctps->sctps_g_q_cr = cr; -} - -/* - * Remove the sctp_default queue so that new connections will not find it. - * SCTP uses sctp_g_q for all transmission, so all sctp'ts implicitly - * refer to it. Hence have each one have a reference on sctp_g_q_ref! - * - * We decrement the refcnt added in sctp_g_q_create. Once all the - * sctp_t's which use the default go away, sctp_g_q_close will be called - * and close the sctp_g_q. Once sctp_g_q is closed, sctp_close() will drop the - * last reference count on the stack by calling netstack_rele(). - */ -void -sctp_g_q_destroy(sctp_stack_t *sctps) -{ - if (sctps->sctps_g_q == NULL) { - return; /* Nothing to cleanup */ - } - /* - * Keep sctps_g_q and sctps_gsctp until the last reference has - * dropped, since the output is always done using those. - * Need to decrement twice to take sctp_g_q_create and - * the gsctp reference into account so that sctp_g_q_inactive is called - * when all but the default queue remains. 
- */ -#ifdef NS_DEBUG - (void) printf("sctp_g_q_destroy: ref %d\n", - sctps->sctps_g_q_ref); -#endif - SCTP_G_Q_REFRELE(sctps); -} - -/* - * Called when last user (could be sctp_g_q_destroy) drops reference count - * using SCTP_G_Q_REFRELE. - * Run by sctp_q_q_inactive using a taskq. - */ -static void -sctp_g_q_close(void *arg) -{ - sctp_stack_t *sctps = arg; - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - cred_t *cr; - major_t IP_MAJ; - - IP_MAJ = ddi_name_to_major(IP); - - lh = sctps->sctps_g_q_lh; - if (lh == NULL) - return; /* Nothing to cleanup */ - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef NS_DEBUG - printf("sctp_g_q_inactive: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = sctps->sctps_g_q_cr; - sctps->sctps_g_q_cr = NULL; - ASSERT(cr != NULL); - - /* - * Make sure we can break the recursion when sctp_close decrements - * the reference count causing g_q_inactive to be called again. - */ - sctps->sctps_g_q_lh = NULL; - - /* close the default queue */ - (void) ldi_close(lh, FREAD|FWRITE, cr); - - /* Close layered handles */ - ldi_ident_release(li); - crfree(cr); - - ASSERT(sctps->sctps_g_q != NULL); - sctps->sctps_g_q = NULL; - /* - * Now free sctps_gsctp. - */ - ASSERT(sctps->sctps_gsctp != NULL); - sctp_closei_local(sctps->sctps_gsctp); - SCTP_CONDEMNED(sctps->sctps_gsctp); - SCTP_REFRELE(sctps->sctps_gsctp); - sctps->sctps_gsctp = NULL; -} - -/* - * Called when last sctp_t drops reference count using SCTP_G_Q_REFRELE. - * - * Have to ensure that the ldi routines are not used by an - * interrupt thread by using a taskq. 
- */ -void -sctp_g_q_inactive(sctp_stack_t *sctps) -{ - if (sctps->sctps_g_q_lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(sctps->sctps_g_q_ref == 0); - SCTP_G_Q_REFHOLD(sctps); /* Compensate for what g_q_destroy did */ - - if (servicing_interrupt()) { - (void) taskq_dispatch(sctp_taskq, sctp_g_q_close, - (void *) sctps, TQ_SLEEP); - } else { - sctp_g_q_close(sctps); - } -} - /* Run at module load time */ void sctp_ddi_g_init(void) @@ -1802,16 +1551,12 @@ sctp_ddi_g_init(void) /* Initialize tables used for CRC calculation */ sctp_crc32_init(); - sctp_taskq = taskq_create("sctp_taskq", 1, minclsyspri, 1, 1, - TASKQ_PREPOPULATE); - /* * We want to be informed each time a stack is created or * destroyed in the kernel, so we can maintain the * set of sctp_stack_t's. */ - netstack_register(NS_SCTP, sctp_stack_init, sctp_stack_shutdown, - sctp_stack_fini); + netstack_register(NS_SCTP, sctp_stack_init, NULL, sctp_stack_fini); } static void * @@ -1823,8 +1568,6 @@ sctp_stack_init(netstackid_t stackid, netstack_t *ns) sctps->sctps_netstack = ns; /* Initialize locks */ - mutex_init(&sctps->sctps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&sctps->sctps_g_q_cv, NULL, CV_DEFAULT, NULL); mutex_init(&sctps->sctps_g_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctps->sctps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); sctps->sctps_g_num_epriv_ports = SCTP_NUM_EPRIV_PORTS; @@ -1875,19 +1618,6 @@ sctp_ddi_g_destroy(void) sctp_ftsn_sets_fini(); netstack_unregister(NS_SCTP); - taskq_destroy(sctp_taskq); -} - -/* - * Shut down the SCTP stack instance. 
- */ -/* ARGSUSED */ -static void -sctp_stack_shutdown(netstackid_t stackid, void *arg) -{ - sctp_stack_t *sctps = (sctp_stack_t *)arg; - - sctp_g_q_destroy(sctps); } /* @@ -1922,8 +1652,6 @@ sctp_stack_fini(netstackid_t stackid, void *arg) mutex_destroy(&sctps->sctps_g_lock); mutex_destroy(&sctps->sctps_epriv_port_lock); - mutex_destroy(&sctps->sctps_g_q_lock); - cv_destroy(&sctps->sctps_g_q_cv); kmem_free(sctps, sizeof (*sctps)); } @@ -1934,7 +1662,8 @@ sctp_display_all(sctp_stack_t *sctps) sctp_t *sctp_walker; mutex_enter(&sctps->sctps_g_lock); - for (sctp_walker = sctps->sctps_gsctp; sctp_walker != NULL; + for (sctp_walker = list_head(&sctps->sctps_g_list); + sctp_walker != NULL; sctp_walker = (sctp_t *)list_next(&sctps->sctps_g_list, sctp_walker)) { (void) sctp_display(sctp_walker, NULL); @@ -2009,81 +1738,6 @@ sctp_inc_taskq(sctp_stack_t *sctps) } #ifdef DEBUG -uint32_t sendq_loop_cnt = 0; -uint32_t sendq_collision = 0; -uint32_t sendq_empty = 0; -#endif - -void -sctp_add_sendq(sctp_t *sctp, mblk_t *mp) -{ - mutex_enter(&sctp->sctp_sendq_lock); - if (sctp->sctp_sendq == NULL) { - sctp->sctp_sendq = mp; - sctp->sctp_sendq_tail = mp; - } else { - sctp->sctp_sendq_tail->b_next = mp; - sctp->sctp_sendq_tail = mp; - } - mutex_exit(&sctp->sctp_sendq_lock); -} - -void -sctp_process_sendq(sctp_t *sctp) -{ - mblk_t *mp; -#ifdef DEBUG - uint32_t loop_cnt = 0; -#endif - - mutex_enter(&sctp->sctp_sendq_lock); - if (sctp->sctp_sendq == NULL || sctp->sctp_sendq_sending) { -#ifdef DEBUG - if (sctp->sctp_sendq == NULL) - sendq_empty++; - else - sendq_collision++; -#endif - mutex_exit(&sctp->sctp_sendq_lock); - return; - } - sctp->sctp_sendq_sending = B_TRUE; - - /* - * Note that while we are in this loop, other thread can put - * new packets in the receive queue. We may be looping for - * quite a while. This is OK even for an interrupt thread. - * The reason is that SCTP should only able to send a limited - * number of packets out in a burst. 
So the number of times - * we go through this loop should not be many. - */ - while ((mp = sctp->sctp_sendq) != NULL) { - sctp->sctp_sendq = mp->b_next; - ASSERT(sctp->sctp_connp->conn_ref > 0); - mutex_exit(&sctp->sctp_sendq_lock); - mp->b_next = NULL; - CONN_INC_REF(sctp->sctp_connp); - mp->b_flag |= MSGHASREF; - /* If we don't have sctp_current, default to IPv4 */ - IP_PUT(mp, sctp->sctp_connp, sctp->sctp_current == NULL ? - B_TRUE : sctp->sctp_current->isv4); - BUMP_LOCAL(sctp->sctp_opkts); -#ifdef DEBUG - loop_cnt++; -#endif - mutex_enter(&sctp->sctp_sendq_lock); - } - - sctp->sctp_sendq_tail = NULL; - sctp->sctp_sendq_sending = B_FALSE; -#ifdef DEBUG - if (loop_cnt > sendq_loop_cnt) - sendq_loop_cnt = loop_cnt; -#endif - mutex_exit(&sctp->sctp_sendq_lock); -} - -#ifdef DEBUG uint32_t recvq_loop_cnt = 0; uint32_t recvq_call = 0; #endif @@ -2144,10 +1798,19 @@ sctp_find_next_tq(sctp_t *sctp) * If the try_harder argument is B_TRUE, this routine sctp_find_next_tq() * will try very hard to dispatch the task. Refer to the comment * for that routine on how it does that. + * + * On failure the message has been freed i.e., this routine always consumes the + * message. It bumps ipIfStatsInDiscards and and uses ip_drop_input to drop. */ -boolean_t -sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) +void +sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock, + ip_recv_attr_t *ira) { + mblk_t *attrmp; + ip_stack_t *ipst = sctp->sctp_sctps->sctps_netstack->netstack_ip; + + ASSERT(ira->ira_ill == NULL); + if (!caller_hold_lock) mutex_enter(&sctp->sctp_recvq_lock); @@ -2157,12 +1820,28 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) if (!sctp_find_next_tq(sctp)) { if (!caller_hold_lock) mutex_exit(&sctp->sctp_recvq_lock); - return (B_FALSE); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + return; } /* Make sure the sctp_t will not go away. 
*/ SCTP_REFHOLD(sctp); } + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + if (!caller_hold_lock) + mutex_exit(&sctp->sctp_recvq_lock); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + mp = attrmp; + if (sctp->sctp_recvq == NULL) { sctp->sctp_recvq = mp; sctp->sctp_recvq_tail = mp; @@ -2173,7 +1852,6 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) if (!caller_hold_lock) mutex_exit(&sctp->sctp_recvq_lock); - return (B_TRUE); } static void @@ -2181,10 +1859,10 @@ sctp_process_recvq(void *arg) { sctp_t *sctp = (sctp_t *)arg; mblk_t *mp; - mblk_t *ipsec_mp; #ifdef DEBUG uint32_t loop_cnt = 0; #endif + ip_recv_attr_t iras; #ifdef _BIG_ENDIAN #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) @@ -2204,16 +1882,31 @@ sctp_process_recvq(void *arg) * quite a while. */ while ((mp = sctp->sctp_recvq) != NULL) { + mblk_t *data_mp; + sctp->sctp_recvq = mp->b_next; mutex_exit(&sctp->sctp_recvq_lock); mp->b_next = NULL; #ifdef DEBUG loop_cnt++; #endif - ipsec_mp = mp->b_prev; mp->b_prev = NULL; - sctp_input_data(sctp, mp, ipsec_mp); + data_mp = mp->b_cont; + mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(mp, &iras)) { + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); + freemsg(mp); + ira_cleanup(&iras, B_TRUE); + continue; + } + + if (iras.ira_flags & IRAF_ICMP_ERROR) + sctp_icmp_error(sctp, data_mp); + else + sctp_input_data(sctp, data_mp, &iras); + + ira_cleanup(&iras, B_TRUE); mutex_enter(&sctp->sctp_recvq_lock); } @@ -2224,8 +1917,6 @@ sctp_process_recvq(void *arg) WAKE_SCTP(sctp); - /* We may have sent something when processing the receive queue. 
*/ - sctp_process_sendq(sctp); #ifdef DEBUG if (loop_cnt > recvq_loop_cnt) recvq_loop_cnt = loop_cnt; @@ -2238,18 +1929,32 @@ sctp_process_recvq(void *arg) static int sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags) { - conn_t *sctp_connp = (conn_t *)buf; - sctp_t *sctp = (sctp_t *)&sctp_connp[1]; + conn_t *connp = (conn_t *)buf; + sctp_t *sctp = (sctp_t *)&connp[1]; + bzero(connp, sizeof (conn_t)); bzero(buf, (char *)&sctp[1] - (char *)buf); - sctp->sctp_connp = sctp_connp; mutex_init(&sctp->sctp_reflock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctp->sctp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctp->sctp_recvq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&sctp->sctp_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&sctp->sctp_sendq_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); + connp->conn_flags = IPCL_SCTPCONN; + connp->conn_proto = IPPROTO_SCTP; + connp->conn_sctp = sctp; + sctp->sctp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) { + return (ENOMEM); + } + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2257,14 +1962,13 @@ sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags) static void sctp_conn_cache_destructor(void *buf, void *cdrarg) { - conn_t *sctp_connp = (conn_t *)buf; - sctp_t *sctp = (sctp_t *)&sctp_connp[1]; + conn_t *connp = (conn_t *)buf; + sctp_t *sctp = (sctp_t *)&connp[1]; + ASSERT(sctp->sctp_connp == connp); ASSERT(!MUTEX_HELD(&sctp->sctp_lock)); ASSERT(!MUTEX_HELD(&sctp->sctp_reflock)); ASSERT(!MUTEX_HELD(&sctp->sctp_recvq_lock)); - ASSERT(!MUTEX_HELD(&sctp->sctp_sendq_lock)); - ASSERT(!MUTEX_HELD(&sctp->sctp_connp->conn_lock)); ASSERT(sctp->sctp_conn_hash_next == NULL); 
ASSERT(sctp->sctp_conn_hash_prev == NULL); @@ -2317,16 +2021,6 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg) ASSERT(sctp->sctp_recvq_tail == NULL); ASSERT(sctp->sctp_recvq_tq == NULL); - ASSERT(sctp->sctp_sendq == NULL); - ASSERT(sctp->sctp_sendq_tail == NULL); - ASSERT(sctp->sctp_sendq_sending == B_FALSE); - - ASSERT(sctp->sctp_ipp_hopopts == NULL); - ASSERT(sctp->sctp_ipp_rtdstopts == NULL); - ASSERT(sctp->sctp_ipp_rthdr == NULL); - ASSERT(sctp->sctp_ipp_dstopts == NULL); - ASSERT(sctp->sctp_ipp_pathmtu == NULL); - /* * sctp_pad_mp can be NULL if the memory allocation fails * in sctp_init_values() and the conn_t is freed. @@ -2340,8 +2034,18 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg) mutex_destroy(&sctp->sctp_lock); mutex_destroy(&sctp->sctp_recvq_lock); cv_destroy(&sctp->sctp_cv); - mutex_destroy(&sctp->sctp_sendq_lock); + mutex_destroy(&connp->conn_lock); + cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } static void @@ -2361,31 +2065,53 @@ sctp_conn_cache_fini() void sctp_conn_init(conn_t *connp) { - connp->conn_flags = IPCL_SCTPCONN; + ASSERT(connp->conn_flags == IPCL_SCTPCONN); connp->conn_rq = connp->conn_wq = NULL; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - connp->conn_ulp = IPPROTO_SCTP; + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU; + + ASSERT(connp->conn_proto == IPPROTO_SCTP); + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); connp->conn_state_flags |= CONN_INCIPIENT; - mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); + + ASSERT(connp->conn_sctp != NULL); + + /* + * Register sctp_notify to listen to capability changes detected by IP. 
+ * This upcall is made in the context of the call to conn_ip_output + * thus it holds whatever locks sctp holds across conn_ip_output. + */ + connp->conn_ixa->ixa_notify = sctp_notify; + connp->conn_ixa->ixa_notify_cookie = connp->conn_sctp; } static void sctp_conn_clear(conn_t *connp) { /* Clean up conn_t stuff */ - if (connp->conn_latch != NULL) - IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); - if (connp->conn_policy != NULL) + if (connp->conn_latch != NULL) { + IPLATCH_REFRELE(connp->conn_latch); + connp->conn_latch = NULL; + } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_policy != NULL) { IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); - if (connp->conn_ipsec_opt_mp != NULL) + connp->conn_policy = NULL; + } + if (connp->conn_ipsec_opt_mp != NULL) { freemsg(connp->conn_ipsec_opt_mp); - if (connp->conn_cred != NULL) - crfree(connp->conn_cred); - if (connp->conn_effective_cred != NULL) - crfree(connp->conn_effective_cred); - mutex_destroy(&connp->conn_lock); - cv_destroy(&connp->conn_cv); + connp->conn_ipsec_opt_mp = NULL; + } netstack_rele(connp->conn_netstack); - bzero(connp, sizeof (struct conn_s)); + connp->conn_netstack = NULL; + + /* Leave conn_ixa and other constructed fields in place */ + ipcl_conn_cleanup(connp); } diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c index b347d30dda..306362211d 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.c +++ b/usr/src/uts/common/inet/sctp/sctp_addr.c @@ -41,6 +41,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ip_ire.h> #include <inet/ip_if.h> #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> @@ -236,6 +237,7 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep) int error 
= 0; sctp_stack_t *sctps = sctp->sctp_sctps; boolean_t isv6; + conn_t *connp = sctp->sctp_connp; rw_enter(&sctps->sctps_g_ipifs_lock, RW_READER); for (i = 0; i < SCTP_IPIF_HASH; i++) { @@ -250,8 +252,8 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep) !SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) || SCTP_IS_ADDR_UNSPEC(!isv6, sctp_ipif->sctp_ipif_saddr) || - (sctp->sctp_ipversion == IPV4_VERSION && isv6) || - (sctp->sctp_connp->conn_ipv6_v6only && !isv6)) { + (connp->conn_family == AF_INET && isv6) || + (connp->conn_ipv6_v6only && !isv6)) { rw_exit(&sctp_ipif->sctp_ipif_lock); sctp_ipif = list_next( &sctps->sctps_g_ipifs[i].sctp_ipif_list, @@ -303,6 +305,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, boolean_t check_addrs = B_FALSE; boolean_t check_lport = B_FALSE; uchar_t *p = list; + conn_t *connp = sctp->sctp_connp; /* * Need to check for port and address depending on the state. @@ -325,11 +328,11 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, boolean_t lookup_saddr = B_TRUE; uint_t ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + cnt; if (sin4->sin_family != AF_INET || (check_lport && - sin4->sin_port != sctp->sctp_lport)) { + sin4->sin_port != connp->conn_lport)) { err = EINVAL; goto free_ret; } @@ -351,14 +354,14 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, case AF_INET6: sin6 = (struct sockaddr_in6 *)addrs + cnt; if (sin6->sin6_family != AF_INET6 || (check_lport && - sin6->sin6_port != sctp->sctp_lport)) { + sin6->sin6_port != connp->conn_lport)) { err = EINVAL; goto free_ret; } addr = sin6->sin6_addr; /* Contains the interface index */ ifindex = sin6->sin6_scope_id; - if (sctp->sctp_connp->conn_ipv6_v6only && + if (connp->conn_ipv6_v6only && IN6_IS_ADDR_V4MAPPED(&addr)) { err = EAFNOSUPPORT; goto free_ret; @@ -382,7 +385,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, } if 
(lookup_saddr) { ipif = sctp_lookup_ipif_addr(&addr, B_TRUE, - sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones, + IPCL_ZONEID(connp), !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); if (ipif == NULL) { /* Address not in the list */ @@ -495,6 +498,8 @@ sctp_ipif_hash_insert(sctp_t *sctp, sctp_ipif_t *ipif, int sleep, /* * Given a source address, walk through the peer address list to see * if the source address is being used. If it is, reset that. + * A cleared saddr will then make sctp_make_mp lookup the destination again + * and as part of that look for a new source. */ static void sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr) @@ -504,10 +509,6 @@ sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr) for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { if (!IN6_ARE_ADDR_EQUAL(&fp->saddr, saddr)) continue; - if (fp->ire != NULL) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } V6_SET_ZERO(fp->saddr); } } @@ -874,8 +875,8 @@ sctp_update_saddrs(sctp_ipif_t *oipif, sctp_ipif_t *nipif, int idx, sctp_saddr_ipif_t *sobj; int count; - sctp = sctps->sctps_gsctp; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL && oipif->sctp_ipif_refcnt > 0) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned || @@ -1202,7 +1203,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); sctp_ipif->sctp_ipif_state = SCTP_IPIFS_UP; - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); sctp_chk_and_updt_saddr(hindex, sctp_ipif, @@ -1214,7 +1214,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_zoneid = ipif->ipif_zoneid; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); @@ -1226,7 
+1225,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); sctp_ipif->sctp_ipif_state = SCTP_IPIFS_DOWN; - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); @@ -1277,6 +1275,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, in6_addr_t addr; sctp_ipif_t *sctp_ipif; int ifindex = 0; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp->sctp_nsaddrs >= addcnt); @@ -1288,7 +1287,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, } for (cnt = 0; cnt < addcnt; cnt++) { - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + cnt; IN6_INADDR_TO_V4MAPPED(&sin4->sin_addr, &addr); @@ -1301,7 +1300,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, break; } sctp_ipif = sctp_lookup_ipif_addr(&addr, B_FALSE, - sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones, + IPCL_ZONEID(connp), !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); ASSERT(sctp_ipif != NULL); sctp_ipif_hash_remove(sctp, sctp_ipif); @@ -1356,10 +1355,10 @@ int sctp_saddr_add_addr(sctp_t *sctp, in6_addr_t *addr, uint_t ifindex) { sctp_ipif_t *sctp_ipif; + conn_t *connp = sctp->sctp_connp; - sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, sctp->sctp_zoneid, - !sctp->sctp_connp->conn_allzones, ifindex, 0, B_TRUE, - sctp->sctp_sctps); + sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, IPCL_ZONEID(connp), + !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); if (sctp_ipif == NULL) return (EINVAL); @@ -1386,6 +1385,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete, int scanned = 0; int naddr; int nsaddr; + conn_t *connp = sctp->sctp_connp; ASSERT(!sctp->sctp_loopback && !sctp->sctp_linklocal && supp_af != 0); @@ -1393,7 +1393,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete, * Irregardless of the 
supported address in the INIT, v4 * must be supported. */ - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) supp_af = PARM_SUPP_V4; nsaddr = sctp->sctp_nsaddrs; @@ -1501,13 +1501,15 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt) int l; sctp_saddr_ipif_t *obj; sctp_t *sctp = (sctp_t *)conn; - int family = sctp->sctp_family; + conn_t *connp = sctp->sctp_connp; + int family = connp->conn_family; int max = *addrcnt; size_t added = 0; struct sockaddr_in6 *sin6; struct sockaddr_in *sin4; int scanned = 0; boolean_t skip_lback = B_FALSE; + ip_xmit_attr_t *ixa = connp->conn_ixa; if (sctp->sctp_nsaddrs == 0) return (EINVAL); @@ -1543,15 +1545,27 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt) case AF_INET: sin4 = (struct sockaddr_in *)myaddrs + added; sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); break; case AF_INET6: sin6 = (struct sockaddr_in6 *)myaddrs + added; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; sin6->sin6_addr = addr; + /* + * Note that flowinfo is only returned for + * getpeername just like for TCP and UDP. + */ + sin6->sin6_flowinfo = 0; + + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = ixa->ixa_scopeid; + else + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; break; } added++; @@ -1700,6 +1714,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt, uchar_t *p; int err = 0; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; *addrlist = NULL; *size = 0; @@ -1707,7 +1722,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt, /* * Create a list of sockaddr_in[6] structs using the input list. 
*/ - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { *size = sizeof (struct sockaddr_in) * *addrcnt; *addrlist = kmem_zalloc(*size, KM_SLEEP); p = *addrlist; @@ -1772,7 +1787,7 @@ get_all_addrs: * We allocate upfront so that the clustering module need to bother * re-sizing the list. */ - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { *size = sizeof (struct sockaddr_in) * sctps->sctps_g_ipifs_count; } else { @@ -1805,7 +1820,7 @@ get_all_addrs: SCTP_IS_IPIF_LOOPBACK(sctp_ipif) || SCTP_IS_IPIF_LINKLOCAL(sctp_ipif) || !SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) || - (sctp->sctp_ipversion == IPV4_VERSION && + (connp->conn_family == AF_INET && sctp_ipif->sctp_ipif_isv6) || (sctp->sctp_connp->conn_ipv6_v6only && !sctp_ipif->sctp_ipif_isv6)) { @@ -1816,7 +1831,7 @@ get_all_addrs: continue; } rw_exit(&sctp_ipif->sctp_ipif_lock); - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { s4 = (struct sockaddr_in *)p; IN6_V4MAPPED_TO_INADDR(&addr, &s4->sin_addr); s4->sin_family = AF_INET; diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.h b/usr/src/uts/common/inet/sctp/sctp_addr.h index 9408c452d4..35e8300958 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.h +++ b/usr/src/uts/common/inet/sctp/sctp_addr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SCTP_ADDR_H #define _SCTP_ADDR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/list.h> #include <sys/zone.h> #include <inet/ip.h> @@ -54,7 +52,6 @@ extern "C" { typedef struct sctp_ipif_s { list_node_t sctp_ipifs; /* Used by the global list */ struct sctp_ill_s *sctp_ipif_ill; - uint_t sctp_ipif_mtu; uint_t sctp_ipif_id; in6_addr_t sctp_ipif_saddr; int sctp_ipif_state; diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.c b/usr/src/uts/common/inet/sctp/sctp_asconf.c index 859faab0b8..fd7e34f7ba 100644 --- a/usr/src/uts/common/inet/sctp/sctp_asconf.c +++ b/usr/src/uts/common/inet/sctp/sctp_asconf.c @@ -571,7 +571,8 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) * it is the clustering module's responsibility to free the lists. */ if (cl_sctp_assoc_change != NULL) { - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize, + (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family, + alist, asize, acount, dlist, dsize, dcount, SCTP_CL_PADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -586,9 +587,10 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr_len); else ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr6_len); - sctp_set_iplen(sctp, hmp); - sctp_add_sendq(sctp, hmp); + sctp_set_iplen(sctp, hmp, fp->ixa); + (void) conn_ip_output(hmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp_validate_peer(sctp); } @@ -809,7 +811,7 @@ sctp_input_asconf_ack(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) mp->b_prev = NULL; ainfo->sctp_cl_alist = NULL; ainfo->sctp_cl_dlist = NULL; - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, + (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family, alist, ainfo->sctp_cl_asize, acount, dlist, ainfo->sctp_cl_dsize, dcount, SCTP_CL_LADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -1010,12 +1012,13 @@ sctp_wput_asconf(sctp_t 
*sctp, sctp_faddr_t *fp) fp->suna += MBLKL(mp); /* Attach the header and send the chunk */ ipmp->b_cont = mp; - sctp_set_iplen(sctp, ipmp); sctp->sctp_cchunk_pend = 1; SCTP_SET_SENT_FLAG(sctp->sctp_cxmit_list); SCTP_SET_CHUNK_DEST(sctp->sctp_cxmit_list, fp); - sctp_add_sendq(sctp, ipmp); + sctp_set_iplen(sctp, ipmp, fp->ixa); + (void) conn_ip_output(ipmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); SCTP_FADDR_RC_TIMER_RESTART(sctp, fp, fp->rto); #undef SCTP_SET_SENT_FLAG } @@ -1418,6 +1421,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt) uint16_t type = htons(PARM_ADD_IP); boolean_t v4mapped = B_FALSE; sctp_cl_ainfo_t *ainfo = NULL; + conn_t *connp = sctp->sctp_connp; /* Does the peer understand ASCONF and Add-IP? */ if (!sctp->sctp_understands_asconf || !sctp->sctp_understands_addip) @@ -1453,7 +1457,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt) * o Must be part of the association */ for (i = 0; i < cnt; i++) { - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; v4mapped = B_TRUE; @@ -1538,6 +1542,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, uchar_t *p = ulist; boolean_t check_lport = B_FALSE; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; /* Does the peer understand ASCONF and Add-IP? 
*/ if (sctp->sctp_state <= SCTPS_LISTEN || !sctps->sctps_addip_enabled || @@ -1577,10 +1582,11 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, for (i = 0; i < cnt; i++) { ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; - if (check_lport && sin4->sin_port != sctp->sctp_lport) { + if (check_lport && + sin4->sin_port != connp->conn_lport) { error = EINVAL; goto fail; } @@ -1591,7 +1597,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, case AF_INET6: sin6 = (struct sockaddr_in6 *)addrs + i; if (check_lport && - sin6->sin6_port != sctp->sctp_lport) { + sin6->sin6_port != connp->conn_lport) { error = EINVAL; goto fail; } @@ -1675,7 +1681,7 @@ fail: for (i = 0; i < addrcnt; i++) { ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; IN6_INADDR_TO_V4MAPPED(&(sin4->sin_addr), &addr); @@ -1697,7 +1703,7 @@ fail: } int -sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen) +sctp_set_peerprim(sctp_t *sctp, const void *inp) { const struct sctp_setprim *prim = inp; const struct sockaddr_storage *ss; @@ -1717,9 +1723,6 @@ sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen) return (EOPNOTSUPP); } - if (inlen < sizeof (*prim)) - return (EINVAL); - /* Don't do anything if we are not connected */ if (sctp->sctp_state != SCTPS_ESTABLISHED) return (EINVAL); diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.h b/usr/src/uts/common/inet/sctp/sctp_asconf.h index 8940aa00bc..221172d7bb 100644 --- a/usr/src/uts/common/inet/sctp/sctp_asconf.h +++ b/usr/src/uts/common/inet/sctp/sctp_asconf.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_SCTP_SCTP_ASCONF_H #define _INET_SCTP_SCTP_ASCONF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,7 +55,7 @@ extern int sctp_del_ip(sctp_t *, const void *, uint32_t, uchar_t *, size_t); extern void sctp_asconf_free_cxmit(sctp_t *, sctp_chunk_hdr_t *); extern void sctp_input_asconf(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *); extern void sctp_input_asconf_ack(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *); -extern int sctp_set_peerprim(sctp_t *, const void *, uint_t); +extern int sctp_set_peerprim(sctp_t *, const void *); extern void sctp_wput_asconf(sctp_t *, sctp_faddr_t *); #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/sctp/sctp_bind.c b/usr/src/uts/common/inet/sctp/sctp_bind.c index c0c1c7556e..9e0b0e7418 100644 --- a/usr/src/uts/common/inet/sctp/sctp_bind.c +++ b/usr/src/uts/common/inet/sctp/sctp_bind.c @@ -56,6 +56,7 @@ static int sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) { sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; /* * Get a valid port (within the anonymous range and should not @@ -68,7 +69,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) if (*requested_port == 0) { *requested_port = sctp_update_next_port( sctps->sctps_next_port_to_try, - crgetzone(sctp->sctp_credp), sctps); + crgetzone(connp->conn_cred), sctps); if (*requested_port == 0) return (EACCES); *user_specified = 0; @@ -101,7 +102,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) * sctp_bind() should take a cred_t argument so that * we can use it here. 
*/ - if (secpolicy_net_privaddr(sctp->sctp_credp, + if (secpolicy_net_privaddr(connp->conn_cred, *requested_port, IPPROTO_SCTP) != 0) { dprint(1, ("sctp_bind(x): no prive for port %d", @@ -120,6 +121,7 @@ sctp_listen(sctp_t *sctp) { sctp_tf_t *tf; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; RUN_SCTP(sctp); /* @@ -138,7 +140,7 @@ sctp_listen(sctp_t *sctp) int ret; bzero(&ss, sizeof (ss)); - ss.ss_family = sctp->sctp_family; + ss.ss_family = connp->conn_family; WAKE_SCTP(sctp); if ((ret = sctp_bind(sctp, (struct sockaddr *)&ss, @@ -147,12 +149,18 @@ sctp_listen(sctp_t *sctp) RUN_SCTP(sctp) } + /* Cache things in the ixa without any refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); + sctp->sctp_state = SCTPS_LISTEN; (void) random_get_pseudo_bytes(sctp->sctp_secret, SCTP_SECRET_LEN); sctp->sctp_last_secret_update = lbolt64; bzero(sctp->sctp_old_secret, SCTP_SECRET_LEN); tf = &sctps->sctps_listen_fanout[SCTP_LISTEN_HASH( - ntohs(sctp->sctp_lport))]; + ntohs(connp->conn_lport))]; sctp_listen_hash_insert(tf, sctp); WAKE_SCTP(sctp); return (0); @@ -170,6 +178,10 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len) in_port_t requested_port; in_port_t allocated_port; int err = 0; + conn_t *connp = sctp->sctp_connp; + uint_t scope_id; + sin_t *sin; + sin6_t *sin6; ASSERT(sctp != NULL); @@ -188,25 +200,35 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len) switch (sa->sa_family) { case AF_INET: + sin = (sin_t *)sa; if (len < sizeof (struct sockaddr_in) || - sctp->sctp_family == AF_INET6) { + connp->conn_family == AF_INET6) { err = EINVAL; goto done; } - requested_port = ntohs(((struct sockaddr_in *)sa)->sin_port); + requested_port = ntohs(sin->sin_port); break; case AF_INET6: + sin6 = (sin6_t *)sa; if (len < sizeof (struct sockaddr_in6) || - sctp->sctp_family == AF_INET) { + 
connp->conn_family == AF_INET) { err = EINVAL; goto done; } - requested_port = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + requested_port = ntohs(sin6->sin6_port); /* Set the flowinfo. */ - sctp->sctp_ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (((struct sockaddr_in6 *)sa)->sin6_flowinfo & - ~IPV6_VERS_AND_FLOW_MASK); + connp->conn_flowinfo = + sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK; + + scope_id = sin6->sin6_scope_id; + if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scope_id; + connp->conn_incoming_ifindex = scope_id; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } break; default: err = EAFNOSUPPORT; @@ -247,7 +269,7 @@ sctp_bindx(sctp_t *sctp, const void *addrs, int addrcnt, int bindop) switch (bindop) { case SCTP_BINDX_ADD_ADDR: return (sctp_bind_add(sctp, addrs, addrcnt, B_FALSE, - sctp->sctp_lport)); + sctp->sctp_connp->conn_lport)); case SCTP_BINDX_REM_ADDR: return (sctp_bind_del(sctp, addrs, addrcnt, B_FALSE)); default: @@ -265,6 +287,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, int err = 0; boolean_t do_asconf = B_FALSE; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (!caller_hold_lock) RUN_SCTP(sctp); @@ -329,7 +352,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, return (err); } ASSERT(addrlist != NULL); - (*cl_sctp_check_addrs)(sctp->sctp_family, port, &addrlist, + (*cl_sctp_check_addrs)(connp->conn_family, port, &addrlist, size, &addrcnt, unspec == 1); if (addrcnt == 0) { /* We free the list */ @@ -345,8 +368,8 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, err = sctp_valid_addr_list(sctp, addrlist, addrcnt, llist, lsize); if (err == 0 && do_listen) { - (*cl_sctp_listen)(sctp->sctp_family, llist, - addrcnt, sctp->sctp_lport); + 
(*cl_sctp_listen)(connp->conn_family, llist, + addrcnt, connp->conn_lport); /* list will be freed by the clustering module */ } else if (err != 0 && llist != NULL) { kmem_free(llist, lsize); @@ -373,8 +396,6 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, } if (!caller_hold_lock) WAKE_SCTP(sctp); - if (do_asconf) - sctp_process_sendq(sctp); return (0); } @@ -390,6 +411,7 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt, uchar_t *ulist = NULL; size_t usize = 0; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (!caller_hold_lock) RUN_SCTP(sctp); @@ -439,14 +461,12 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt, /* ulist will be non-NULL only if cl_sctp_unlisten is non-NULL */ if (ulist != NULL) { ASSERT(cl_sctp_unlisten != NULL); - (*cl_sctp_unlisten)(sctp->sctp_family, ulist, addrcnt, - sctp->sctp_lport); + (*cl_sctp_unlisten)(connp->conn_family, ulist, addrcnt, + connp->conn_lport); /* ulist will be freed by the clustering module */ } if (!caller_hold_lock) WAKE_SCTP(sctp); - if (do_asconf) - sctp_process_sendq(sctp); return (error); } @@ -473,9 +493,10 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, int count = 0; /* maximum number of times to run around the loop */ int loopmax; - zoneid_t zoneid = sctp->sctp_zoneid; - zone_t *zone = crgetzone(sctp->sctp_credp); sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; + zone_t *zone = crgetzone(connp->conn_cred); + zoneid_t zoneid = connp->conn_zoneid; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -523,8 +544,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, mutex_enter(&tbf->tf_lock); for (lsctp = tbf->tf_sctp; lsctp != NULL; lsctp = lsctp->sctp_bind_hash) { + conn_t *lconnp = lsctp->sctp_connp; - if (lport != lsctp->sctp_lport || + if (lport != lconnp->conn_lport || lsctp->sctp_state < SCTPS_BOUND) continue; @@ -534,14 +556,14 @@ 
sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. */ - if (lsctp->sctp_zoneid != zoneid && - lsctp->sctp_mac_mode == CONN_MAC_DEFAULT && - sctp->sctp_mac_mode == CONN_MAC_DEFAULT) + if (lconnp->conn_zoneid != zoneid && + lconnp->conn_mac_mode == CONN_MAC_DEFAULT && + connp->conn_mac_mode == CONN_MAC_DEFAULT) continue; addrcmp = sctp_compare_saddrs(sctp, lsctp); if (addrcmp != SCTP_ADDR_DISJOINT) { - if (!sctp->sctp_reuseaddr) { + if (!connp->conn_reuseaddr) { /* in use */ break; } else if (lsctp->sctp_state == SCTPS_BOUND || @@ -563,10 +585,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { - conn_t *connp = sctp->sctp_connp; - if (is_system_labeled()) { mlp_type_t addrtype, mlptype; + uint_t ipversion; /* * On a labeled system we must check the type @@ -575,11 +596,16 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * and that the user's requested binding * is permitted. */ + if (connp->conn_family == AF_INET) + ipversion = IPV4_VERSION; + else + ipversion = IPV6_VERSION; + addrtype = tsol_mlp_addr_type( connp->conn_allzones ? ALL_ZONES : zone->zone_id, - sctp->sctp_ipversion, - sctp->sctp_ipversion == IPV4_VERSION ? + ipversion, + connp->conn_family == AF_INET ? (void *)&sctp->sctp_ipha->ipha_src : (void *)&sctp->sctp_ip6h->ip6_src, sctps->sctps_netstack->netstack_ip); @@ -631,8 +657,7 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * number. 
*/ sctp->sctp_state = SCTPS_BOUND; - sctp->sctp_lport = lport; - sctp->sctp_sctph->sh_sport = lport; + connp->conn_lport = lport; ASSERT(&sctps->sctps_bind_fanout[ SCTP_BIND_HASH(port)] == tbf); diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c index 3486ba1150..b518eb3981 100644 --- a/usr/src/uts/common/inet/sctp/sctp_common.c +++ b/usr/src/uts/common/inet/sctp/sctp_common.c @@ -44,6 +44,8 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/mib2.h> #include <inet/nd.h> #include <inet/optcom.h> @@ -57,7 +59,7 @@ static struct kmem_cache *sctp_kmem_faddr_cache; static void sctp_init_faddr(sctp_t *, sctp_faddr_t *, in6_addr_t *, mblk_t *); -/* Set the source address. Refer to comments in sctp_get_ire(). */ +/* Set the source address. Refer to comments in sctp_get_dest(). */ void sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) { @@ -68,7 +70,7 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) /* * If there is no source address avaialble, mark this peer address * as unreachable for now. When the heartbeat timer fires, it will - * call sctp_get_ire() to re-check if there is any source address + * call sctp_get_dest() to re-check if there is any source address * available. */ if (!addr_set) @@ -76,25 +78,31 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) } /* - * Call this function to update the cached IRE of a peer addr fp. + * Call this function to get information about a peer addr fp. + * + * Uses ip_attr_connect to avoid explicit use of ire and source address + * selection. 
*/ void -sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) +sctp_get_dest(sctp_t *sctp, sctp_faddr_t *fp) { - ire_t *ire; - ipaddr_t addr4; in6_addr_t laddr; + in6_addr_t nexthop; sctp_saddr_ipif_t *sp; int hdrlen; - ts_label_t *tsl; sctp_stack_t *sctps = sctp->sctp_sctps; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; + conn_t *connp = sctp->sctp_connp; + iulp_t uinfo; + uint_t pmtu; + int error; + uint32_t flags = IPDF_VERIFY_DST | IPDF_IPSEC | + IPDF_SELECT_SRC | IPDF_UNIQUE_DCE; - /* Remove the previous cache IRE */ - if ((ire = fp->ire) != NULL) { - IRE_REFRELE_NOTR(ire); - fp->ire = NULL; - } + /* + * Tell sctp_make_mp it needs to call us again should we not + * complete and set the saddr. + */ + fp->saddr = ipv6_all_zeros; /* * If this addr is not reachable, mark it as unconfirmed for now, the @@ -105,29 +113,28 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) fp->state = SCTP_FADDRS_UNCONFIRMED; } - tsl = crgetlabel(CONN_CRED(sctp->sctp_connp)); + /* + * Socket is connected - enable PMTU discovery. 
+ */ + if (!sctps->sctps_ignore_path_mtu) + fp->ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; - if (fp->isv4) { - IN6_V4MAPPED_TO_IPADDR(&fp->faddr, addr4); - ire = ire_cache_lookup(addr4, sctp->sctp_zoneid, tsl, ipst); - if (ire != NULL) - IN6_IPADDR_TO_V4MAPPED(ire->ire_src_addr, &laddr); - } else { - ire = ire_cache_lookup_v6(&fp->faddr, sctp->sctp_zoneid, tsl, - ipst); - if (ire != NULL) - laddr = ire->ire_src_addr_v6; - } + ip_attr_nexthop(&connp->conn_xmit_ipp, fp->ixa, &fp->faddr, + &nexthop); - if (ire == NULL) { - dprint(3, ("ire2faddr: no ire for %x:%x:%x:%x\n", + laddr = fp->saddr; + error = ip_attr_connect(connp, fp->ixa, &laddr, &fp->faddr, &nexthop, + connp->conn_fport, &laddr, &uinfo, flags); + + if (error != 0) { + dprint(3, ("sctp_get_dest: no ire for %x:%x:%x:%x\n", SCTP_PRINTADDR(fp->faddr))); /* * It is tempting to just leave the src addr * unspecified and let IP figure it out, but we * *cannot* do this, since IP may choose a src addr * that is not part of this association... unless - * this sctp has bound to all addrs. So if the ire + * this sctp has bound to all addrs. So if the dest * lookup fails, try to find one in our src addr * list, unless the sctp has bound to all addrs, in * which case we change the src addr to unspec. 
@@ -144,56 +151,44 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) return; goto check_current; } + ASSERT(fp->ixa->ixa_ire != NULL); + ASSERT(!(fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))); + + if (!sctp->sctp_loopback) + sctp->sctp_loopback = uinfo.iulp_loopback; /* Make sure the laddr is part of this association */ - if ((sp = sctp_saddr_lookup(sctp, &ire->ire_ipif->ipif_v6lcl_addr, - 0)) != NULL && !sp->saddr_ipif_dontsrc) { + if ((sp = sctp_saddr_lookup(sctp, &laddr, 0)) != NULL && + !sp->saddr_ipif_dontsrc) { if (sp->saddr_ipif_unconfirmed == 1) sp->saddr_ipif_unconfirmed = 0; + /* We did IPsec policy lookup for laddr already */ fp->saddr = laddr; } else { - dprint(2, ("ire2faddr: src addr is not part of assc\n")); + dprint(2, ("sctp_get_dest: src addr is not part of assoc " + "%x:%x:%x:%x\n", SCTP_PRINTADDR(laddr))); /* * Set the src to the first saddr and hope for the best. - * Note that we will still do the ire caching below. - * Otherwise, whenever we send a packet, we need to do - * the ire lookup again and still may not get the correct - * source address. Note that this case should very seldomly + * Note that this case should very seldomly * happen. One scenario this can happen is an app * explicitly bind() to an address. But that address is * not the preferred source address to send to the peer. */ sctp_set_saddr(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { - IRE_REFRELE(ire); return; } } /* - * Note that ire_cache_lookup_*() returns an ire with the tracing - * bits enabled. This requires the thread holding the ire also - * do the IRE_REFRELE(). Thus we need to do IRE_REFHOLD_NOTR() - * and then IRE_REFRELE() the ire here to make the tracing bits - * work. - */ - IRE_REFHOLD_NOTR(ire); - IRE_REFRELE(ire); - - /* Cache the IRE */ - fp->ire = ire; - if (fp->ire->ire_type == IRE_LOOPBACK && !sctp->sctp_loopback) - sctp->sctp_loopback = 1; - - /* * Pull out RTO information for this faddr and use it if we don't * have any yet. 
*/ - if (fp->srtt == -1 && ire->ire_uinfo.iulp_rtt != 0) { + if (fp->srtt == -1 && uinfo.iulp_rtt != 0) { /* The cached value is in ms. */ - fp->srtt = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt); - fp->rttvar = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt_sd); + fp->srtt = MSEC_TO_TICK(uinfo.iulp_rtt); + fp->rttvar = MSEC_TO_TICK(uinfo.iulp_rtt_sd); fp->rto = 3 * fp->srtt; /* Bound the RTO by configured min and max values */ @@ -205,6 +200,7 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) } SCTP_MAX_RTO(sctp, fp); } + pmtu = uinfo.iulp_mtu; /* * Record the MTU for this faddr. If the MTU for this faddr has @@ -215,9 +211,9 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) } else { hdrlen = sctp->sctp_hdr6_len; } - if ((fp->sfa_pmss + hdrlen) != ire->ire_max_frag) { + if ((fp->sfa_pmss + hdrlen) != pmtu) { /* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. */ - fp->sfa_pmss = (ire->ire_max_frag - hdrlen) & ~(SCTP_ALIGN - 1); + fp->sfa_pmss = (pmtu - hdrlen) & ~(SCTP_ALIGN - 1); if (fp->cwnd < (fp->sfa_pmss * 2)) { SET_CWND(fp, fp->sfa_pmss, sctps->sctps_slow_start_initial); @@ -230,28 +226,16 @@ check_current: } void -sctp_update_ire(sctp_t *sctp) +sctp_update_dce(sctp_t *sctp) { - ire_t *ire; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; + iulp_t uinfo; + ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; + uint_t ifindex; for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - if ((ire = fp->ire) == NULL) - continue; - mutex_enter(&ire->ire_lock); - - /* - * If the cached IRE is going away, there is no point to - * update it. - */ - if (ire->ire_marks & IRE_MARK_CONDEMNED) { - mutex_exit(&ire->ire_lock); - IRE_REFRELE_NOTR(ire); - fp->ire = NULL; - continue; - } - + bzero(&uinfo, sizeof (uinfo)); /* * Only record the PMTU for this faddr if we actually have * done discovery. 
This prevents initialized default from @@ -259,70 +243,60 @@ sctp_update_ire(sctp_t *sctp) */ if (fp->pmtu_discovered) { if (fp->isv4) { - ire->ire_max_frag = fp->sfa_pmss + + uinfo.iulp_mtu = fp->sfa_pmss + sctp->sctp_hdr_len; } else { - ire->ire_max_frag = fp->sfa_pmss + + uinfo.iulp_mtu = fp->sfa_pmss + sctp->sctp_hdr6_len; } } - if (sctps->sctps_rtt_updates != 0 && fp->rtt_updates >= sctps->sctps_rtt_updates) { /* - * If there is no old cached values, initialize them - * conservatively. Set them to be (1.5 * new value). - * This code copied from ip_ire_advise(). The cached - * value is in ms. + * dce_update_uinfo() merges these values with the + * old values. */ - if (ire->ire_uinfo.iulp_rtt != 0) { - ire->ire_uinfo.iulp_rtt = - (ire->ire_uinfo.iulp_rtt + - TICK_TO_MSEC(fp->srtt)) >> 1; - } else { - ire->ire_uinfo.iulp_rtt = - TICK_TO_MSEC(fp->srtt + (fp->srtt >> 1)); - } - if (ire->ire_uinfo.iulp_rtt_sd != 0) { - ire->ire_uinfo.iulp_rtt_sd = - (ire->ire_uinfo.iulp_rtt_sd + - TICK_TO_MSEC(fp->rttvar)) >> 1; + uinfo.iulp_rtt = TICK_TO_MSEC(fp->srtt); + uinfo.iulp_rtt_sd = TICK_TO_MSEC(fp->rttvar); + fp->rtt_updates = 0; + } + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&fp->faddr)) { + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (fp->ixa->ixa_nce != NULL) { + ifindex = fp->ixa->ixa_nce->nce_common-> + ncec_ill->ill_phyint->phyint_ifindex; } else { - ire->ire_uinfo.iulp_rtt_sd = - TICK_TO_MSEC(fp->rttvar + - (fp->rttvar >> 1)); + continue; } - fp->rtt_updates = 0; } - mutex_exit(&ire->ire_lock); + + (void) dce_update_uinfo(&fp->faddr, ifindex, &uinfo, ipst); } } /* - * The sender must set the total length in the IP header. - * If sendto == NULL, the current will be used. + * The sender must later set the total length in the IP header. 
*/ mblk_t * -sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) +sctp_make_mp(sctp_t *sctp, sctp_faddr_t *fp, int trailer) { mblk_t *mp; size_t ipsctplen; int isv4; - sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; boolean_t src_changed = B_FALSE; - ASSERT(sctp->sctp_current != NULL || sendto != NULL); - if (sendto == NULL) { - fp = sctp->sctp_current; - } else { - fp = sendto; - } + ASSERT(fp != NULL); isv4 = fp->isv4; - /* Try to look for another IRE again. */ - if (fp->ire == NULL) { - sctp_get_ire(sctp, fp); + if (SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr) || + (fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Need to pick a source */ + sctp_get_dest(sctp, fp); /* * Although we still may not get an IRE, the source address * may be changed in sctp_get_ire(). Set src_changed to @@ -334,7 +308,9 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) /* There is no suitable source address to use, return. */ if (fp->state == SCTP_FADDRS_UNREACH) return (NULL); - ASSERT(!SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)); + + ASSERT(fp->ixa->ixa_ire != NULL); + ASSERT(!SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr)); if (isv4) { ipsctplen = sctp->sctp_hdr_len; @@ -342,8 +318,7 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) ipsctplen = sctp->sctp_hdr6_len; } - mp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra + trailer, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + mp = allocb(ipsctplen + sctps->sctps_wroff_xtra + trailer, BPRI_MED); if (mp == NULL) { ip1dbg(("sctp_make_mp: error making mp..\n")); return (NULL); @@ -377,18 +352,6 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) } } ASSERT(sctp->sctp_connp != NULL); - - /* - * IP will not free this IRE if it is condemned. SCTP needs to - * free it. - */ - if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } - /* Stash the conn and ire ptr info. 
for IP */ - SCTP_STASH_IPINFO(mp, fp->ire); - return (mp); } @@ -410,17 +373,22 @@ sctp_set_ulp_prop(sctp_t *sctp) } ASSERT(sctp->sctp_ulpd); + sctp->sctp_connp->conn_wroff = sctps->sctps_wroff_xtra + hdrlen + + sizeof (sctp_data_hdr_t); + ASSERT(sctp->sctp_current->sfa_pmss == sctp->sctp_mss); bzero(&sopp, sizeof (sopp)); sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; - sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + - sizeof (sctp_data_hdr_t); + sopp.sopp_wroff = sctp->sctp_connp->conn_wroff; sopp.sopp_maxblk = sctp->sctp_mss - sizeof (sctp_data_hdr_t); sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); } +/* + * Set the lengths in the packet and the transmit attributes. + */ void -sctp_set_iplen(sctp_t *sctp, mblk_t *mp) +sctp_set_iplen(sctp_t *sctp, mblk_t *mp, ip_xmit_attr_t *ixa) { uint16_t sum = 0; ipha_t *iph; @@ -432,19 +400,15 @@ sctp_set_iplen(sctp_t *sctp, mblk_t *mp) for (; pmp; pmp = pmp->b_cont) sum += pmp->b_wptr - pmp->b_rptr; + ixa->ixa_pktlen = sum; if (isv4) { iph = (ipha_t *)mp->b_rptr; iph->ipha_length = htons(sum); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { ip6h = (ip6_t *)mp->b_rptr; - /* - * If an ip6i_t is present, the real IPv6 header - * immediately follows. - */ - if (ip6h->ip6_nxt == IPPROTO_RAW) - ip6h = (ip6_t *)&ip6h[1]; - ip6h->ip6_plen = htons(sum - ((char *)&sctp->sctp_ip6h[1] - - sctp->sctp_iphc6)); + ip6h->ip6_plen = htons(sum - IPV6_HDR_LEN); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } } @@ -501,21 +465,21 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first) sctp_faddr_t *faddr; mblk_t *timer_mp; int err; + conn_t *connp = sctp->sctp_connp; if (is_system_labeled()) { - cred_t *effective_cred; + ip_xmit_attr_t *ixa = connp->conn_ixa; + ts_label_t *effective_tsl = NULL; + + ASSERT(ixa->ixa_tsl != NULL); /* * Verify the destination is allowed to receive packets * at the security label of the connection we are initiating. 
* - * tsol_check_dest() will create a new effective cred for + * tsol_check_dest() will create a new effective label for * this connection with a modified label or label flags only - * if there are changes from the original cred. - * - * conn_effective_cred may be non-NULL if a previous - * faddr was already added or if this is a server - * accepting a connection on a multi-label port. + * if there are changes from the original label. * * Accept whatever label we get if this is the first * destination address for this connection. The security @@ -525,27 +489,28 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first) if (IN6_IS_ADDR_V4MAPPED(addr)) { uint32_t dst; IN6_V4MAPPED_TO_IPADDR(addr, dst); - err = tsol_check_dest(CONN_CRED(sctp->sctp_connp), - &dst, IPV4_VERSION, sctp->sctp_mac_mode, - &effective_cred); + err = tsol_check_dest(ixa->ixa_tsl, + &dst, IPV4_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); } else { - err = tsol_check_dest(CONN_CRED(sctp->sctp_connp), - addr, IPV6_VERSION, sctp->sctp_mac_mode, - &effective_cred); + err = tsol_check_dest(ixa->ixa_tsl, + addr, IPV6_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); } if (err != 0) return (err); - if (sctp->sctp_faddrs == NULL && - sctp->sctp_connp->conn_effective_cred == NULL) { - sctp->sctp_connp->conn_effective_cred = effective_cred; - } else if (effective_cred != NULL) { - crfree(effective_cred); + + if (sctp->sctp_faddrs == NULL && effective_tsl != NULL) { + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } else if (effective_tsl != NULL) { + label_rele(effective_tsl); return (EHOSTUNREACH); } } if ((faddr = kmem_cache_alloc(sctp_kmem_faddr_cache, sleep)) == NULL) return (ENOMEM); + bzero(faddr, sizeof (*faddr)); timer_mp = sctp_timer_alloc((sctp), sctp_rexmit_timer, sleep); if (timer_mp == NULL) { kmem_cache_free(sctp_kmem_faddr_cache, faddr); @@ -553,16 +518,19 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int 
sleep, boolean_t first) } ((sctpt_t *)(timer_mp->b_rptr))->sctpt_faddr = faddr; - sctp_init_faddr(sctp, faddr, addr, timer_mp); - - /* Check for subnet broadcast. */ - if (faddr->ire != NULL && faddr->ire->ire_type & IRE_BROADCAST) { - IRE_REFRELE_NOTR(faddr->ire); - sctp_timer_free(timer_mp); - faddr->timer_mp = NULL; + /* Start with any options set on the conn */ + faddr->ixa = conn_get_ixa_exclusive(connp); + if (faddr->ixa == NULL) { + freemsg(timer_mp); kmem_cache_free(sctp_kmem_faddr_cache, faddr); - return (EADDRNOTAVAIL); + return (ENOMEM); } + faddr->ixa->ixa_notify_cookie = connp->conn_sctp; + + sctp_init_faddr(sctp, faddr, addr, timer_mp); + ASSERT(faddr->ixa->ixa_cred != NULL); + + /* ip_attr_connect didn't allow broadcats/multicast dest */ ASSERT(faddr->next == NULL); if (sctp->sctp_faddrs == NULL) { @@ -644,7 +612,7 @@ sctp_redo_faddr_srcs(sctp_t *sctp) sctp_faddr_t *fp; for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); } } @@ -662,15 +630,17 @@ sctp_faddr_alive(sctp_t *sctp, sctp_faddr_t *fp) fp->state = SCTP_FADDRS_ALIVE; sctp_intf_event(sctp, fp->faddr, SCTP_ADDR_AVAILABLE, 0); /* Should have a full IRE now */ - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); /* * If this is the primary, switch back to it now. And * we probably want to reset the source addr used to reach * it. + * Note that if we didn't find a source in sctp_get_dest + * then we'd be unreachable at this point in time. 
*/ - if (fp == sctp->sctp_primary) { - ASSERT(fp->state != SCTP_FADDRS_UNREACH); + if (fp == sctp->sctp_primary && + fp->state != SCTP_FADDRS_UNREACH) { sctp_set_faddr_current(sctp, fp); return; } @@ -816,9 +786,9 @@ sctp_unlink_faddr(sctp_t *sctp, sctp_faddr_t *fp) fp->rc_timer_mp = NULL; fp->rc_timer_running = 0; } - if (fp->ire != NULL) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; } if (fp == sctp->sctp_faddrs) { @@ -837,7 +807,6 @@ gotit: fpp->next = fp->next; } mutex_exit(&sctp->sctp_conn_tfp->tf_lock); - /* XXX faddr2ire? */ kmem_cache_free(sctp_kmem_faddr_cache, fp); sctp->sctp_nfaddrs--; } @@ -866,8 +835,10 @@ sctp_zap_faddrs(sctp_t *sctp, int caller_holds_lock) for (fp = sctp->sctp_faddrs; fp; fp = fpn) { fpn = fp->next; - if (fp->ire != NULL) - IRE_REFRELE_NOTR(fp->ire); + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; + } kmem_cache_free(sctp_kmem_faddr_cache, fp); sctp->sctp_nfaddrs--; } @@ -888,242 +859,177 @@ sctp_zap_addrs(sctp_t *sctp) } /* - * Initialize the IPv4 header. Loses any record of any IP options. + * Build two SCTP header templates; one for IPv4 and one for IPv6. + * Store them in sctp_iphc and sctp_iphc6 respectively (and related fields). + * There are no IP addresses in the templates, but the port numbers and + * verifier are field in from the conn_t and sctp_t. + * + * Returns failure if can't allocate memory, or if there is a problem + * with a routing header/option. + * + * We allocate space for the minimum sctp header (sctp_hdr_t). + * + * We massage an routing option/header. There is no checksum implication + * for a routing header for sctp. + * + * Caller needs to update conn_wroff if desired. + * + * TSol notes: This assumes that a SCTP association has a single peer label + * since we only track a single pair of ipp_label_v4/v6 and not a separate one + * for each faddr. 
*/ int -sctp_header_init_ipv4(sctp_t *sctp, int sleep) +sctp_build_hdrs(sctp_t *sctp, int sleep) { + conn_t *connp = sctp->sctp_connp; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + uint_t ulp_hdr_length = sizeof (sctp_hdr_t); + ipha_t *ipha; + ip6_t *ip6h; sctp_hdr_t *sctph; - sctp_stack_t *sctps = sctp->sctp_sctps; + in6_addr_t v6src, v6dst; + ipaddr_t v4src, v4dst; - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - */ - if (sctp->sctp_iphc != NULL) { - ASSERT(sctp->sctp_iphc_len >= SCTP_MAX_COMBINED_HEADER_LENGTH); - bzero(sctp->sctp_iphc, sctp->sctp_iphc_len); - } else { - sctp->sctp_iphc_len = SCTP_MAX_COMBINED_HEADER_LENGTH; - sctp->sctp_iphc = kmem_zalloc(sctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc == NULL) { - sctp->sctp_iphc_len = 0; - return (ENOMEM); - } - } + v4src = connp->conn_saddr_v4; + v4dst = connp->conn_faddr_v4; + v6src = connp->conn_saddr_v6; + v6dst = connp->conn_faddr_v6; - sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc; + /* First do IPv4 header */ + ip_hdr_length = ip_total_hdrs_len_v4(ipp); - sctp->sctp_hdr_len = sizeof (ipha_t) + sizeof (sctp_hdr_t); - sctp->sctp_ip_hdr_len = sizeof (ipha_t); - sctp->sctp_ipha->ipha_length = htons(sizeof (ipha_t) + - sizeof (sctp_hdr_t)); - sctp->sctp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; + /* In case of TX label and IP options it can be too much */ + if (ip_hdr_length > IP_MAX_HDR_LENGTH) { + /* Preserves existing TX errno for this */ + return (EHOSTUNREACH); + } + hdrs_len = ip_hdr_length + ulp_hdr_length; + ASSERT(hdrs_len != 0); - /* - * These two fields should be zero, and are already set above. - * - * sctp->sctp_ipha->ipha_ident, - * sctp->sctp_ipha->ipha_fragment_offset_and_flags. 
- */ + if (hdrs_len != sctp->sctp_iphc_len) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, sleep); + if (hdrs == NULL) + return (ENOMEM); - sctp->sctp_ipha->ipha_ttl = sctps->sctps_ipv4_ttl; - sctp->sctp_ipha->ipha_protocol = IPPROTO_SCTP; + if (sctp->sctp_iphc != NULL) + kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len); + sctp->sctp_iphc = hdrs; + sctp->sctp_iphc_len = hdrs_len; + } else { + hdrs = sctp->sctp_iphc; + } + sctp->sctp_hdr_len = sctp->sctp_iphc_len; + sctp->sctp_ip_hdr_len = ip_hdr_length; - sctph = (sctp_hdr_t *)(sctp->sctp_iphc + sizeof (ipha_t)); + sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length); sctp->sctp_sctph = sctph; - - return (0); -} - -/* - * Update sctp_sticky_hdrs based on sctp_sticky_ipp. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension - * headers, and the maximum size sctp header (to avoid reallocation - * on the fly for additional sctp options). - * Returns failure if can't allocate memory. - */ -int -sctp_build_hdrs(sctp_t *sctp) -{ - char *hdrs; - uint_t hdrs_len; - ip6i_t *ip6i; - char buf[SCTP_MAX_HDR_LENGTH]; - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; - in6_addr_t src; - in6_addr_t dst; - sctp_stack_t *sctps = sctp->sctp_sctps; - - /* - * save the existing sctp header and source/dest IP addresses - */ - bcopy(sctp->sctp_sctph6, buf, sizeof (sctp_hdr_t)); - src = sctp->sctp_ip6h->ip6_src; - dst = sctp->sctp_ip6h->ip6_dst; - hdrs_len = ip_total_hdrs_len_v6(ipp) + SCTP_MAX_HDR_LENGTH; + sctph->sh_sport = connp->conn_lport; + sctph->sh_dport = connp->conn_fport; + sctph->sh_verf = sctp->sctp_fvtag; + sctph->sh_chksum = 0; + + ipha = (ipha_t *)hdrs; + sctp->sctp_ipha = ipha; + + ipha->ipha_src = v4src; + ipha->ipha_dst = v4dst; + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto); + ipha->ipha_length = htons(hdrs_len); + ipha->ipha_fragment_offset_and_flags = 0; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) + (void) ip_massage_options(ipha, connp->conn_netstack); + + /* Now 
IPv6 */ + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + hdrs_len = ip_hdr_length + ulp_hdr_length; ASSERT(hdrs_len != 0); - if (hdrs_len > sctp->sctp_iphc6_len) { - /* Need to reallocate */ - hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); + + if (hdrs_len != sctp->sctp_iphc6_len) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, sleep); if (hdrs == NULL) return (ENOMEM); - if (sctp->sctp_iphc6_len != 0) + if (sctp->sctp_iphc6 != NULL) kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len); sctp->sctp_iphc6 = hdrs; sctp->sctp_iphc6_len = hdrs_len; - } - ip_build_hdrs_v6((uchar_t *)sctp->sctp_iphc6, - hdrs_len - SCTP_MAX_HDR_LENGTH, ipp, IPPROTO_SCTP); - - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)sctp->sctp_iphc6; - sctp->sctp_ip6h = (ip6_t *)&ip6i[1]; } else { - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; + hdrs = sctp->sctp_iphc6; } - /* - * sctp->sctp_ip_hdr_len will include ip6i_t if there is one. - */ - sctp->sctp_ip_hdr6_len = hdrs_len - SCTP_MAX_HDR_LENGTH; - sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 + - sctp->sctp_ip_hdr6_len); - sctp->sctp_hdr6_len = sctp->sctp_ip_hdr6_len + sizeof (sctp_hdr_t); - - bcopy(buf, sctp->sctp_sctph6, sizeof (sctp_hdr_t)); + sctp->sctp_hdr6_len = sctp->sctp_iphc6_len; + sctp->sctp_ip_hdr6_len = ip_hdr_length; - sctp->sctp_ip6h->ip6_src = src; - sctp->sctp_ip6h->ip6_dst = dst; - /* - * If the hoplimit was not set by ip_build_hdrs_v6(), we need to - * set it to the default value for SCTP. - */ - if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit; - /* - * If we're setting extension headers after a connection - * has been established, and if we have a routing header - * among the extension headers, call ip_massage_options_v6 to - * manipulate the routing header/ip6_dst set the checksum - * difference in the sctp header template. 
- * (This happens in sctp_connect_ipv6 if the routing header - * is set prior to the connect.) - */ - - if ((sctp->sctp_state >= SCTPS_COOKIE_WAIT) && - (sctp->sctp_sticky_ipp.ipp_fields & IPPF_RTHDR)) { - ip6_rthdr_t *rth; - - rth = ip_find_rthdr_v6(sctp->sctp_ip6h, - (uint8_t *)sctp->sctp_sctph6); + sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length); + sctp->sctp_sctph6 = sctph; + sctph->sh_sport = connp->conn_lport; + sctph->sh_dport = connp->conn_fport; + sctph->sh_verf = sctp->sctp_fvtag; + sctph->sh_chksum = 0; + + ip6h = (ip6_t *)hdrs; + sctp->sctp_ip6h = ip6h; + + ip6h->ip6_src = v6src; + ip6h->ip6_dst = v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto, + connp->conn_flowinfo); + ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + uint8_t *end; + ip6_rthdr_t *rth; + + end = (uint8_t *)ip6h + ip_hdr_length; + rth = ip_find_rthdr_v6(ip6h, end); if (rth != NULL) { - (void) ip_massage_options_v6(sctp->sctp_ip6h, rth, - sctps->sctps_netstack); + (void) ip_massage_options_v6(ip6h, rth, + connp->conn_netstack); } - } - return (0); -} -/* - * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. - */ -int -sctp_header_init_ipv6(sctp_t *sctp, int sleep) -{ - sctp_hdr_t *sctph; - sctp_stack_t *sctps = sctp->sctp_sctps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - * Ensure that there is enough space to "downgrade" the sctp_t - * to an IPv4 sctp_t. 
This requires having space for a full load - * of IPv4 options - */ - if (sctp->sctp_iphc6 != NULL) { - ASSERT(sctp->sctp_iphc6_len >= - SCTP_MAX_COMBINED_HEADER_LENGTH); - bzero(sctp->sctp_iphc6, sctp->sctp_iphc6_len); - } else { - sctp->sctp_iphc6_len = SCTP_MAX_COMBINED_HEADER_LENGTH; - sctp->sctp_iphc6 = kmem_zalloc(sctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc6 == NULL) { - sctp->sctp_iphc6_len = 0; - return (ENOMEM); - } + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) + return (EADDRNOTAVAIL); } - sctp->sctp_hdr6_len = IPV6_HDR_LEN + sizeof (sctp_hdr_t); - sctp->sctp_ip_hdr6_len = IPV6_HDR_LEN; - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; - - /* Initialize the header template */ - - sctp->sctp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - sctp->sctp_ip6h->ip6_plen = ntohs(sizeof (sctp_hdr_t)); - sctp->sctp_ip6h->ip6_nxt = IPPROTO_SCTP; - sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit; - - sctph = (sctp_hdr_t *)(sctp->sctp_iphc6 + IPV6_HDR_LEN); - sctp->sctp_sctph6 = sctph; - return (0); } static int -sctp_v4_label(sctp_t *sctp) +sctp_v4_label(sctp_t *sctp, sctp_faddr_t *fp) { - uchar_t optbuf[IP_MAX_OPT_LENGTH]; - const cred_t *cr = CONN_CRED(sctp->sctp_connp); - int added; + conn_t *connp = sctp->sctp_connp; - if (tsol_compute_label(cr, sctp->sctp_ipha->ipha_dst, optbuf, - sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0) - return (EACCES); - - added = tsol_remove_secopt(sctp->sctp_ipha, sctp->sctp_hdr_len); - if (added == -1) - return (EACCES); - sctp->sctp_hdr_len += added; - sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph + added); - sctp->sctp_ip_hdr_len += added; - if ((sctp->sctp_v4label_len = optbuf[IPOPT_OLEN]) != 0) { - sctp->sctp_v4label_len = (sctp->sctp_v4label_len + 3) & ~3; - added = tsol_prepend_option(optbuf, sctp->sctp_ipha, - sctp->sctp_hdr_len); - if (added == -1) - 
return (EACCES); - sctp->sctp_hdr_len += added; - sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph + - added); - sctp->sctp_ip_hdr_len += added; - } - return (0); + ASSERT(fp->ixa->ixa_flags & IXAF_IS_IPV4); + return (conn_update_label(connp, fp->ixa, &fp->faddr, + &connp->conn_xmit_ipp)); } static int -sctp_v6_label(sctp_t *sctp) +sctp_v6_label(sctp_t *sctp, sctp_faddr_t *fp) { - uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; - const cred_t *cr = CONN_CRED(sctp->sctp_connp); + conn_t *connp = sctp->sctp_connp; - if (tsol_compute_label_v6(cr, &sctp->sctp_ip6h->ip6_dst, optbuf, - sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0) - return (EACCES); - if (tsol_update_sticky(&sctp->sctp_sticky_ipp, &sctp->sctp_v6label_len, - optbuf) != 0) - return (EACCES); - if (sctp_build_hdrs(sctp) != 0) - return (EACCES); - return (0); + ASSERT(!(fp->ixa->ixa_flags & IXAF_IS_IPV4)); + return (conn_update_label(connp, fp->ixa, &fp->faddr, + &connp->conn_xmit_ipp)); } /* * XXX implement more sophisticated logic + * + * Tsol note: We have already verified the addresses using tsol_check_dest + * in sctp_add_faddr, thus no need to redo that here. + * We do setup ipp_label_v4 and ipp_label_v6 based on which addresses + * we have. 
*/ int sctp_set_hdraddrs(sctp_t *sctp) @@ -1131,50 +1037,43 @@ sctp_set_hdraddrs(sctp_t *sctp) sctp_faddr_t *fp; int gotv4 = 0; int gotv6 = 0; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp->sctp_faddrs != NULL); ASSERT(sctp->sctp_nsaddrs > 0); /* Set up using the primary first */ + connp->conn_faddr_v6 = sctp->sctp_primary->faddr; + /* saddr may be unspec; make_mp() will handle this */ + connp->conn_saddr_v6 = sctp->sctp_primary->saddr; + connp->conn_laddr_v6 = connp->conn_saddr_v6; if (IN6_IS_ADDR_V4MAPPED(&sctp->sctp_primary->faddr)) { - IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->faddr, - sctp->sctp_ipha->ipha_dst); - /* saddr may be unspec; make_mp() will handle this */ - IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->saddr, - sctp->sctp_ipha->ipha_src); - if (!is_system_labeled() || sctp_v4_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v4_label(sctp, sctp->sctp_primary) == 0) { gotv4 = 1; - if (sctp->sctp_ipversion == IPV4_VERSION) { - goto copyports; + if (connp->conn_family == AF_INET) { + goto done; } } } else { - sctp->sctp_ip6h->ip6_dst = sctp->sctp_primary->faddr; - /* saddr may be unspec; make_mp() will handle this */ - sctp->sctp_ip6h->ip6_src = sctp->sctp_primary->saddr; - if (!is_system_labeled() || sctp_v6_label(sctp) == 0) + if (!is_system_labeled() || + sctp_v6_label(sctp, sctp->sctp_primary) == 0) { gotv6 = 1; + } } for (fp = sctp->sctp_faddrs; fp; fp = fp->next) { if (!gotv4 && IN6_IS_ADDR_V4MAPPED(&fp->faddr)) { - IN6_V4MAPPED_TO_IPADDR(&fp->faddr, - sctp->sctp_ipha->ipha_dst); - /* copy in the faddr_t's saddr */ - IN6_V4MAPPED_TO_IPADDR(&fp->saddr, - sctp->sctp_ipha->ipha_src); - if (!is_system_labeled() || sctp_v4_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v4_label(sctp, fp) == 0) { gotv4 = 1; - if (sctp->sctp_ipversion == IPV4_VERSION || - gotv6) { + if (connp->conn_family == AF_INET || gotv6) { break; } } } else if (!gotv6 && !IN6_IS_ADDR_V4MAPPED(&fp->faddr)) { - sctp->sctp_ip6h->ip6_dst = fp->faddr; - /* copy in 
the faddr_t's saddr */ - sctp->sctp_ip6h->ip6_src = fp->saddr; - if (!is_system_labeled() || sctp_v6_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v6_label(sctp, fp) == 0) { gotv6 = 1; if (gotv4) break; @@ -1182,16 +1081,10 @@ sctp_set_hdraddrs(sctp_t *sctp) } } -copyports: +done: if (!gotv4 && !gotv6) return (EACCES); - /* copy in the ports for good measure */ - sctp->sctp_sctph->sh_sport = sctp->sctp_lport; - sctp->sctp_sctph->sh_dport = sctp->sctp_fport; - - sctp->sctp_sctph6->sh_sport = sctp->sctp_lport; - sctp->sctp_sctph6->sh_dport = sctp->sctp_fport; return (0); } @@ -1343,6 +1236,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, boolean_t check_saddr = B_TRUE; in6_addr_t curaddr; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (sctp_options != NULL) *sctp_options = 0; @@ -1473,8 +1367,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, if (ta == 0 || ta == INADDR_BROADCAST || ta == htonl(INADDR_LOOPBACK) || - CLASSD(ta) || - sctp->sctp_connp->conn_ipv6_v6only) { + CLASSD(ta) || connp->conn_ipv6_v6only) { goto next; } IN6_INADDR_TO_V4MAPPED((struct in_addr *) @@ -1492,7 +1385,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, goto next; } } else if (ph->sph_type == htons(PARM_ADDR6) && - sctp->sctp_family == AF_INET6) { + connp->conn_family == AF_INET6) { /* An v4 socket should not take v6 addresses. 
*/ if (remaining >= PARM_ADDR6_LEN) { in6_addr_t *addr6; @@ -1567,7 +1460,7 @@ next: } bcopy(&curaddr, dlist, sizeof (curaddr)); sctp_get_faddr_list(sctp, alist, asize); - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize, + (*cl_sctp_assoc_change)(connp->conn_family, alist, asize, sctp->sctp_nfaddrs, dlist, dsize, 1, SCTP_CL_PADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -1581,7 +1474,7 @@ next: */ int sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports, - int sleep, sctp_stack_t *sctps) + int sleep, sctp_stack_t *sctps, ip_recv_attr_t *ira) { sctp_faddr_t *fp, *fphead = NULL; sctp_parm_hdr_t *ph; @@ -1696,7 +1589,7 @@ sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports, mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) { - if (ports != sctp->sctp_ports) { + if (ports != sctp->sctp_connp->conn_ports) { continue; } compres = sctp_compare_faddrsets(fphead, sctp->sctp_faddrs); @@ -1776,7 +1669,8 @@ done: /* Send off the abort */ sctp_send_abort(sctp, sctp_init2vtag(ich), - SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE); + SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE, + ira); kmem_free(dtail, PARM_ADDR6_LEN * nadded); } @@ -1787,6 +1681,10 @@ cleanup: sctp_faddr_t *fpn; for (fp = fphead; fp; fp = fpn) { fpn = fp->next; + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; + } kmem_cache_free(sctp_kmem_faddr_cache, fp); } } @@ -1850,6 +1748,8 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, { sctp_stack_t *sctps = sctp->sctp_sctps; + ASSERT(fp->ixa != NULL); + bcopy(addr, &fp->faddr, sizeof (*addr)); if (IN6_IS_ADDR_V4MAPPED(addr)) { fp->isv4 = 1; @@ -1857,11 +1757,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, fp->sfa_pmss = (sctps->sctps_initial_mtu - sctp->sctp_hdr_len) & ~(SCTP_ALIGN - 1); + fp->ixa->ixa_flags |= IXAF_IS_IPV4; } else { 
fp->isv4 = 0; fp->sfa_pmss = (sctps->sctps_initial_mtu - sctp->sctp_hdr6_len) & ~(SCTP_ALIGN - 1); + fp->ixa->ixa_flags &= ~IXAF_IS_IPV4; } fp->cwnd = sctps->sctps_slow_start_initial * fp->sfa_pmss; fp->rto = MIN(sctp->sctp_rto_initial, sctp->sctp_init_rto_max); @@ -1884,14 +1786,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, fp->df = 1; fp->pmtu_discovered = 0; fp->next = NULL; - fp->ire = NULL; fp->T3expire = 0; (void) random_get_pseudo_bytes((uint8_t *)&fp->hb_secret, sizeof (fp->hb_secret)); fp->hb_expiry = lbolt64; fp->rxt_unacked = 0; - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); } /*ARGSUSED*/ diff --git a/usr/src/uts/common/inet/sctp/sctp_conn.c b/usr/src/uts/common/inet/sctp/sctp_conn.c index 60c22a3673..7dc048f919 100644 --- a/usr/src/uts/common/inet/sctp/sctp_conn.c +++ b/usr/src/uts/common/inet/sctp/sctp_conn.c @@ -64,38 +64,19 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, uint_t sctp_options; conn_t *aconnp; conn_t *lconnp; - cred_t *credp; - ts_label_t *tslp; sctp_stack_t *sctps = listener->sctp_sctps; sctph = (sctp_hdr_t *)(cr_pkt->b_rptr + ip_hdr_len); ASSERT(OK_32PTR(sctph)); - acceptor->sctp_lport = listener->sctp_lport; - acceptor->sctp_fport = sctph->sh_sport; + aconnp = acceptor->sctp_connp; + lconnp = listener->sctp_connp; + aconnp->conn_lport = lconnp->conn_lport; + aconnp->conn_fport = sctph->sh_sport; ich = (sctp_chunk_hdr_t *)(iack + 1); init = (sctp_init_chunk_t *)(ich + 1); - /* - * If this is an MLP connection, packets are to be - * exchanged using the security label of the received - * Cookie packet instead of the server application's label. - * Create an effective cred for the connection by attaching - * the received packet's security label to the server - * application's cred. 
- */ - aconnp = acceptor->sctp_connp; - lconnp = listener->sctp_connp; - ASSERT(aconnp->conn_effective_cred == NULL); - if (lconnp->conn_mlp_type != mlptSingle && - (credp = msg_getcred(cr_pkt, NULL)) != NULL && - (tslp = crgetlabel(credp)) != NULL) { - if ((aconnp->conn_effective_cred = copycred_from_tslabel( - aconnp->conn_cred, tslp, KM_NOSLEEP)) == NULL) - return (ENOMEM); - } - /* acceptor isn't in any fanouts yet, so don't need to hold locks */ ASSERT(acceptor->sctp_faddrs == NULL); err = sctp_get_addrparams(acceptor, listener, cr_pkt, ich, @@ -106,14 +87,15 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, if ((err = sctp_set_hdraddrs(acceptor)) != 0) return (err); + if ((err = sctp_build_hdrs(acceptor, KM_NOSLEEP)) != 0) + return (err); + if ((sctp_options & SCTP_PRSCTP_OPTION) && listener->sctp_prsctp_aware && sctps->sctps_prsctp_enabled) { acceptor->sctp_prsctp_aware = B_TRUE; } else { acceptor->sctp_prsctp_aware = B_FALSE; } - /* The new sctp_t is fully bound now. 
*/ - acceptor->sctp_connp->conn_fully_bound = B_TRUE; /* Get initial TSNs */ acceptor->sctp_ltsn = ntohl(iack->sic_inittsn); @@ -142,9 +124,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, RUN_SCTP(acceptor); sctp_conn_hash_insert(&sctps->sctps_conn_fanout[ - SCTP_CONN_HASH(sctps, acceptor->sctp_ports)], acceptor, 0); + SCTP_CONN_HASH(sctps, aconnp->conn_ports)], acceptor, 0); sctp_bind_hash_insert(&sctps->sctps_bind_fanout[ - SCTP_BIND_HASH(ntohs(acceptor->sctp_lport))], acceptor, 0); + SCTP_BIND_HASH(ntohs(aconnp->conn_lport))], acceptor, 0); /* * No need to check for multicast destination since ip will only pass @@ -170,10 +152,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, /* Process the COOKIE packet, mp, directed at the listener 'sctp' */ sctp_t * sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, - sctp_init_chunk_t *iack, mblk_t *ipsec_mp) + sctp_init_chunk_t *iack, ip_recv_attr_t *ira) { sctp_t *eager; - uint_t ipvers; ip6_t *ip6h; int err; conn_t *connp, *econnp; @@ -181,6 +162,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, struct sock_proto_props sopp; cred_t *cr; pid_t cpid; + in6_addr_t faddr, laddr; + ip_xmit_attr_t *ixa; /* * No need to check for duplicate as this is the listener @@ -189,89 +172,116 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, * fanout already done cannot find a match, it means that * there is no duplicate. 
*/ - ipvers = IPH_HDR_VERSION(mp->b_rptr); - ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); ASSERT(OK_32PTR(mp->b_rptr)); if ((eager = sctp_create_eager(sctp)) == NULL) { return (NULL); } - if (ipvers != IPV4_VERSION) { - ip6h = (ip6_t *)mp->b_rptr; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) - eager->sctp_linklocal = 1; - /* - * Record ifindex (might be zero) to tie this connection to - * that interface if either the listener was bound or - * if the connection is using link-local addresses. - */ - if (sctp->sctp_bound_if == ifindex || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) - eager->sctp_bound_if = ifindex; - /* - * XXX broken. bound_if is always overwritten by statement - * below. What is the right thing to do here? - */ - eager->sctp_bound_if = sctp->sctp_bound_if; - } - connp = sctp->sctp_connp; sctps = sctp->sctp_sctps; econnp = eager->sctp_connp; if (connp->conn_policy != NULL) { - ipsec_in_t *ii; - - ASSERT(ipsec_mp != NULL); - ii = (ipsec_in_t *)(ipsec_mp->b_rptr); - ASSERT(ii->ipsec_in_policy == NULL); - IPPH_REFHOLD(connp->conn_policy); - ii->ipsec_in_policy = connp->conn_policy; - - ipsec_mp->b_datap->db_type = IPSEC_POLICY_SET; - if (!ip_bind_ipsec_policy_set(econnp, ipsec_mp)) { + /* Inherit the policy from the listener; use actions from ira */ + if (!ip_ipsec_policy_inherit(econnp, connp, ira)) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } } - if (ipsec_mp != NULL) { + ip6h = (ip6_t *)mp->b_rptr; + if (ira->ira_flags & IXAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)ip6h; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &laddr); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &faddr); + } else { + laddr = ip6h->ip6_dst; + faddr = ip6h->ip6_src; + } + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* * XXX need to fix the cached policy issue here. 
- * We temporarily set the conn_src/conn_rem here so + * We temporarily set the conn_laddr/conn_faddr here so * that IPsec can use it for the latched policy * selector. This is obvioursly wrong as SCTP can * use different addresses... */ - if (ipvers == IPV4_VERSION) { - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; - econnp->conn_src = ipha->ipha_dst; - econnp->conn_rem = ipha->ipha_src; - } else { - econnp->conn_srcv6 = ip6h->ip6_dst; - econnp->conn_remv6 = ip6h->ip6_src; - } + econnp->conn_laddr_v6 = laddr; + econnp->conn_faddr_v6 = faddr; + econnp->conn_saddr_v6 = laddr; } - if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { + if (ipsec_conn_cache_policy(econnp, + (ira->ira_flags & IRAF_IS_IPV4) != 0) != 0) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } /* Save for getpeerucred */ - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; + + if (is_system_labeled()) { + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + + if ((connp->conn_mlp_type != mlptSingle || + connp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * Cookie packet instead of the server application's + * label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. 
+ */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); + } + } err = sctp_accept_comm(sctp, eager, mp, ip_hdr_len, iack); - if (err) { + if (err != 0) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } + ASSERT(eager->sctp_current->ixa != NULL); + + ixa = eager->sctp_current->ixa; + if (!(ira->ira_flags & IXAF_IS_IPV4)) { + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); + + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { + eager->sctp_linklocal = 1; + + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ifindex; + econnp->conn_incoming_ifindex = ifindex; + } + } + /* * On a clustered note send this notification to the clustering * subsystem. @@ -299,9 +309,9 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, /* The clustering module frees these list */ sctp_get_saddr_list(eager, slist, ssize); sctp_get_faddr_list(eager, flist, fsize); - (*cl_sctp_connect)(eager->sctp_family, slist, - eager->sctp_nsaddrs, eager->sctp_lport, flist, - eager->sctp_nfaddrs, eager->sctp_fport, B_FALSE, + (*cl_sctp_connect)(econnp->conn_family, slist, + eager->sctp_nsaddrs, econnp->conn_lport, flist, + eager->sctp_nfaddrs, econnp->conn_fport, B_FALSE, (cl_sctp_handle_t)eager); } @@ -318,7 +328,7 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, bzero(&sopp, sizeof (sopp)); sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; sopp.sopp_maxblk = strmsgsz; - if (eager->sctp_family == AF_INET) { + if (econnp->conn_family == AF_INET) { sopp.sopp_wroff = sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len; } else { @@ -335,7 +345,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, * with an OK ack. 
*/ int -sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) +sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen, + cred_t *cr, pid_t pid) { sin_t *sin; sin6_t *sin6; @@ -346,18 +357,18 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) sctp_t *lsctp; char buf[INET6_ADDRSTRLEN]; int sleep = sctp->sctp_cansleep ? KM_SLEEP : KM_NOSLEEP; - int hdrlen; - ip6_rthdr_t *rth; int err; sctp_faddr_t *cur_fp; sctp_stack_t *sctps = sctp->sctp_sctps; - struct sock_proto_props sopp; + conn_t *connp = sctp->sctp_connp; + uint_t scope_id = 0; + ip_xmit_attr_t *ixa; /* * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. */ if (addrlen < sizeof (sin_t)) { return (EINVAL); @@ -372,7 +383,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ip0dbg(("sctp_connect: non-unicast\n")); return (EINVAL); } - if (sctp->sctp_connp->conn_ipv6_v6only) + if (connp->conn_ipv6_v6only) return (EAFNOSUPPORT); /* convert to v6 mapped */ @@ -397,11 +408,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &dstaddr); } dstport = sin->sin_port; - if (sin->sin_family == AF_INET) { - hdrlen = sctp->sctp_hdr_len; - } else { - hdrlen = sctp->sctp_hdr6_len; - } break; case AF_INET6: sin6 = (sin6_t *)dst; @@ -411,7 +417,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ip0dbg(("sctp_connect: non-unicast\n")); return (EINVAL); } - if (sctp->sctp_connp->conn_ipv6_v6only && + if (connp->conn_ipv6_v6only && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { return (EAFNOSUPPORT); } @@ -420,11 +426,13 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) dstaddr = ipv6_loopback; } else { dstaddr = sin6->sin6_addr; - if 
(IN6_IS_ADDR_LINKLOCAL(&dstaddr)) + if (IN6_IS_ADDR_LINKLOCAL(&dstaddr)) { sctp->sctp_linklocal = 1; + scope_id = sin6->sin6_scope_id; + } } dstport = sin6->sin6_port; - hdrlen = sctp->sctp_hdr6_len; + connp->conn_flowinfo = sin6->sin6_flowinfo; break; default: dprint(1, ("sctp_connect: unknown family %d\n", @@ -437,12 +445,29 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) RUN_SCTP(sctp); - if (sctp->sctp_family != dst->sa_family || - (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) { + if (connp->conn_family != dst->sa_family || + (connp->conn_state_flags & CONN_CLOSING)) { WAKE_SCTP(sctp); return (EINVAL); } + /* We update our cred/cpid based on the caller of connect */ + if (connp->conn_cred != cr) { + crhold(cr); + crfree(connp->conn_cred); + connp->conn_cred = cr; + } + connp->conn_cpid = pid; + + /* Cache things in conn_ixa without any refhold */ + ixa = connp->conn_ixa; + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + switch (sctp->sctp_state) { case SCTPS_IDLE: { struct sockaddr_storage ss; @@ -459,7 +484,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ASSERT(sctp->sctp_nsaddrs == 0); bzero(&ss, sizeof (ss)); - ss.ss_family = sctp->sctp_family; + ss.ss_family = connp->conn_family; WAKE_SCTP(sctp); if ((err = sctp_bind(sctp, (struct sockaddr *)&ss, sizeof (ss))) != 0) { @@ -474,7 +499,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* do the connect */ /* XXX check for attempt to connect to self */ - sctp->sctp_fport = dstport; + connp->conn_fport = dstport; ASSERT(sctp->sctp_iphc); ASSERT(sctp->sctp_iphc6); @@ -487,9 +512,9 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) */ sctp_conn_hash_remove(sctp); tbf = &sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps, - sctp->sctp_ports)]; + connp->conn_ports)]; 
mutex_enter(&tbf->tf_lock); - lsctp = sctp_lookup(sctp, &dstaddr, tbf, &sctp->sctp_ports, + lsctp = sctp_lookup(sctp, &dstaddr, tbf, &connp->conn_ports, SCTPS_COOKIE_WAIT); if (lsctp != NULL) { /* found a duplicate connection */ @@ -498,6 +523,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) WAKE_SCTP(sctp); return (EADDRINUSE); } + /* * OK; set up the peer addr (this may grow after we get * the INIT ACK from the peer with additional addresses). @@ -509,6 +535,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) return (err); } cur_fp = sctp->sctp_faddrs; + ASSERT(cur_fp->ixa != NULL); /* No valid src addr, return. */ if (cur_fp->state == SCTP_FADDRS_UNREACH) { @@ -523,6 +550,16 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) sctp_conn_hash_insert(tbf, sctp, 1); mutex_exit(&tbf->tf_lock); + ixa = cur_fp->ixa; + ASSERT(ixa->ixa_cred != NULL); + + if (scope_id != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scope_id; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + } + /* initialize composite headers */ if ((err = sctp_set_hdraddrs(sctp)) != 0) { sctp_conn_hash_remove(sctp); @@ -530,15 +567,10 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) return (err); } - /* - * Massage a routing header (if present) putting the first hop - * in ip6_dst. - */ - rth = ip_find_rthdr_v6(sctp->sctp_ip6h, - (uint8_t *)sctp->sctp_sctph6); - if (rth != NULL) { - (void) ip_massage_options_v6(sctp->sctp_ip6h, rth, - sctps->sctps_netstack); + if ((err = sctp_build_hdrs(sctp, KM_SLEEP)) != 0) { + sctp_conn_hash_remove(sctp); + WAKE_SCTP(sctp); + return (err); } /* @@ -556,9 +588,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* Mark this address as alive */ cur_fp->state = SCTP_FADDRS_ALIVE; - /* This sctp_t is fully bound now. 
*/ - sctp->sctp_connp->conn_fully_bound = B_TRUE; - /* Send the INIT to the peer */ SCTP_FADDR_TIMER_RESTART(sctp, cur_fp, cur_fp->rto); sctp->sctp_state = SCTPS_COOKIE_WAIT; @@ -567,7 +596,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) * address list, so take the hash lock. */ mutex_enter(&tbf->tf_lock); - initmp = sctp_init_mp(sctp); + initmp = sctp_init_mp(sctp, cur_fp); if (initmp == NULL) { mutex_exit(&tbf->tf_lock); /* @@ -605,24 +634,20 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* The clustering module frees the lists */ sctp_get_saddr_list(sctp, slist, ssize); sctp_get_faddr_list(sctp, flist, fsize); - (*cl_sctp_connect)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport, - flist, sctp->sctp_nfaddrs, sctp->sctp_fport, + (*cl_sctp_connect)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport, + flist, sctp->sctp_nfaddrs, connp->conn_fport, B_TRUE, (cl_sctp_handle_t)sctp); } - WAKE_SCTP(sctp); - /* OK to call IP_PUT() here instead of sctp_add_sendq(). 
*/ - CONN_INC_REF(sctp->sctp_connp); - initmp->b_flag |= MSGHASREF; - IP_PUT(initmp, sctp->sctp_connp, sctp->sctp_current->isv4); + ASSERT(ixa->ixa_cred != NULL); + ASSERT(ixa->ixa_ire != NULL); + + (void) conn_ip_output(initmp, ixa); BUMP_LOCAL(sctp->sctp_opkts); + WAKE_SCTP(sctp); notify_ulp: - bzero(&sopp, sizeof (sopp)); - sopp.sopp_flags = SOCKOPT_WROFF; - sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + - sizeof (sctp_data_hdr_t); - sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); + sctp_set_ulp_prop(sctp); return (0); default: diff --git a/usr/src/uts/common/inet/sctp/sctp_cookie.c b/usr/src/uts/common/inet/sctp/sctp_cookie.c index 601938c928..4baf0a7147 100644 --- a/usr/src/uts/common/inet/sctp/sctp_cookie.c +++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c @@ -40,6 +40,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> #include "sctp_impl.h" @@ -156,7 +157,7 @@ hmac_md5(uchar_t *text, size_t text_len, uchar_t *key, size_t key_len, static int validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_init_chunk_t *init, mblk_t *inmp, sctp_parm_hdr_t **want_cookie, - mblk_t **errmp, int *supp_af, uint_t *sctp_options) + mblk_t **errmp, int *supp_af, uint_t *sctp_options, ip_recv_attr_t *ira) { sctp_parm_hdr_t *cph; sctp_init_chunk_t *ic; @@ -168,6 +169,7 @@ validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch, boolean_t got_errchunk = B_FALSE; uint16_t ptype; sctp_mpc_t mpc; + conn_t *connp = sctp->sctp_connp; ASSERT(errmp != NULL); @@ -336,8 +338,8 @@ done: * is NULL. 
*/ if (want_cookie == NULL && - ((sctp->sctp_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) || - (sctp->sctp_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) && + ((connp->conn_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) || + (connp->conn_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) && sctp->sctp_connp->conn_ipv6_v6only))) { dprint(1, ("sctp:validate_init_params: supp addr\n")); serror = SCTP_ERR_BAD_ADDR; @@ -353,7 +355,7 @@ cookie_abort: dprint(1, ("validate_init_params: cookie absent\n")); sctp_send_abort(sctp, sctp_init2vtag(ch), SCTP_ERR_MISSING_PARM, - (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE); + (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE, ira); return (0); } @@ -365,7 +367,7 @@ abort: return (0); sctp_send_abort(sctp, sctp_init2vtag(ch), serror, details, - errlen, inmp, 0, B_FALSE); + errlen, inmp, 0, B_FALSE, ira); return (0); } @@ -453,14 +455,17 @@ cl_sctp_cookie_paddr(sctp_chunk_hdr_t *ch, in6_addr_t *addr) sizeof (sctp_parm_hdr_t) + /* param header */ \ 16 /* MD5 hash */ +/* + * Note that sctp is the listener, hence we shouldn't modify it. 
+ */ void sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, - mblk_t *initmp) + mblk_t *initmp, ip_recv_attr_t *ira) { ipha_t *initiph; ip6_t *initip6h; - ipha_t *iackiph; - ip6_t *iackip6h; + ipha_t *iackiph = NULL; + ip6_t *iackip6h = NULL; sctp_chunk_hdr_t *iack_ch; sctp_init_chunk_t *iack; sctp_init_chunk_t *init; @@ -485,10 +490,10 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, mblk_t *errmp = NULL; boolean_t initcollision = B_FALSE; boolean_t linklocal = B_FALSE; - cred_t *cr; - pid_t pid; - ts_label_t *initlabel; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; + int err; + ip_xmit_attr_t *ixa = NULL; BUMP_LOCAL(sctp->sctp_ibchunks); isv4 = (IPH_HDR_VERSION(initmp->b_rptr) == IPV4_VERSION); @@ -501,21 +506,24 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, } else { initip6h = (ip6_t *)initmp->b_rptr; ipsctplen = sctp->sctp_ip_hdr6_len; - if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src)) + if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_dst)) linklocal = B_TRUE; supp_af |= PARM_SUPP_V6; + if (!sctp->sctp_connp->conn_ipv6_v6only) + supp_af |= PARM_SUPP_V4; } ASSERT(OK_32PTR(initsh)); init = (sctp_init_chunk_t *)((char *)(initsh + 1) + sizeof (*iack_ch)); /* Make sure we like the peer's parameters */ if (validate_init_params(sctp, ch, init, initmp, NULL, &errmp, - &supp_af, &sctp_options) == 0) { + &supp_af, &sctp_options, ira) == 0) { return; } if (errmp != NULL) errlen = msgdsize(errmp); - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { /* * Irregardless of the supported address in the INIT, v4 * must be supported. @@ -580,43 +588,65 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, } /* - * If the listen socket is bound to a trusted extensions - * multi-label port, attach a copy of the listener's cred - * to the new INITACK mblk. 
Modify the cred to contain + * Base the transmission on any routing-related socket options + * that have been set on the listener. + */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + sctp_send_abort(sctp, sctp_init2vtag(ch), + SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira); + return; + } + ixa->ixa_flags &= ~IXAF_VERIFY_PMTU; + + if (isv4) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + /* + * If the listen socket is bound to a trusted extensions multi-label + * port, a MAC-Exempt connection with an unlabeled node, we use the * the security label of the received INIT packet. * If not a multi-label port, attach the unmodified - * listener's cred directly. + * listener's label directly. * * We expect Sun developed kernel modules to properly set * cred labels for sctp connections. We can't be so sure this * will be done correctly when 3rd party kernel modules - * directly use sctp. The initlabel panic guard logic was - * added to cover this possibility. + * directly use sctp. We check for a NULL ira_tsl to cover this + * possibility. 
*/ - if (sctp->sctp_connp->conn_mlp_type != mlptSingle) { - cr = msg_getcred(initmp, &pid); - if (cr == NULL || (initlabel = crgetlabel(cr)) == NULL) { - sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0, B_FALSE); - return; + if (is_system_labeled()) { + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; } - cr = copycred_from_bslabel(CONN_CRED(sctp->sctp_connp), - &initlabel->tsl_label, initlabel->tsl_doi, KM_NOSLEEP); - if (cr == NULL) { - sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE); - return; + + if (connp->conn_mlp_type != mlptSingle || + connp->conn_mac_mode != CONN_MAC_DEFAULT) { + if (ira->ira_tsl == NULL) { + sctp_send_abort(sctp, sctp_init2vtag(ch), + SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0, + B_FALSE, ira); + ixa_refrele(ixa); + return; + } + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + } else { + ixa->ixa_tsl = crgetlabel(connp->conn_cred); } - iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra, - cr, pid); - crfree(cr); - } else { - iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); } + + iackmp = allocb(ipsctplen + sctps->sctps_wroff_xtra, BPRI_MED); if (iackmp == NULL) { sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE); + SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira); + ixa_refrele(ixa); return; } @@ -632,6 +662,7 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, iackiph->ipha_src = initiph->ipha_dst; iackiph->ipha_length = htons(ipsctplen + errlen); iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr_len); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { bcopy(sctp->sctp_iphc6, p, sctp->sctp_hdr6_len); iackip6h = (ip6_t *)p; @@ -639,10 +670,12 @@ 
sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, /* Copy the peer's IP addr */ iackip6h->ip6_dst = initip6h->ip6_src; iackip6h->ip6_src = initip6h->ip6_dst; - iackip6h->ip6_plen = htons(ipsctplen - sizeof (*iackip6h) + - errlen); + iackip6h->ip6_plen = htons(ipsctplen + errlen - IPV6_HDR_LEN); iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr6_len); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } + ixa->ixa_pktlen = ipsctplen + errlen; + ASSERT(OK_32PTR(iacksh)); /* Fill in the holes in the SCTP common header */ @@ -776,41 +809,58 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, iackmp->b_cont = errmp; /* OK if NULL */ - if (is_system_labeled() && (cr = msg_getcred(iackmp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - conn_t *connp = sctp->sctp_connp; - int err; - - if (isv4) - err = tsol_check_label(cr, &iackmp, - connp->conn_mac_mode, - sctps->sctps_netstack->netstack_ip, pid); - else - err = tsol_check_label_v6(cr, &iackmp, - connp->conn_mac_mode, - sctps->sctps_netstack->netstack_ip, pid); + if (is_system_labeled()) { + ts_label_t *effective_tsl = NULL; + + ASSERT(ira->ira_tsl != NULL); + + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + } + ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */ + + /* + * We need to check for label-related failures which implies + * an extra call to tsol_check_dest (as ip_output_simple + * also does a tsol_check_dest as part of computing the + * label for the packet, but ip_output_simple doesn't return + * a specific errno for that case so we can't rely on its + * check.) 
+ */ + if (isv4) { + err = tsol_check_dest(ixa->ixa_tsl, &iackiph->ipha_dst, + IPV4_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); + } else { + err = tsol_check_dest(ixa->ixa_tsl, &iackip6h->ip6_dst, + IPV6_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); + } if (err != 0) { sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE); + SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE, + ira); + ixa_refrele(ixa); freemsg(iackmp); return; } + if (effective_tsl != NULL) { + /* + * Since ip_output_simple will redo the + * tsol_check_dest, we just drop the ref. + */ + label_rele(effective_tsl); + } } - /* - * Stash the conn ptr info. for IP only as e don't have any - * cached IRE. - */ - SCTP_STASH_IPINFO(iackmp, (ire_t *)NULL); - - /* XXX sctp == sctp_g_q, so using its obchunks is valid */ BUMP_LOCAL(sctp->sctp_opkts); BUMP_LOCAL(sctp->sctp_obchunks); - /* OK to call IP_PUT() here instead of sctp_add_sendq(). 
*/ - CONN_INC_REF(sctp->sctp_connp); - iackmp->b_flag |= MSGHASREF; - IP_PUT(iackmp, sctp->sctp_connp, isv4); + (void) ip_output_simple(iackmp, ixa); + ixa_refrele(ixa); } void @@ -820,7 +870,7 @@ sctp_send_cookie_ack(sctp_t *sctp) mblk_t *camp; sctp_stack_t *sctps = sctp->sctp_sctps; - camp = sctp_make_mp(sctp, NULL, sizeof (*cach)); + camp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*cach)); if (camp == NULL) { /* XXX should abort, but don't have the inmp anymore */ SCTP_KSTAT(sctps, sctp_send_cookie_ack_failed); @@ -833,11 +883,11 @@ sctp_send_cookie_ack(sctp_t *sctp) cach->sch_flags = 0; cach->sch_len = htons(sizeof (*cach)); - sctp_set_iplen(sctp, camp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, camp); + sctp_set_iplen(sctp, camp, sctp->sctp_current->ixa); + (void) conn_ip_output(camp, sctp->sctp_current->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } static int @@ -859,7 +909,8 @@ sctp_find_al_ind(sctp_parm_hdr_t *sph, ssize_t len, uint32_t *adaptation_code) } void -sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) +sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp, + ip_recv_attr_t *ira) { mblk_t *cemp; mblk_t *mp = NULL; @@ -886,7 +937,7 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) cph = NULL; if (validate_init_params(sctp, iackch, iack, iackmp, &cph, &errmp, - &pad, &sctp_options) == 0) { /* result in 'pad' ignored */ + &pad, &sctp_options, ira) == 0) { /* result in 'pad' ignored */ BUMP_MIB(&sctps->sctps_mib, sctpAborted); sctp_assoc_event(sctp, SCTP_CANT_STR_ASSOC, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -906,8 +957,8 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) else hdrlen = sctp->sctp_hdr6_len; - cemp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + cemp = allocb(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad, + BPRI_MED); if (cemp == 
NULL) { SCTP_FADDR_TIMER_RESTART(sctp, sctp->sctp_current, sctp->sctp_current->rto); @@ -932,11 +983,13 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) * in sctp_connect(). */ sctp->sctp_current->df = B_TRUE; + sctp->sctp_ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + /* * Since IP uses this info during the fanout process, we need to hold * the lock for this hash line while performing this operation. */ - /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, sctp->sctp_ports); */ + /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, connp->conn_ports); */ ASSERT(sctp->sctp_conn_tfp != NULL); tf = sctp->sctp_conn_tfp; /* sctp isn't a listener so only need to hold conn fanout lock */ @@ -1139,14 +1192,15 @@ sendcookie: sctp->sctp_state = SCTPS_COOKIE_ECHOED; SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } int sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, sctp_init_chunk_t **iackpp, sctp_hdr_t *insctph, int *recv_adaptation, - in6_addr_t *peer_addr) + in6_addr_t *peer_addr, ip_recv_attr_t *ira) { int32_t clen; size_t initplen; @@ -1163,6 +1217,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, uint32_t *fttag; uint32_t ports; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; BUMP_LOCAL(sctp->sctp_ibchunks); /* Verify the ICV */ @@ -1232,7 +1287,8 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, staleness = TICK_TO_USEC(diff); staleness = htonl(staleness); sctp_send_abort(sctp, init->sic_inittag, SCTP_ERR_STALE_COOKIE, - (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE); + (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE, + ira); dprint(1, ("stale cookie %d\n", staleness)); @@ -1242,7 +1298,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, 
/* Check for attack by adding addresses to a restart */ bcopy(insctph, &ports, sizeof (ports)); if (sctp_secure_restart_check(cmp, initch, ports, KM_NOSLEEP, - sctps) != 1) { + sctps, ira) != 1) { return (-1); } @@ -1263,7 +1319,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("duplicate cookie from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (-1); } @@ -1292,7 +1348,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("sctp peer %x:%x:%x:%x (%d) restarted\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); /* reset parameters */ sctp_congest_reset(sctp); @@ -1320,7 +1376,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("init collision with %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (0); } else if (iack->sic_inittag != sctp->sctp_lvtag && @@ -1330,7 +1386,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, /* Section 5.2.4 case C: late COOKIE */ dprint(1, ("late cookie from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (-1); } else if (init->sic_inittag == sctp->sctp_fvtag && iack->sic_inittag == sctp->sctp_lvtag) { @@ -1341,7 +1397,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, */ dprint(1, ("cookie tags match from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); if (sctp->sctp_state < SCTPS_ESTABLISHED) { if (!sctp_initialize_params(sctp, init, iack)) return (-1); /* Drop? 
*/ @@ -1412,13 +1468,17 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich, /* * params have been put in host byteorder by * sctp_check_input() + * + * For labeled systems, there's no need to check the + * label here. It's known to be good as we checked + * before allowing the connection to become bound. */ if (ph->sph_type == PARM_ADDR4) { IN6_INADDR_TO_V4MAPPED((struct in_addr *)(ph + 1), &src); sctp = sctp_conn_match(&src, &dst, ports, zoneid, - sctps); + 0, sctps); dprint(1, ("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n", @@ -1431,7 +1491,7 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich, } else if (ph->sph_type == PARM_ADDR6) { src = *(in6_addr_t *)(ph + 1); sctp = sctp_conn_match(&src, &dst, ports, zoneid, - sctps); + 0, sctps); dprint(1, ("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n", diff --git a/usr/src/uts/common/inet/sctp/sctp_error.c b/usr/src/uts/common/inet/sctp/sctp_error.c index 02d18cf78c..293ff5bd6e 100644 --- a/usr/src/uts/common/inet/sctp/sctp_error.c +++ b/usr/src/uts/common/inet/sctp/sctp_error.c @@ -35,9 +35,11 @@ #include <netinet/in.h> #include <netinet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/mib2.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> @@ -99,6 +101,7 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) int len, hdrlen; char *cause; sctp_faddr_t *fp = sctp->sctp_current; + ip_xmit_attr_t *ixa = fp->ixa; sctp_stack_t *sctps = sctp->sctp_sctps; /* @@ -147,14 +150,15 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) freemsg(mp); return; } - sctp_set_iplen(sctp, mp); BUMP_MIB(&sctps->sctps_mib, sctpAborted); BUMP_LOCAL(sctp->sctp_opkts); BUMP_LOCAL(sctp->sctp_obchunks); - CONN_INC_REF(sctp->sctp_connp); - mp->b_flag |= MSGHASREF; - IP_PUT(mp, sctp->sctp_connp, fp->isv4); + sctp_set_iplen(sctp, mp, ixa); + ASSERT(ixa->ixa_ire != NULL); + ASSERT(ixa->ixa_cred != NULL); + 
+ (void) conn_ip_output(mp, ixa); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -165,29 +169,24 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) */ void sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, - size_t len, mblk_t *inmp, int iserror, boolean_t tbit) + size_t len, mblk_t *inmp, int iserror, boolean_t tbit, ip_recv_attr_t *ira) { mblk_t *hmp; uint32_t ip_hdr_len; ipha_t *iniph; - ipha_t *ahiph; + ipha_t *ahiph = NULL; ip6_t *inip6h; - ip6_t *ahip6h; + ip6_t *ahip6h = NULL; sctp_hdr_t *sh; sctp_hdr_t *insh; size_t ahlen; uchar_t *p; ssize_t alen; int isv4; - ire_t *ire; - irb_t *irb; - ts_label_t *tsl; - conn_t *connp; - cred_t *cr = NULL; - pid_t pid; + conn_t *connp = sctp->sctp_connp; sctp_stack_t *sctps = sctp->sctp_sctps; - ip_stack_t *ipst; + ip_xmit_attr_t *ixa; isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); if (isv4) { @@ -200,11 +199,10 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, * If this is a labeled system, then check to see if we're allowed to * send a response to this particular sender. If not, then just drop. */ - if (is_system_labeled() && !tsol_can_reply_error(inmp)) + if (is_system_labeled() && !tsol_can_reply_error(inmp, ira)) return; - hmp = allocb_cred(sctps->sctps_wroff_xtra + ahlen, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + hmp = allocb(sctps->sctps_wroff_xtra + ahlen, BPRI_MED); if (hmp == NULL) { /* XXX no resources */ return; @@ -262,75 +260,209 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, return; } + /* + * Base the transmission on any routing-related socket options + * that have been set on the listener/connection. 
+ */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + freemsg(hmp); + return; + } + ixa->ixa_flags &= ~IXAF_VERIFY_PMTU; + + ixa->ixa_pktlen = ahlen + alen; if (isv4) { - ahiph->ipha_length = htons(ahlen + alen); + ixa->ixa_flags |= IXAF_IS_IPV4; + ahiph->ipha_length = htons(ixa->ixa_pktlen); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { - ahip6h->ip6_plen = htons(alen + sizeof (*sh)); + ixa->ixa_flags &= ~IXAF_IS_IPV4; + ahip6h->ip6_plen = htons(ixa->ixa_pktlen - IPV6_HDR_LEN); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } BUMP_MIB(&sctps->sctps_mib, sctpAborted); BUMP_LOCAL(sctp->sctp_obchunks); - ipst = sctps->sctps_netstack->netstack_ip; - connp = sctp->sctp_connp; - if (is_system_labeled() && (cr = msg_getcred(inmp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - int err; - uint_t mode = connp->conn_mac_mode; + if (is_system_labeled() && ixa->ixa_tsl != NULL) { + ASSERT(ira->ira_tsl != NULL); - if (isv4) - err = tsol_check_label(cr, &hmp, mode, ipst, pid); - else - err = tsol_check_label_v6(cr, &hmp, mode, ipst, pid); - if (err != 0) { - freemsg(hmp); - return; - } + ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */ } - /* Stash the conn ptr info. for IP */ - SCTP_STASH_IPINFO(hmp, NULL); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the abort. + */ + if (!ipsec_in_to_out(ira, ixa, hmp, ahiph, ahip6h)) { + ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - CONN_INC_REF(connp); - hmp->b_flag |= MSGHASREF; - IP_PUT(hmp, connp, sctp->sctp_current == NULL ? B_TRUE : - sctp->sctp_current->isv4); - /* - * Let's just mark the IRE for this destination as temporary - * to prevent any DoS attack. - */ - tsl = cr == NULL ? 
NULL : crgetlabel(cr); - if (isv4) { - ire = ire_cache_lookup(iniph->ipha_src, sctp->sctp_zoneid, tsl, - ipst); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + ixa_refrele(ixa); + return; + } } else { - ire = ire_cache_lookup_v6(&inip6h->ip6_src, sctp->sctp_zoneid, - tsl, ipst); + ixa->ixa_flags |= IXAF_NO_IPSEC; } + + BUMP_LOCAL(sctp->sctp_opkts); + BUMP_LOCAL(sctp->sctp_obchunks); + + (void) ip_output_simple(hmp, ixa); + ixa_refrele(ixa); +} + +/* + * OOTB version of the above. + * If iserror == 0, sends an abort. If iserror != 0, sends an error. + */ +void +sctp_ootb_send_abort(uint32_t vtag, uint16_t serror, char *details, + size_t len, const mblk_t *inmp, int iserror, boolean_t tbit, + ip_recv_attr_t *ira, ip_stack_t *ipst) +{ + uint32_t ip_hdr_len; + size_t ahlen; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + sctp_hdr_t *insctph; + int i; + uint16_t port; + ssize_t alen; + int isv4; + mblk_t *mp; + netstack_t *ns = ipst->ips_netstack; + sctp_stack_t *sctps = ns->netstack_sctp; + ip_xmit_attr_t ixas; + + bzero(&ixas, sizeof (ixas)); + + isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); + ip_hdr_len = ira->ira_ip_hdr_length; + ahlen = ip_hdr_len + sizeof (sctp_hdr_t); + /* - * In the normal case the ire would be non-null, however it could be - * null, say, if IP needs to resolve the gateway for this address. We - * only care about IRE_CACHE. + * If this is a labeled system, then check to see if we're allowed to + * send a response to this particular sender. If not, then just drop. 
*/ - if (ire == NULL) + if (is_system_labeled() && !tsol_can_reply_error(inmp, ira)) return; - if (ire->ire_type != IRE_CACHE) { - ire_refrele(ire); + + mp = allocb(ahlen + sctps->sctps_wroff_xtra, BPRI_MED); + if (mp == NULL) { return; } - irb = ire->ire_bucket; - /* ire_lock is not needed, as ire_marks is protected by irb_lock */ - rw_enter(&irb->irb_lock, RW_WRITER); + mp->b_rptr += sctps->sctps_wroff_xtra; + mp->b_wptr = mp->b_rptr + ahlen; + bcopy(inmp->b_rptr, mp->b_rptr, ahlen); + /* - * Only increment the temporary IRE count if the original - * IRE is not already marked temporary. + * We follow the logic in tcp_xmit_early_reset() in that we skip + * reversing source route (i.e. replace all IP options with EOL). */ - if (!(ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb->irb_tmp_ire_cnt++; - ire->ire_marks |= IRE_MARK_TEMPORARY; + if (isv4) { + ipaddr_t v4addr; + + ipha = (ipha_t *)mp->b_rptr; + for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) + mp->b_rptr[i] = IPOPT_EOL; + /* Swap addresses */ + ipha->ipha_length = htons(ahlen); + v4addr = ipha->ipha_src; + ipha->ipha_src = ipha->ipha_dst; + ipha->ipha_dst = v4addr; + ipha->ipha_ident = 0; + ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + } else { + in6_addr_t v6addr; + + ip6h = (ip6_t *)mp->b_rptr; + /* Remove any extension headers assuming partial overlay */ + if (ip_hdr_len > IPV6_HDR_LEN) { + uint8_t *to; + + to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; + ovbcopy(ip6h, to, IPV6_HDR_LEN); + mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; + ip_hdr_len = IPV6_HDR_LEN; + ip6h = (ip6_t *)mp->b_rptr; + ip6h->ip6_nxt = IPPROTO_SCTP; + ahlen = ip_hdr_len + sizeof (sctp_hdr_t); + } + ip6h->ip6_plen = htons(ahlen - IPV6_HDR_LEN); + v6addr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = v6addr; + ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + 
ixas.ixa_flags |= IXAF_SCOPEID_SET; + ixas.ixa_scopeid = ira->ira_ruifindex; + } + } + insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len); + + /* Swap ports. Verification tag is reused. */ + port = insctph->sh_sport; + insctph->sh_sport = insctph->sh_dport; + insctph->sh_dport = port; + insctph->sh_verf = vtag; + + /* Link in the abort chunk */ + if ((alen = sctp_link_abort(mp, serror, details, len, iserror, tbit)) + < 0) { + freemsg(mp); + return; + } + + ixas.ixa_pktlen = ahlen + alen; + ixas.ixa_ip_hdr_length = ip_hdr_len; + + if (isv4) { + ipha->ipha_length = htons(ixas.ixa_pktlen); + } else { + ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN); } - rw_exit(&irb->irb_lock); - ire_refrele(ire); + + ixas.ixa_protocol = IPPROTO_SCTP; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + + BUMP_MIB(&sctps->sctps_mib, sctpAborted); + + if (is_system_labeled()) { + ASSERT(ira->ira_tsl != NULL); + + ixas.ixa_tsl = ira->ira_tsl; /* A multi-level responder */ + } + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that was out of the blue. + */ + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; + } + } else { + /* + * This is in clear. The abort message we are building + * here should go out in clear, independent of our policy. 
+ */ + ixas.ixa_flags |= IXAF_NO_IPSEC; + } + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /*ARGSUSED*/ @@ -418,8 +550,9 @@ sctp_add_err(sctp_t *sctp, uint16_t serror, void *details, size_t len, return; } sendmp->b_cont = sctp->sctp_err_chunks; - sctp_set_iplen(sctp, sendmp); - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_err_chunks = emp; sctp->sctp_err_len = emp_len; @@ -445,17 +578,20 @@ sctp_process_err(sctp_t *sctp) sctp_stack_t *sctps = sctp->sctp_sctps; mblk_t *errmp; mblk_t *sendmp; + sctp_faddr_t *fp; ASSERT(sctp->sctp_err_chunks != NULL); errmp = sctp->sctp_err_chunks; - if ((sendmp = sctp_make_mp(sctp, SCTP_CHUNK_DEST(errmp), 0)) == NULL) { + fp = SCTP_CHUNK_DEST(errmp); + if ((sendmp = sctp_make_mp(sctp, fp, 0)) == NULL) { SCTP_KSTAT(sctps, sctp_send_err_failed); freemsg(errmp); goto done; } sendmp->b_cont = errmp; - sctp_set_iplen(sctp, sendmp); - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); done: sctp->sctp_err_chunks = NULL; sctp->sctp_err_len = 0; @@ -467,7 +603,7 @@ done: */ int sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch, - mblk_t *mp) + mblk_t *mp, ip_recv_attr_t *ira) { sctp_parm_hdr_t *errh; sctp_chunk_hdr_t *uch; @@ -487,11 +623,13 @@ sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch, */ case SCTP_ERR_BAD_SID: cmn_err(CE_WARN, "BUG! send to invalid SID"); - sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0); + sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0, + ira); return (ECONNABORTED); case SCTP_ERR_NO_USR_DATA: cmn_err(CE_WARN, "BUG! 
no usr data"); - sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0); + sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0, + ira); return (ECONNABORTED); case SCTP_ERR_UNREC_CHUNK: /* Pull out the unrecognized chunk type */ diff --git a/usr/src/uts/common/inet/sctp/sctp_hash.c b/usr/src/uts/common/inet/sctp/sctp_hash.c index 289dbc04e7..b5c838d297 100644 --- a/usr/src/uts/common/inet/sctp/sctp_hash.c +++ b/usr/src/uts/common/inet/sctp/sctp_hash.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -82,7 +82,7 @@ sctp_hash_init(sctp_stack_t *sctps) } sctps->sctps_conn_fanout = (sctp_tf_t *)kmem_zalloc(sctps->sctps_conn_hash_size * - sizeof (sctp_tf_t), KM_SLEEP); + sizeof (sctp_tf_t), KM_SLEEP); for (i = 0; i < sctps->sctps_conn_hash_size; i++) { mutex_init(&sctps->sctps_conn_fanout[i].tf_lock, NULL, MUTEX_DEFAULT, NULL); @@ -129,87 +129,6 @@ sctp_hash_destroy(sctp_stack_t *sctps) } /* - * Walk the SCTP global list and refrele the ire for this ipif - * This is called when an address goes down, so that we release any reference - * to the ire associated with this address. Additionally, for any SCTP if - * this was the only/last address in its source list, we don't kill the - * assoc., if there is no address added subsequently, or if this does not - * come up, then the assoc. will die a natural death (i.e. timeout). 
- */ -void -sctp_ire_cache_flush(ipif_t *ipif) -{ - sctp_t *sctp; - sctp_t *sctp_prev = NULL; - sctp_faddr_t *fp; - conn_t *connp; - ire_t *ire; - sctp_stack_t *sctps = ipif->ipif_ill->ill_ipst-> - ips_netstack->netstack_sctp; - - sctp = sctps->sctps_gsctp; - mutex_enter(&sctps->sctps_g_lock); - while (sctp != NULL) { - mutex_enter(&sctp->sctp_reflock); - if (sctp->sctp_condemned) { - mutex_exit(&sctp->sctp_reflock); - sctp = list_next(&sctps->sctps_g_list, sctp); - continue; - } - sctp->sctp_refcnt++; - mutex_exit(&sctp->sctp_reflock); - mutex_exit(&sctps->sctps_g_lock); - if (sctp_prev != NULL) - SCTP_REFRELE(sctp_prev); - - RUN_SCTP(sctp); - connp = sctp->sctp_connp; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && ire->ire_ipif == ipif) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - } else { - mutex_exit(&connp->conn_lock); - } - /* check for ires cached in faddr */ - for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - /* - * If this ipif is being used as the source address - * we need to update it as well, else we will end - * up using the dead source address. - */ - ire = fp->ire; - if (ire != NULL && ire->ire_ipif == ipif) { - fp->ire = NULL; - IRE_REFRELE_NOTR(ire); - } - /* - * This may result in setting the fp as unreachable, - * i.e. if all the source addresses are down. In - * that case the assoc. would timeout. - */ - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &fp->saddr)) { - sctp_set_saddr(sctp, fp); - if (fp == sctp->sctp_current && - fp->state != SCTP_FADDRS_UNREACH) { - sctp_set_faddr_current(sctp, fp); - } - } - } - WAKE_SCTP(sctp); - sctp_prev = sctp; - mutex_enter(&sctps->sctps_g_lock); - sctp = list_next(&sctps->sctps_g_list, sctp); - } - mutex_exit(&sctps->sctps_g_lock); - if (sctp_prev != NULL) - SCTP_REFRELE(sctp_prev); -} - -/* * Exported routine for extracting active SCTP associations. 
* Like TCP, we terminate the walk if the callback returns non-zero. * @@ -244,9 +163,9 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), uchar_t *slist; uchar_t *flist; - sctp = sctps->sctps_gsctp; sctp_prev = NULL; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { size_t ssize; size_t fsize; @@ -282,11 +201,14 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), sctp_get_faddr_list(sctp, flist, fsize); cl_sctpi.cl_sctpi_nladdr = sctp->sctp_nsaddrs; cl_sctpi.cl_sctpi_nfaddr = sctp->sctp_nfaddrs; - cl_sctpi.cl_sctpi_family = sctp->sctp_family; - cl_sctpi.cl_sctpi_ipversion = sctp->sctp_ipversion; + cl_sctpi.cl_sctpi_family = sctp->sctp_connp->conn_family; + if (cl_sctpi.cl_sctpi_family == AF_INET) + cl_sctpi.cl_sctpi_ipversion = IPV4_VERSION; + else + cl_sctpi.cl_sctpi_ipversion = IPV6_VERSION; cl_sctpi.cl_sctpi_state = sctp->sctp_state; - cl_sctpi.cl_sctpi_lport = sctp->sctp_lport; - cl_sctpi.cl_sctpi_fport = sctp->sctp_fport; + cl_sctpi.cl_sctpi_lport = sctp->sctp_connp->conn_lport; + cl_sctpi.cl_sctpi_fport = sctp->sctp_connp->conn_fport; cl_sctpi.cl_sctpi_handle = (cl_sctp_handle_t)sctp; WAKE_SCTP(sctp); cl_sctpi.cl_sctpi_laddrp = slist; @@ -310,20 +232,26 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), sctp_t * sctp_conn_match(in6_addr_t *faddr, in6_addr_t *laddr, uint32_t ports, - zoneid_t zoneid, sctp_stack_t *sctps) + zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps) { sctp_tf_t *tf; sctp_t *sctp; sctp_faddr_t *fp; + conn_t *connp; tf = &(sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps, ports)]); mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) { - if (ports != sctp->sctp_ports || - !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) { + connp = sctp->sctp_connp; + if (ports != connp->conn_ports) + continue; + if (!(connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != 
CONN_MAC_DEFAULT) && + (iraflags & IRAF_TX_MAC_EXEMPTABLE) && + (iraflags & IRAF_TX_SHARED_ADDR)))) continue; - } /* check for faddr match */ for (fp = sctp->sctp_faddrs; fp; fp = fp->next) { @@ -351,11 +279,12 @@ done: static sctp_t * listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid, - sctp_stack_t *sctps) + iaflags_t iraflags, sctp_stack_t *sctps) { sctp_t *sctp; sctp_tf_t *tf; uint16_t lport; + conn_t *connp; lport = ((uint16_t *)&ports)[1]; @@ -363,10 +292,16 @@ listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid, mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_listen_hash_next) { - if (lport != sctp->sctp_lport || - !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) { + connp = sctp->sctp_connp; + if (lport != connp->conn_lport) + continue; + + if (!(connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (iraflags & IRAF_TX_MAC_EXEMPTABLE) && + (iraflags & IRAF_TX_SHARED_ADDR)))) continue; - } if (sctp_saddr_lookup(sctp, laddr, 0) != NULL) { SCTP_REFHOLD(sctp); @@ -383,48 +318,36 @@ done: /* called by ipsec_sctp_pol */ conn_t * sctp_find_conn(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, - zoneid_t zoneid, sctp_stack_t *sctps) + zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps) { sctp_t *sctp; - if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) { + sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) { /* Not in conn fanout; check listen fanout */ - if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL) + sctp = listen_match(dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) return (NULL); } return (sctp->sctp_connp); } +/* + * Fanout to a sctp instance. 
+ */ conn_t * sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, - zoneid_t zoneid, mblk_t *mp, sctp_stack_t *sctps) - + ip_recv_attr_t *ira, mblk_t *mp, sctp_stack_t *sctps) { + zoneid_t zoneid = ira->ira_zoneid; + iaflags_t iraflags = ira->ira_flags; sctp_t *sctp; - boolean_t shared_addr; - - if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) { - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_SCTP, - htons(ntohl(ports) & 0xFFFF)); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * That will also return ALL_ZONES on failure. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (zoneid == ALL_ZONES) - return (NULL); - } + + sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) { /* Not in conn fanout; check listen fanout */ - if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL) + sctp = listen_match(dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) return (NULL); /* * On systems running trusted extensions, check if dst @@ -432,9 +355,9 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, * that dst is in 16 byte AF_INET6 format. IPv4-mapped * IPv6 addresses are supported. */ - if (is_system_labeled() && - !tsol_receive_local(mp, dst, IPV6_VERSION, - shared_addr, sctp->sctp_connp)) { + if ((iraflags & IRAF_SYSTEM_LABELED) && + !tsol_receive_local(mp, dst, IPV6_VERSION, ira, + sctp->sctp_connp)) { DTRACE_PROBE3( tx__ip__log__info__classify__sctp, char *, @@ -444,145 +367,84 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, return (NULL); } } + /* + * For labeled systems, there's no need to check the + * label here. 
It's known to be good as we checked + * before allowing the connection to become bound. + */ return (sctp->sctp_connp); } /* - * Fanout for SCTP packets + * Fanout for ICMP errors for SCTP * The caller puts <fport, lport> in the ports parameter. */ -/* ARGSUSED */ void -ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, - uint32_t ports, uint_t flags, boolean_t mctl_present, boolean_t ip_policy, - zoneid_t zoneid) +ip_fanout_sctp(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports, + ip_recv_attr_t *ira) { - sctp_t *sctp; - boolean_t isv4; - conn_t *connp; - mblk_t *first_mp; - ip6_t *ip6h; - in6_addr_t map_src, map_dst; - in6_addr_t *src, *dst; - ip_stack_t *ipst; - ipsec_stack_t *ipss; - sctp_stack_t *sctps; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - sctps = ipst->ips_netstack->netstack_sctp; - ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } + sctp_t *sctp; + conn_t *connp; + in6_addr_t map_src, map_dst; + in6_addr_t *src, *dst; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + sctp_stack_t *sctps = ns->netstack_sctp; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; + + ASSERT(iraflags & IRAF_ICMP_ERROR); + + secure = iraflags & IRAF_IPSEC_SECURE; /* Assume IP provides aligned packets - otherwise toss */ if (!OK_32PTR(mp->b_rptr)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); return; } - if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { - ip6h = (ip6_t *)ipha; + if (!(iraflags & IRAF_IS_IPV4)) { src = &ip6h->ip6_src; dst = &ip6h->ip6_dst; - isv4 = B_FALSE; } else { - ip6h = NULL; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 
&map_dst); src = &map_src; dst = &map_dst; - isv4 = B_TRUE; } - connp = sctp_fanout(src, dst, ports, zoneid, mp, sctps); + connp = sctp_fanout(src, dst, ports, ira, mp, sctps); if (connp == NULL) { - ip_fanout_sctp_raw(first_mp, recv_ill, ipha, isv4, - ports, mctl_present, flags, ip_policy, zoneid); + ip_fanout_sctp_raw(mp, ipha, ip6h, ports, ira); return; } sctp = CONN2SCTP(connp); - /* Found a client; up it goes */ - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* * We check some fields in conn_t without holding a lock. * This should be fine. */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - if (first_mp == NULL) { - SCTP_REFRELE(sctp); - return; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); if (mp == NULL) { SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); return; - } else if (mctl_present) { - /* - * ip_process might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } } - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - int in_flags = 0; - - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); - } else { - mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); - } - if (mp == NULL) { - SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + ira->ira_ill = ira->ira_rill = NULL; mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); } else { sctp->sctp_running = B_TRUE; @@ -590,24 +452,22 @@ ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, mutex_enter(&sctp->sctp_recvq_lock); if (sctp->sctp_recvq != NULL) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_TRUE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_TRUE, ira); mutex_exit(&sctp->sctp_recvq_lock); WAKE_SCTP(sctp); } else { mutex_exit(&sctp->sctp_recvq_lock); - sctp_input_data(sctp, mp, (mctl_present ? first_mp : - NULL)); + if (ira->ira_flags & IRAF_ICMP_ERROR) { + sctp_icmp_error(sctp, mp); + } else { + sctp_input_data(sctp, mp, ira); + } WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } } SCTP_REFRELE(sctp); + ira->ira_ill = ill; + ira->ira_rill = rill; } void @@ -623,7 +483,7 @@ sctp_conn_hash_remove(sctp_t *sctp) * subsystem. 
*/ if (cl_sctp_disconnect != NULL) { - (*cl_sctp_disconnect)(sctp->sctp_family, + (*cl_sctp_disconnect)(sctp->sctp_connp->conn_family, (cl_sctp_handle_t)sctp); } @@ -683,6 +543,7 @@ void sctp_listen_hash_remove(sctp_t *sctp) { sctp_tf_t *tf = sctp->sctp_listen_tfp; + conn_t *connp = sctp->sctp_connp; if (!tf) { return; @@ -698,8 +559,8 @@ sctp_listen_hash_remove(sctp_t *sctp) ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs; slist = kmem_alloc(ssize, KM_SLEEP); sctp_get_saddr_list(sctp, slist, ssize); - (*cl_sctp_unlisten)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport); + (*cl_sctp_unlisten)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport); /* list will be freed by the clustering module */ } @@ -722,7 +583,10 @@ sctp_listen_hash_remove(sctp_t *sctp) sctp->sctp_listen_hash_next; if (sctp->sctp_listen_hash_next != NULL) { - sctp->sctp_listen_hash_next->sctp_listen_hash_prev = + sctp_t *next = sctp->sctp_listen_hash_next; + + ASSERT(next->sctp_listen_hash_prev == sctp); + next->sctp_listen_hash_prev = sctp->sctp_listen_hash_prev; } } @@ -735,6 +599,8 @@ sctp_listen_hash_remove(sctp_t *sctp) void sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp) { + conn_t *connp = sctp->sctp_connp; + if (sctp->sctp_listen_tfp) { sctp_listen_hash_remove(sctp); } @@ -759,8 +625,8 @@ sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp) ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs; slist = kmem_alloc(ssize, KM_SLEEP); sctp_get_saddr_list(sctp, slist, ssize); - (*cl_sctp_listen)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport); + (*cl_sctp_listen)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport); /* list will be freed by the clustering module */ } } @@ -850,8 +716,8 @@ sctp_lookup(sctp_t *sctp1, in6_addr_t *faddr, sctp_tf_t *tf, uint32_t *ports, for (sctp = tf->tf_sctp; sctp != NULL; sctp = sctp->sctp_conn_hash_next) { - if (*ports != sctp->sctp_ports || sctp->sctp_state < - min_state) { + if (*ports != 
sctp->sctp_connp->conn_ports || + sctp->sctp_state < min_state) { continue; } @@ -886,38 +752,3 @@ done: } return (sctp); } - -boolean_t -ip_fanout_sctp_raw_match(conn_t *connp, uint32_t ports, ipha_t *ipha) -{ - uint16_t lport; - - if (connp->conn_fully_bound) { - return (IPCL_CONN_MATCH(connp, IPPROTO_SCTP, ipha->ipha_src, - ipha->ipha_dst, ports)); - } else { - lport = htons(ntohl(ports) & 0xFFFF); - return (IPCL_BIND_MATCH(connp, IPPROTO_SCTP, ipha->ipha_dst, - lport)); - } -} - -boolean_t -ip_fanout_sctp_raw_match_v6(conn_t *connp, uint32_t ports, ip6_t *ip6h, - boolean_t for_v4) -{ - uint16_t lport; - in6_addr_t v6dst; - - if (!for_v4 && connp->conn_fully_bound) { - return (IPCL_CONN_MATCH_V6(connp, IPPROTO_SCTP, ip6h->ip6_src, - ip6h->ip6_dst, ports)); - } else { - lport = htons(ntohl(ports) & 0xFFFF); - if (for_v4) - v6dst = ipv6_all_zeros; - else - v6dst = ip6h->ip6_dst; - return (IPCL_BIND_MATCH_V6(connp, IPPROTO_SCTP, v6dst, lport)); - } -} diff --git a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c index 914f1cac3f..2fbffee1c3 100644 --- a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c +++ b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -66,8 +64,14 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp) addr = inip6h->ip6_src; } fp = sctp_lookup_faddr(sctp, &addr); - ASSERT(fp != NULL); - + /* If the source address is bogus we silently drop the packet */ + if (fp == NULL) { + dprint(1, + ("sctp_return_heartbeat: %p bogus hb from %x:%x:%x:%x\n", + (void *)sctp, SCTP_PRINTADDR(addr))); + SCTP_KSTAT(sctps, sctp_return_hb_failed); + return; + } dprint(3, ("sctp_return_heartbeat: %p got hb from %x:%x:%x:%x\n", (void *)sctp, SCTP_PRINTADDR(addr))); @@ -98,10 +102,11 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp) smp->b_wptr += len; - sctp_set_iplen(sctp, smp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, smp); + + sctp_set_iplen(sctp, smp, fp->ixa); + (void) conn_ip_output(smp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* @@ -126,10 +131,10 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) SCTP_PRINTADDR(fp->faddr), SCTP_PRINTADDR(fp->saddr))); hblen = sizeof (*cp) + - sizeof (*hpp) + - sizeof (*t) + - sizeof (fp->hb_secret) + - sizeof (fp->faddr); + sizeof (*hpp) + + sizeof (*t) + + sizeof (fp->hb_secret) + + sizeof (fp->faddr); hbmp = sctp_make_mp(sctp, fp, hblen); if (hbmp == NULL) { SCTP_KSTAT(sctps, sctp_send_hb_failed); @@ -180,8 +185,6 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) hbmp->b_wptr += hblen; - sctp_set_iplen(sctp, hbmp); - /* Update the faddr's info */ fp->lastactive = now; fp->hb_pending = B_TRUE; @@ -189,7 +192,9 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) BUMP_LOCAL(sctp->sctp_obchunks); BUMP_MIB(&sctps->sctps_mib, sctpTimHeartBeatProbe); - sctp_add_sendq(sctp, hbmp); + sctp_set_iplen(sctp, hbmp, fp->ixa); + (void) conn_ip_output(hbmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* diff --git a/usr/src/uts/common/inet/sctp/sctp_impl.h b/usr/src/uts/common/inet/sctp/sctp_impl.h index 
32268648f6..d84c3762f3 100644 --- a/usr/src/uts/common/inet/sctp/sctp_impl.h +++ b/usr/src/uts/common/inet/sctp/sctp_impl.h @@ -191,7 +191,6 @@ typedef struct sctpparam_s { #define SCTP_MAX_COMBINED_HEADER_LENGTH (60 + 12) /* Maxed out ip + sctp */ #define SCTP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH) #define SCTP_MAX_HDR_LENGTH 60 -#define ICMP_MIN_SCTP_HDR_LEN (ICMP_MIN_TP_HDR_LEN + sizeof (sctp_hdr_t)) #define SCTP_SECRET_LEN 16 @@ -213,27 +212,6 @@ typedef struct sctpparam_s { } \ } -#define SCTP_G_Q_REFHOLD(sctps) { \ - atomic_add_32(&(sctps)->sctps_g_q_ref, 1); \ - ASSERT((sctps)->sctps_g_q_ref != 0); \ - DTRACE_PROBE1(sctp__g__q__refhold, sctp_stack_t, sctps); \ -} - -/* - * Decrement the reference count on sctp_g_q - * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - */ -#define SCTP_G_Q_REFRELE(sctps) { \ - ASSERT((sctps)->sctps_g_q_ref != 0); \ - membar_exit(); \ - DTRACE_PROBE1(sctp__g__q__refrele, sctp_stack_t, sctps); \ - if (atomic_add_32_nv(&(sctps)->sctps_g_q_ref, -1) == 0) \ - sctp_g_q_inactive(sctps); \ -} - #define SCTP_PRINTADDR(a) (a).s6_addr32[0], (a).s6_addr32[1],\ (a).s6_addr32[2], (a).s6_addr32[3] @@ -399,15 +377,6 @@ extern sin6_t sctp_sin6_null; /* Zero address for quick clears */ #define SCTP_IS_DETACHED(sctp) ((sctp)->sctp_detached) -/* - * Object to represent database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. 
- * XXX These and other externs should ideally move to a SCTP header - */ -extern optdb_obj_t sctp_opt_obj; -extern uint_t sctp_max_optbuf_len; - /* Data structure used to track received TSNs */ typedef struct sctp_set_s { struct sctp_set_s *next; @@ -528,7 +497,7 @@ typedef struct sctp_faddr_s { hb_enabled : 1; mblk_t *rc_timer_mp; /* reliable control chunk timer */ - ire_t *ire; /* cached IRE */ + ip_xmit_attr_t *ixa; /* Transmit attributes */ uint32_t T3expire; /* # of times T3 timer expired */ uint64_t hb_secret; /* per addr "secret" in heartbeat */ @@ -600,25 +569,6 @@ typedef struct sctp_s { sctp_ipif_hash_t sctp_saddrs[SCTP_IPIF_HASH]; int sctp_nsaddrs; - /* - * These fields contain the same information as sctp_sctph->th_*port. - * However, the lookup functions can not use the header fields - * since during IP option manipulation the sctp_sctph pointer - * changes. - */ - union { - struct { - in_port_t sctpu_fport; /* Remote port */ - in_port_t sctpu_lport; /* Local port */ - } sctpu_ports1; - uint32_t sctpu_ports2; /* Rem port, */ - /* local port */ - /* Used for SCTP_MATCH performance */ - } sctp_sctpu; -#define sctp_fport sctp_sctpu.sctpu_ports1.sctpu_fport -#define sctp_lport sctp_sctpu.sctpu_ports1.sctpu_lport -#define sctp_ports sctp_sctpu.sctpu_ports2 - kmutex_t sctp_lock; kcondvar_t sctp_cv; boolean_t sctp_running; @@ -637,12 +587,6 @@ typedef struct sctp_s { int32_t sctp_state; conn_t *sctp_connp; /* conn_t stuff */ -#define sctp_zoneid sctp_connp->conn_zoneid -#define sctp_allzones sctp_connp->conn_allzones -#define sctp_mac_mode sctp_connp->conn_mac_mode -#define sctp_credp sctp_connp->conn_cred -#define sctp_reuseaddr sctp_connp->conn_reuseaddr - sctp_stack_t *sctp_sctps; /* Peer address tracking */ @@ -711,9 +655,6 @@ typedef struct sctp_s { uint32_t sctp_T3expire; /* # of times T3timer expired */ uint32_t sctp_assoc_start_time; /* time when assoc was est. 
*/ - /* Outbound flow control */ - int32_t sctp_xmit_hiwater; /* Send high water mark */ - int32_t sctp_xmit_lowater; /* Send low water mark */ uint32_t sctp_frwnd; /* Peer RWND */ uint32_t sctp_cwnd_max; @@ -723,8 +664,8 @@ typedef struct sctp_s { int32_t sctp_rxqueued; /* No. of bytes in RX q's */ /* Pre-initialized composite headers */ - char *sctp_iphc; /* v4 sctp/ip hdr template buffer */ - char *sctp_iphc6; /* v6 sctp/ip hdr template buffer */ + uchar_t *sctp_iphc; /* v4 sctp/ip hdr template buffer */ + uchar_t *sctp_iphc6; /* v6 sctp/ip hdr template buffer */ int32_t sctp_iphc_len; /* actual allocated v4 buffer size */ int32_t sctp_iphc6_len; /* actual allocated v6 buffer size */ @@ -754,17 +695,12 @@ typedef struct sctp_s { uint32_t sctp_understands_asconf : 1, /* Peer handles ASCONF chunks */ - sctp_debug : 1, /* SO_DEBUG "socket" option. */ sctp_cchunk_pend : 1, /* Control chunk in flight. */ - sctp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - - sctp_linger : 1, /* SO_LINGER turned on */ sctp_lingering : 1, /* Lingering in close */ sctp_loopback: 1, /* src and dst are the same machine */ - sctp_force_sack : 1, + sctp_force_sack : 1, sctp_ack_timer_running: 1, /* Delayed ACK timer running */ - sctp_recvdstaddr : 1, /* return T_EXTCONN_IND with dstaddr */ sctp_hwcksum : 1, /* The NIC is capable of hwcksum */ sctp_understands_addip : 1, @@ -802,15 +738,11 @@ typedef struct sctp_s { } sctp_events; #define sctp_priv_stream sctp_bits.sctp_priv_stream #define sctp_understands_asconf sctp_bits.sctp_understands_asconf -#define sctp_debug sctp_bits.sctp_debug #define sctp_cchunk_pend sctp_bits.sctp_cchunk_pend -#define sctp_dgram_errind sctp_bits.sctp_dgram_errind -#define sctp_linger sctp_bits.sctp_linger #define sctp_lingering sctp_bits.sctp_lingering #define sctp_loopback sctp_bits.sctp_loopback #define sctp_force_sack sctp_bits.sctp_force_sack #define sctp_ack_timer_running sctp_bits.sctp_ack_timer_running -#define sctp_recvdstaddr 
sctp_bits.sctp_recvdstaddr #define sctp_hwcksum sctp_bits.sctp_hwcksum #define sctp_understands_addip sctp_bits.sctp_understands_addip #define sctp_bound_to_all sctp_bits.sctp_bound_to_all @@ -853,15 +785,6 @@ typedef struct sctp_s { uint8_t sctp_old_secret[SCTP_SECRET_LEN]; uint32_t sctp_cookie_lifetime; /* cookie lifetime in tick */ - /* - * Address family that app wishes returned addrsses to be in. - * Currently taken from address family used in T_BIND_REQ, but - * should really come from family used in original socket() call. - * Value can be AF_INET or AF_INET6. - */ - uint_t sctp_family; - ushort_t sctp_ipversion; - /* Bind hash tables */ kmutex_t *sctp_bind_lockp; /* Ptr to tf_lock */ struct sctp_s *sctp_bind_hash; @@ -870,14 +793,10 @@ typedef struct sctp_s { /* Shutdown / cleanup */ sctp_faddr_t *sctp_shutdown_faddr; /* rotate faddr during shutd */ int32_t sctp_client_errno; /* How the client screwed up */ - int sctp_lingertime; /* Close linger time (in seconds) */ kmutex_t sctp_reflock; /* Protects sctp_refcnt & timer mp */ ushort_t sctp_refcnt; /* No. of pending upstream msg */ mblk_t *sctp_timer_mp; /* List of fired timers. 
*/ - /* Misc */ - uint_t sctp_bound_if; /* IPV6_BOUND_IF */ - mblk_t *sctp_heartbeat_mp; /* Timer block for heartbeats */ uint32_t sctp_hb_interval; /* Default hb_interval */ @@ -897,47 +816,19 @@ typedef struct sctp_s { mblk_t *sctp_recvq_tail; taskq_t *sctp_recvq_tq; - /* Send queue to IP */ - kmutex_t sctp_sendq_lock; - mblk_t *sctp_sendq; - mblk_t *sctp_sendq_tail; - boolean_t sctp_sendq_sending; - /* IPv6 ancillary data */ - uint_t sctp_ipv6_recvancillary; /* flags */ -#define SCTP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO opt */ -#define SCTP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT opt */ -#define SCTP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS opt */ -#define SCTP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS opt */ -#define SCTP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR opt */ -#define SCTP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS opt */ - uint_t sctp_recvifindex; /* last rcvd IPV6_RCVPKTINFO */ uint_t sctp_recvhops; /* " IPV6_RECVHOPLIMIT */ + uint_t sctp_recvtclass; /* " IPV6_RECVTCLASS */ ip6_hbh_t *sctp_hopopts; /* " IPV6_RECVHOPOPTS */ ip6_dest_t *sctp_dstopts; /* " IPV6_RECVDSTOPTS */ - ip6_dest_t *sctp_rtdstopts; /* " IPV6_RECVRTHDRDSTOPTS */ + ip6_dest_t *sctp_rthdrdstopts; /* " IPV6_RECVRTHDRDSTOPTS */ ip6_rthdr_t *sctp_rthdr; /* " IPV6_RECVRTHDR */ uint_t sctp_hopoptslen; uint_t sctp_dstoptslen; - uint_t sctp_rtdstoptslen; + uint_t sctp_rthdrdstoptslen; uint_t sctp_rthdrlen; - ip6_pkt_t sctp_sticky_ipp; /* Sticky options */ -#define sctp_ipp_fields sctp_sticky_ipp.ipp_fields -#define sctp_ipp_ifindex sctp_sticky_ipp.ipp_ifindex -#define sctp_ipp_addr sctp_sticky_ipp.ipp_addr -#define sctp_ipp_hoplimit sctp_sticky_ipp.ipp_hoplimit -#define sctp_ipp_hopoptslen sctp_sticky_ipp.ipp_hopoptslen -#define sctp_ipp_rtdstoptslen sctp_sticky_ipp.ipp_rtdstoptslen -#define sctp_ipp_rthdrlen sctp_sticky_ipp.ipp_rthdrlen -#define sctp_ipp_dstoptslen sctp_sticky_ipp.ipp_dstoptslen -#define sctp_ipp_hopopts sctp_sticky_ipp.ipp_hopopts -#define 
sctp_ipp_rtdstopts sctp_sticky_ipp.ipp_rtdstopts -#define sctp_ipp_rthdr sctp_sticky_ipp.ipp_rthdr -#define sctp_ipp_dstopts sctp_sticky_ipp.ipp_dstopts -#define sctp_ipp_pathmtu sctp_sticky_ipp.ipp_pathmtu -#define sctp_ipp_nexthop sctp_sticky_ipp.ipp_nexthop /* Stats */ uint64_t sctp_msgcount; uint64_t sctp_prsctpdrop; @@ -951,9 +842,6 @@ typedef struct sctp_s { mblk_t *sctp_err_chunks; /* Error chunks */ uint32_t sctp_err_len; /* Total error chunks length */ - pid_t sctp_cpid; /* Process id when this was opened */ - uint64_t sctp_open_time; /* time when this was opened */ - /* additional source data for per endpoint association statistics */ uint64_t sctp_outseqtsns; /* TSN rx > expected TSN */ uint64_t sctp_osacks; /* total sacks sent */ @@ -988,7 +876,7 @@ typedef struct sctp_s { #define SCTP_TXQ_LEN(sctp) ((sctp)->sctp_unsent + (sctp)->sctp_unacked) #define SCTP_TXQ_UPDATE(sctp) \ if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <= \ - (sctp)->sctp_xmit_lowater) { \ + (sctp)->sctp_connp->conn_sndlowat) { \ (sctp)->sctp_txq_full = 0; \ (sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd, \ B_FALSE); \ @@ -1004,8 +892,8 @@ extern void sctp_add_err(sctp_t *, uint16_t, void *, size_t, extern int sctp_add_faddr(sctp_t *, in6_addr_t *, int, boolean_t); extern boolean_t sctp_add_ftsn_set(sctp_ftsn_set_t **, sctp_faddr_t *, mblk_t *, uint_t *, uint32_t *); -extern boolean_t sctp_add_recvq(sctp_t *, mblk_t *, boolean_t); -extern void sctp_add_sendq(sctp_t *, mblk_t *); +extern void sctp_add_recvq(sctp_t *, mblk_t *, boolean_t, + ip_recv_attr_t *); extern void sctp_add_unrec_parm(sctp_parm_hdr_t *, mblk_t **, boolean_t); extern size_t sctp_addr_params(sctp_t *, int, uchar_t *, boolean_t); extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int, @@ -1013,7 +901,6 @@ extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int, extern void sctp_addr_req(sctp_t *, mblk_t *); extern sctp_t *sctp_addrlist2sctp(mblk_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, 
zoneid_t, sctp_stack_t *); -extern void sctp_add_hdr(sctp_t *, uchar_t *, size_t); extern void sctp_check_adv_ack_pt(sctp_t *, mblk_t *, mblk_t *); extern void sctp_assoc_event(sctp_t *, uint16_t, uint16_t, sctp_chunk_hdr_t *); @@ -1024,7 +911,7 @@ extern int sctp_bindi(sctp_t *, in_port_t, boolean_t, int, in_port_t *); extern int sctp_bind_add(sctp_t *, const void *, uint32_t, boolean_t, in_port_t); extern int sctp_bind_del(sctp_t *, const void *, uint32_t, boolean_t); -extern int sctp_build_hdrs(sctp_t *); +extern int sctp_build_hdrs(sctp_t *, int); extern int sctp_check_abandoned_msg(sctp_t *, mblk_t *); extern void sctp_clean_death(sctp_t *, int); @@ -1035,11 +922,9 @@ extern void sctp_conn_hash_insert(sctp_tf_t *, sctp_t *, int); extern void sctp_conn_hash_remove(sctp_t *); extern void sctp_conn_init(conn_t *); extern sctp_t *sctp_conn_match(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, sctp_stack_t *); + zoneid_t, iaflags_t, sctp_stack_t *); extern sctp_t *sctp_conn_request(sctp_t *, mblk_t *, uint_t, uint_t, - sctp_init_chunk_t *, mblk_t *); -extern int sctp_conprim_opt_process(queue_t *, mblk_t *, int *, int *, - int *); + sctp_init_chunk_t *, ip_recv_attr_t *); extern uint32_t sctp_cumack(sctp_t *, uint32_t, mblk_t **); extern sctp_t *sctp_create_eager(sctp_t *); @@ -1066,10 +951,9 @@ extern void sctp_ftsn_sets_init(void); extern int sctp_get_addrlist(sctp_t *, const void *, uint32_t *, uchar_t **, int *, size_t *); -extern void sctp_g_q_inactive(sctp_stack_t *); extern int sctp_get_addrparams(sctp_t *, sctp_t *, mblk_t *, sctp_chunk_hdr_t *, uint_t *); -extern void sctp_get_ire(sctp_t *, sctp_faddr_t *); +extern void sctp_get_dest(sctp_t *, sctp_faddr_t *); extern void sctp_get_faddr_list(sctp_t *, uchar_t *, size_t); extern mblk_t *sctp_get_first_sent(sctp_t *); extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *, @@ -1077,22 +961,20 @@ extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *, extern void 
sctp_get_saddr_list(sctp_t *, uchar_t *, size_t); extern int sctp_handle_error(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, - mblk_t *); + mblk_t *, ip_recv_attr_t *); extern void sctp_hash_destroy(sctp_stack_t *); extern void sctp_hash_init(sctp_stack_t *); -extern int sctp_header_init_ipv4(sctp_t *, int); -extern int sctp_header_init_ipv6(sctp_t *, int); extern void sctp_heartbeat_timer(sctp_t *); extern void sctp_icmp_error(sctp_t *, mblk_t *); extern void sctp_inc_taskq(sctp_stack_t *); extern void sctp_info_req(sctp_t *, mblk_t *); -extern mblk_t *sctp_init_mp(sctp_t *); +extern mblk_t *sctp_init_mp(sctp_t *, sctp_faddr_t *); extern boolean_t sctp_initialize_params(sctp_t *, sctp_init_chunk_t *, sctp_init_chunk_t *); extern uint32_t sctp_init2vtag(sctp_chunk_hdr_t *); extern void sctp_intf_event(sctp_t *, in6_addr_t, int, int); -extern void sctp_input_data(sctp_t *, mblk_t *, mblk_t *); +extern void sctp_input_data(sctp_t *, mblk_t *, ip_recv_attr_t *); extern void sctp_instream_cleanup(sctp_t *, boolean_t); extern int sctp_is_a_faddr_clean(sctp_t *); @@ -1124,7 +1006,8 @@ extern int sctp_nd_getset(queue_t *, MBLKP); extern boolean_t sctp_nd_init(sctp_stack_t *); extern sctp_parm_hdr_t *sctp_next_parm(sctp_parm_hdr_t *, ssize_t *); -extern void sctp_ootb_shutdown_ack(sctp_t *, mblk_t *, uint_t); +extern void sctp_ootb_shutdown_ack(mblk_t *, uint_t, ip_recv_attr_t *, + ip_stack_t *); extern size_t sctp_options_param(const sctp_t *, void *, int); extern size_t sctp_options_param_len(const sctp_t *, int); extern void sctp_output(sctp_t *, uint_t); @@ -1132,10 +1015,10 @@ extern void sctp_output(sctp_t *, uint_t); extern boolean_t sctp_param_register(IDP *, sctpparam_t *, int, sctp_stack_t *); extern void sctp_partial_delivery_event(sctp_t *); extern int sctp_process_cookie(sctp_t *, sctp_chunk_hdr_t *, mblk_t *, - sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *); + sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *, + ip_recv_attr_t *); extern void 
sctp_process_err(sctp_t *); extern void sctp_process_heartbeat(sctp_t *, sctp_chunk_hdr_t *); -extern void sctp_process_sendq(sctp_t *); extern void sctp_process_timer(sctp_t *); extern void sctp_redo_faddr_srcs(sctp_t *); @@ -1149,13 +1032,17 @@ extern sctp_faddr_t *sctp_rotate_faddr(sctp_t *, sctp_faddr_t *); extern boolean_t sctp_sack(sctp_t *, mblk_t *); extern int sctp_secure_restart_check(mblk_t *, sctp_chunk_hdr_t *, - uint32_t, int, sctp_stack_t *); + uint32_t, int, sctp_stack_t *, ip_recv_attr_t *); extern void sctp_send_abort(sctp_t *, uint32_t, uint16_t, char *, size_t, - mblk_t *, int, boolean_t); + mblk_t *, int, boolean_t, ip_recv_attr_t *); +extern void sctp_ootb_send_abort(uint32_t, uint16_t, char *, size_t, + const mblk_t *, int, boolean_t, ip_recv_attr_t *, + ip_stack_t *); extern void sctp_send_cookie_ack(sctp_t *); -extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *); +extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *, + ip_recv_attr_t *); extern void sctp_send_initack(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, - mblk_t *); + mblk_t *, ip_recv_attr_t *); extern void sctp_send_shutdown(sctp_t *, int); extern void sctp_send_heartbeat(sctp_t *, sctp_faddr_t *); extern void sctp_sendfail_event(sctp_t *, mblk_t *, int, boolean_t); @@ -1170,7 +1057,7 @@ extern int sctp_shutdown_received(sctp_t *, sctp_chunk_hdr_t *, boolean_t, boolean_t, sctp_faddr_t *); extern void sctp_shutdown_complete(sctp_t *); extern void sctp_set_if_mtu(sctp_t *); -extern void sctp_set_iplen(sctp_t *, mblk_t *); +extern void sctp_set_iplen(sctp_t *, mblk_t *, ip_xmit_attr_t *); extern void sctp_set_ulp_prop(sctp_t *); extern void sctp_ss_rexmit(sctp_t *); extern size_t sctp_supaddr_param_len(sctp_t *); @@ -1183,7 +1070,7 @@ extern void sctp_timer_free(mblk_t *); extern void sctp_timer_stop(mblk_t *); extern void sctp_unlink_faddr(sctp_t *, sctp_faddr_t *); -extern void sctp_update_ire(sctp_t *sctp); +extern void 
sctp_update_dce(sctp_t *sctp); extern in_port_t sctp_update_next_port(in_port_t, zone_t *zone, sctp_stack_t *); extern void sctp_update_rtt(sctp_t *, sctp_faddr_t *, clock_t); extern void sctp_user_abort(sctp_t *, mblk_t *); @@ -1209,17 +1096,6 @@ extern void (*cl_sctp_assoc_change)(sa_family_t, uchar_t *, size_t, uint_t, extern void (*cl_sctp_check_addrs)(sa_family_t, in_port_t, uchar_t **, size_t, uint_t *, boolean_t); -/* Send a mp to IP. */ -#define IP_PUT(mp, conn, isv4) \ -{ \ - sctp_stack_t *sctps = conn->conn_netstack->netstack_sctp; \ - \ - if ((isv4)) \ - ip_output((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT); \ - else \ - ip_output_v6((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT);\ -} - #define RUN_SCTP(sctp) \ { \ mutex_enter(&(sctp)->sctp_lock); \ diff --git a/usr/src/uts/common/inet/sctp/sctp_init.c b/usr/src/uts/common/inet/sctp/sctp_init.c index 5547609c98..ff34147a65 100644 --- a/usr/src/uts/common/inet/sctp/sctp_init.c +++ b/usr/src/uts/common/inet/sctp/sctp_init.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/ddi.h> @@ -45,32 +43,6 @@ #include "sctp_impl.h" #include "sctp_addr.h" -/* - * This will compute the checksum over the SCTP packet, so this - * function should only be called after the whole packet has been - * built. - * - * rptr should point to the IP / SCTP composite header. - * len should be the length of the entire packet, including the IP - * header. 
- */ -void -sctp_add_hdr(sctp_t *sctp, uchar_t *rptr, size_t len) -{ - ipha_t *iphdr; - short iplen; - - ASSERT(len >= sctp->sctp_hdr_len); - - /* Copy the common header from the template */ - bcopy(sctp->sctp_iphc, rptr, sctp->sctp_hdr_len); - - /* Set the total length in the IP hdr */ - iplen = (short)len; - iphdr = (ipha_t *)rptr; - U16_TO_ABE16(iplen, &iphdr->ipha_length); -} - /*ARGSUSED*/ size_t sctp_supaddr_param_len(sctp_t *sctp) @@ -83,17 +55,18 @@ sctp_supaddr_param(sctp_t *sctp, uchar_t *p) { sctp_parm_hdr_t *sph; uint16_t *addrtype; + conn_t *connp = sctp->sctp_connp; sph = (sctp_parm_hdr_t *)p; sph->sph_type = htons(PARM_SUPP_ADDRS); addrtype = (uint16_t *)(sph + 1); - switch (sctp->sctp_ipversion) { - case IPV4_VERSION: + switch (connp->conn_family) { + case AF_INET: *addrtype++ = htons(PARM_ADDR4); *addrtype = 0; sph->sph_len = htons(sizeof (*sph) + sizeof (*addrtype)); break; - case IPV6_VERSION: + case AF_INET6: *addrtype++ = htons(PARM_ADDR6); if (!sctp->sctp_connp->conn_ipv6_v6only) { *addrtype = htons(PARM_ADDR4); @@ -167,7 +140,7 @@ sctp_adaptation_code_param(sctp_t *sctp, uchar_t *p) } mblk_t * -sctp_init_mp(sctp_t *sctp) +sctp_init_mp(sctp_t *sctp, sctp_faddr_t *fp) { mblk_t *mp; uchar_t *p; @@ -176,12 +149,12 @@ sctp_init_mp(sctp_t *sctp) sctp_chunk_hdr_t *chp; uint16_t schlen; int supp_af; - sctp_stack_t *sctps = sctp->sctp_sctps; + sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { supp_af = PARM_SUPP_V4; } else { - /* Assume here that a v6 endpoint supports v4 address. 
*/ if (sctp->sctp_connp->conn_ipv6_v6only) supp_af = PARM_SUPP_V6; else @@ -203,11 +176,17 @@ sctp_init_mp(sctp_t *sctp) sctp->sctp_sctph->sh_verf = 0; sctp->sctp_sctph6->sh_verf = 0; - mp = sctp_make_mp(sctp, NULL, initlen); + mp = sctp_make_mp(sctp, fp, initlen); if (mp == NULL) { SCTP_KSTAT(sctps, sctp_send_init_failed); return (NULL); } + /* sctp_make_mp could have discovered we have no usable sources */ + if (sctp->sctp_nsaddrs == 0) { + freemsg(mp); + SCTP_KSTAT(sctps, sctp_send_init_failed); + return (NULL); + } /* Lay in a new INIT chunk, starting with the chunk header */ chp = (sctp_chunk_hdr_t *)mp->b_wptr; @@ -242,7 +221,7 @@ sctp_init_mp(sctp_t *sctp) BUMP_LOCAL(sctp->sctp_obchunks); - sctp_set_iplen(sctp, mp); + sctp_set_iplen(sctp, mp, fp->ixa); return (mp); } diff --git a/usr/src/uts/common/inet/sctp/sctp_input.c b/usr/src/uts/common/inet/sctp/sctp_input.c index e18bfeacdd..e4a5ef5c5b 100644 --- a/usr/src/uts/common/inet/sctp/sctp_input.c +++ b/usr/src/uts/common/inet/sctp/sctp_input.c @@ -42,6 +42,7 @@ #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_if.h> #include <inet/ip6.h> #include <inet/mib2.h> #include <inet/ipclassifier.h> @@ -318,7 +319,7 @@ sctp_next_chunk(sctp_chunk_hdr_t *ch, ssize_t *remaining) */ static int sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, - sctp_faddr_t *fp, ip6_pkt_t *ipp) + sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira) { struct T_unitdata_ind *tudi; int optlen; @@ -329,57 +330,61 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, struct sockaddr_in6 sin_buf[1]; struct sockaddr_in6 *sin6; struct sockaddr_in *sin4; - uint_t addflag = 0; + crb_t addflag; /* Which pieces to add */ + conn_t *connp = sctp->sctp_connp; sin4 = NULL; sin6 = NULL; optlen = hdrlen = 0; + addflag.crb_all = 0; /* Figure out address size */ - if (sctp->sctp_ipversion == IPV4_VERSION) { + if (connp->conn_family == AF_INET) { sin4 = (struct sockaddr_in *)sin_buf; 
sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr); hdrlen = sizeof (*tudi) + sizeof (*sin4); } else { sin6 = sin_buf; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; hdrlen = sizeof (*tudi) + sizeof (*sin6); } - /* If app asked to receive send / recv info */ - if (sctp->sctp_recvsndrcvinfo) { + if (sctp->sctp_recvsndrcvinfo) optlen += sizeof (*cmsg) + sizeof (struct sctp_sndrcvinfo); - if (hdrlen == 0) - hdrlen = sizeof (struct T_optdata_ind); - } - if (sctp->sctp_ipv6_recvancillary == 0) + if (connp->conn_recv_ancillary.crb_all == 0) goto noancillary; - if ((ipp->ipp_fields & IPPF_IFINDEX) && - ipp->ipp_ifindex != sctp->sctp_recvifindex && - (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVPKTINFO)) { + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != sctp->sctp_recvifindex) { optlen += sizeof (*cmsg) + sizeof (struct in6_pktinfo); if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVPKTINFO; + addflag.crb_ip_recvpktinfo = 1; } /* If app asked for hoplimit and it has changed ... */ - if ((ipp->ipp_fields & IPPF_HOPLIMIT) && - ipp->ipp_hoplimit != sctp->sctp_recvhops && - (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPLIMIT)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != sctp->sctp_recvhops) { optlen += sizeof (*cmsg) + sizeof (uint_t); if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVHOPLIMIT; + addflag.crb_ipv6_recvhoplimit = 1; + } + /* If app asked for tclass and it has changed ... 
*/ + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != sctp->sctp_recvtclass) { + optlen += sizeof (struct T_opthdr) + sizeof (uint_t); + if (hdrlen == 0) + hdrlen = sizeof (struct T_unitdata_ind); + addflag.crb_ipv6_recvtclass = 1; } /* If app asked for hopbyhop headers and it has changed ... */ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && ip_cmpbuf(sctp->sctp_hopopts, sctp->sctp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { @@ -387,7 +392,7 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, sctp->sctp_v6label_len; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVHOPOPTS; + addflag.crb_ipv6_recvhopopts = 1; if (!ip_allocbuf((void **)&sctp->sctp_hopopts, &sctp->sctp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), @@ -395,45 +400,44 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, return (-1); } /* If app asked for dst headers before routing headers ... 
*/ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTDSTOPTS) && - ip_cmpbuf(sctp->sctp_rtdstopts, sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { - optlen += sizeof (*cmsg) + ipp->ipp_rtdstoptslen; + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(sctp->sctp_rthdrdstopts, sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { + optlen += sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVRTDSTOPTS; - if (!ip_allocbuf((void **)&sctp->sctp_rtdstopts, - &sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&sctp->sctp_rthdrdstopts, + &sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) return (-1); } /* If app asked for routing headers and it has changed ... 
*/ - if (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTHDR) { - if (ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen, + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && + ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen, + (ipp->ipp_fields & IPPF_RTHDR), + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { + optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen; + if (hdrlen == 0) + hdrlen = sizeof (struct T_unitdata_ind); + addflag.crb_ipv6_recvrthdr = 1; + if (!ip_allocbuf((void **)&sctp->sctp_rthdr, + &sctp->sctp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { - optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen; - if (hdrlen == 0) - hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVRTHDR; - if (!ip_allocbuf((void **)&sctp->sctp_rthdr, - &sctp->sctp_rthdrlen, - (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) - return (-1); - } + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) + return (-1); } /* If app asked for dest headers and it has changed ... */ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVDSTOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvdstopts && ip_cmpbuf(sctp->sctp_dstopts, sctp->sctp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { optlen += sizeof (*cmsg) + ipp->ipp_dstoptslen; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVDSTOPTS; + addflag.crb_ipv6_recvdstopts = 1; if (!ip_allocbuf((void **)&sctp->sctp_dstopts, &sctp->sctp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), @@ -499,9 +503,11 @@ noancillary: * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. 
*/ - if (addflag & SCTP_IPV6_RECVPKTINFO) { + if (addflag.crb_ip_recvpktinfo) { struct in6_pktinfo *pkti; + uint_t ifindex; + ifindex = ira->ira_ruifindex; cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_PKTINFO; @@ -509,19 +515,20 @@ noancillary: optptr += sizeof (*cmsg); pkti = (struct in6_pktinfo *)optptr; - if (sctp->sctp_ipversion == IPV6_VERSION) + if (connp->conn_family == AF_INET6) pkti->ipi6_addr = sctp->sctp_ip6h->ip6_src; else IN6_IPADDR_TO_V4MAPPED(sctp->sctp_ipha->ipha_src, &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp->ipp_ifindex; + + pkti->ipi6_ifindex = ifindex; optptr += sizeof (*pkti); ASSERT(OK_32PTR(optptr)); /* Save as "last" value */ - sctp->sctp_recvifindex = ipp->ipp_ifindex; + sctp->sctp_recvifindex = ifindex; } /* If app asked for hoplimit and it has changed ... */ - if (addflag & SCTP_IPV6_RECVHOPLIMIT) { + if (addflag.crb_ipv6_recvhoplimit) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_HOPLIMIT; @@ -534,7 +541,21 @@ noancillary: /* Save as "last" value */ sctp->sctp_recvhops = ipp->ipp_hoplimit; } - if (addflag & SCTP_IPV6_RECVHOPOPTS) { + /* If app asked for tclass and it has changed ... 
*/ + if (addflag.crb_ipv6_recvtclass) { + cmsg = (struct cmsghdr *)optptr; + cmsg->cmsg_level = IPPROTO_IPV6; + cmsg->cmsg_type = IPV6_TCLASS; + cmsg->cmsg_len = sizeof (*cmsg) + sizeof (uint_t); + optptr += sizeof (*cmsg); + + *(uint_t *)optptr = ipp->ipp_tclass; + optptr += sizeof (uint_t); + ASSERT(OK_32PTR(optptr)); + /* Save as "last" value */ + sctp->sctp_recvtclass = ipp->ipp_tclass; + } + if (addflag.crb_ipv6_recvhopopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_HOPOPTS; @@ -550,23 +571,23 @@ noancillary: (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen); } - if (addflag & SCTP_IPV6_RECVRTDSTOPTS) { + if (addflag.crb_ipv6_recvrthdrdstopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_RTHDRDSTOPTS; - cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rtdstoptslen; + cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen; optptr += sizeof (*cmsg); - bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); - optptr += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; ASSERT(OK_32PTR(optptr)); /* Save as last value */ - ip_savebuf((void **)&sctp->sctp_rtdstopts, - &sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); + ip_savebuf((void **)&sctp->sctp_rthdrdstopts, + &sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); } - if (addflag & SCTP_IPV6_RECVRTHDR) { + if (addflag.crb_ipv6_recvrthdr) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_RTHDR; @@ -582,7 +603,7 @@ noancillary: (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen); } - if (addflag & SCTP_IPV6_RECVDSTOPTS) { + if (addflag.crb_ipv6_recvdstopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = 
IPV6_DSTOPTS; @@ -778,7 +799,6 @@ static mblk_t * sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp, sctp_data_hdr_t **dc) { - mblk_t *first_mp; mblk_t *mp; mblk_t *dmp; mblk_t *qmp; @@ -791,8 +811,7 @@ sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp, dprint(4, ("trypartial: got=%d, needed=%d\n", (int)(srp->got), (int)(srp->needed))); - first_mp = hmp->b_cont; - mp = first_mp; + mp = hmp->b_cont; qdc = (sctp_data_hdr_t *)mp->b_rptr; ASSERT(SCTP_DATA_GET_BBIT(qdc) && srp->hasBchunk); @@ -1175,7 +1194,7 @@ sctp_add_dup(uint32_t tsn, mblk_t **dups) static void sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, - sctp_faddr_t *fp, ip6_pkt_t *ipp) + sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira) { sctp_data_hdr_t *dc; mblk_t *dmp, *pmp; @@ -1419,7 +1438,8 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, if (can_deliver) { dmp->b_rptr = (uchar_t *)(dc + 1); - if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, ipp) == 0) { + if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, + ipp, ira) == 0) { dprint(1, ("sctp_data_chunk: delivering %lu bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; @@ -1507,7 +1527,7 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, if (can_deliver) { dmp->b_rptr = (uchar_t *)(dc + 1); if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, - ipp) == 0) { + ipp, ira) == 0) { dprint(1, ("sctp_data_chunk: delivering %lu " "bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; @@ -1646,6 +1666,8 @@ sctp_make_sack(sctp_t *sctp, sctp_faddr_t *sendto, mblk_t *dups) uint32_t dups_len; sctp_faddr_t *fp; + ASSERT(sendto != NULL); + if (sctp->sctp_force_sack) { sctp->sctp_force_sack = 0; goto checks_done; @@ -1696,8 +1718,9 @@ checks_done: return (NULL); } smp->b_cont = sctp->sctp_err_chunks; - sctp_set_iplen(sctp, smp); - sctp_add_sendq(sctp, smp); + sctp_set_iplen(sctp, smp, fp->ixa); + (void) conn_ip_output(smp, fp->ixa); + 
BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_err_chunks = NULL; sctp->sctp_err_len = 0; } @@ -1749,8 +1772,6 @@ sctp_sack(sctp_t *sctp, mblk_t *dups) freeb(dups); return (B_FALSE); } - sctp_set_iplen(sctp, smp); - dprint(2, ("sctp_sack: sending to %p %x:%x:%x:%x\n", (void *)sctp->sctp_lastdata, SCTP_PRINTADDR(sctp->sctp_lastdata->faddr))); @@ -1758,7 +1779,10 @@ sctp_sack(sctp_t *sctp, mblk_t *dups) sctp->sctp_active = lbolt64; BUMP_MIB(&sctps->sctps_mib, sctpOutAck); - sctp_add_sendq(sctp, smp); + + sctp_set_iplen(sctp, smp, sctp->sctp_lastdata->ixa); + (void) conn_ip_output(smp, sctp->sctp_lastdata->ixa); + BUMP_LOCAL(sctp->sctp_opkts); return (B_TRUE); } @@ -1813,8 +1837,9 @@ sctp_check_abandoned_msg(sctp_t *sctp, mblk_t *meta) return (ENOMEM); } SCTP_MSG_SET_ABANDONED(meta); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); mp1 = mp1->b_next; @@ -2080,13 +2105,13 @@ sctp_ftsn_check_frag(sctp_t *sctp, uint16_t ssn, sctp_instr_t *sip) * messages, if any, from the instream queue (that were waiting for this * sid-ssn message to show up). Once we are done try to update the SACK * info. We could get a duplicate Forward TSN, in which case just send - * a SACK. If any of the sid values in the the Forward TSN is invalid, + * a SACK. If any of the sid values in the Forward TSN is invalid, * send back an "Invalid Stream Identifier" error and continue processing * the rest. 
*/ static void sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, - ip6_pkt_t *ipp) + ip_pkt_t *ipp, ip_recv_attr_t *ira) { uint32_t *ftsn = (uint32_t *)(ch + 1); ftsn_entry_t *ftsn_entry; @@ -2171,7 +2196,7 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, dmp->b_next = NULL; ASSERT(dmp->b_prev == NULL); if (sctp_input_add_ancillary(sctp, - &dmp, dc, fp, ipp) == 0) { + &dmp, dc, fp, ipp, ira) == 0) { sctp->sctp_rxqueued -= dlen; sctp->sctp_rwnd -= dlen; /* @@ -2280,8 +2305,9 @@ sctp_check_abandoned_data(sctp_t *sctp, sctp_faddr_t *fp) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); return; } - sctp_set_iplen(sctp, nmp); - sctp_add_sendq(sctp, nmp); + sctp_set_iplen(sctp, nmp, fp->ixa); + (void) conn_ip_output(nmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); } @@ -2604,8 +2630,9 @@ sctp_got_sack(sctp_t *sctp, sctp_chunk_hdr_t *sch) sctp->sctp_zero_win_probe = B_FALSE; sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn; sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn; - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } } else { if (sctp->sctp_zero_win_probe) { @@ -3160,97 +3187,15 @@ sctp_check_input(sctp_t *sctp, sctp_chunk_hdr_t *ch, ssize_t len, int first) return (1); } -/* ARGSUSED */ -static sctp_hdr_t * -find_sctp_hdrs(mblk_t *mp, in6_addr_t *src, in6_addr_t *dst, - uint_t *ifindex, uint_t *ip_hdr_len, ip6_pkt_t *ipp, ip_pktinfo_t *pinfo) -{ - uchar_t *rptr; - ipha_t *ip4h; - ip6_t *ip6h; - mblk_t *mp1; - - rptr = mp->b_rptr; - if (IPH_HDR_VERSION(rptr) == IPV4_VERSION) { - *ip_hdr_len = IPH_HDR_LENGTH(rptr); - ip4h = (ipha_t *)rptr; - IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_src, src); - IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_dst, dst); - - ipp->ipp_fields |= IPPF_HOPLIMIT; - ipp->ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; - if (pinfo != NULL && 
(pinfo->ip_pkt_flags & IPF_RECVIF)) { - ipp->ipp_fields |= IPPF_IFINDEX; - ipp->ipp_ifindex = pinfo->ip_pkt_ifindex; - } - } else { - ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); - ip6h = (ip6_t *)rptr; - ipp->ipp_fields = IPPF_HOPLIMIT; - ipp->ipp_hoplimit = ip6h->ip6_hops; - - if (ip6h->ip6_nxt != IPPROTO_SCTP) { - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i = (ip6i_t *)ip6h; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp->ipp_fields |= IPPF_IFINDEX; - ipp->ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - ASSERT(mp->b_wptr - rptr >= - IPV6_HDR_LEN + sizeof (sctp_hdr_t)); - ip6h = (ip6_t *)rptr; - } - /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. - */ - *ip_hdr_len = ip_find_hdr_v6(mp, ip6h, ipp, NULL); - } else { - *ip_hdr_len = IPV6_HDR_LEN; - } - *src = ip6h->ip6_src; - *dst = ip6h->ip6_dst; - } - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - return ((sctp_hdr_t *)&rptr[*ip_hdr_len]); -#undef IPVER -} - static mblk_t * -sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp) +sctp_check_in_policy(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - ipsec_in_t *ii; - boolean_t check = B_TRUE; boolean_t policy_present; ipha_t *ipha; ip6_t *ip6h; - netstack_t *ns; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ns = ii->ipsec_in_ns; - ipss = ns->netstack_ipsec; - - if (ii->ipsec_in_dont_check) { - check = B_FALSE; - if (!ii->ipsec_in_secure) { - freeb(ipsec_mp); - ipsec_mp = NULL; - } - } + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t 
*)mp->b_rptr; @@ -3261,109 +3206,88 @@ sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp) ip6h = (ip6_t *)mp->b_rptr; } - if (check && policy_present) { + if (policy_present) { /* * The conn_t parameter is NULL because we already know * nobody's home. */ - ipsec_mp = ipsec_check_global_policy(ipsec_mp, (conn_t *)NULL, - ipha, ip6h, B_TRUE, ns); - if (ipsec_mp == NULL) + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, + ipha, ip6h, ira, ns); + if (mp == NULL) return (NULL); } - if (ipsec_mp != NULL) - freeb(ipsec_mp); return (mp); } /* Handle out-of-the-blue packets */ void -sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, - boolean_t mctl_present) +sctp_ootb_input(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { sctp_t *sctp; sctp_chunk_hdr_t *ch; sctp_hdr_t *sctph; in6_addr_t src, dst; - uint_t ip_hdr_len; - uint_t ifindex; - ip6_pkt_t ipp; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; ssize_t mlen; - ip_pktinfo_t *pinfo = NULL; - mblk_t *first_mp; sctp_stack_t *sctps; - ip_stack_t *ipst; + boolean_t secure; + zoneid_t zoneid = ira->ira_zoneid; + uchar_t *rptr; + + ASSERT(ira->ira_ill == NULL); + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; sctps = ipst->ips_netstack->netstack_sctp; BUMP_MIB(&sctps->sctps_mib, sctpOutOfBlue); BUMP_MIB(&sctps->sctps_mib, sctpInSCTPPkts); - if (sctps->sctps_gsctp == NULL) { - /* - * For non-zero stackids the default queue isn't created - * until the first open, thus there can be a need to send - * an error before then. But we can't do that, hence we just - * drop the packet. Later during boot, when the default queue - * has been setup, a retransmitted packet from the peer - * will result in a error. - */ - ASSERT(sctps->sctps_netstack->netstack_stackid != - GLOBAL_NETSTACKID); - freemsg(mp); - return; - } - - first_mp = mp; - if (mctl_present) - mp = mp->b_cont; - - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); - return; - } - } - if (mp->b_cont != NULL) { /* * All subsequent code is vastly simplified if it can * assume a single contiguous chunk of data. */ if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); return; } } - /* - * We don't really need to call this function... Need to - * optimize later. - */ - sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len, - &ipp, pinfo); + rptr = mp->b_rptr; + sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]); + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)rptr; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst); + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)rptr; + src = ip6h->ip6_src; + dst = ip6h->ip6_dst; + } + mlen = mp->b_wptr - (uchar_t *)(sctph + 1); if ((ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen)) == NULL) { dprint(3, ("sctp_ootb_input: invalid packet\n")); - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); return; } switch (ch->sch_id) { case CHUNK_INIT: /* no listener; send abort */ - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_send_abort(sctps->sctps_gsctp, sctp_init2vtag(ch), 0, - NULL, 0, mp, 0, B_TRUE); + sctp_ootb_send_abort(sctp_init2vtag(ch), 0, + NULL, 0, mp, 0, B_TRUE, ira, ipst); break; case CHUNK_INIT_ACK: /* check for changed src addr */ @@ -3372,11 +3296,7 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, /* success; proceed to normal path 
*/ mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); } else { /* @@ -3387,152 +3307,101 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, */ sctp->sctp_running = B_TRUE; mutex_exit(&sctp->sctp_lock); - sctp_input_data(sctp, mp, NULL); + sctp_input_data(sctp, mp, ira); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } SCTP_REFRELE(sctp); return; } - if (mctl_present) - freeb(first_mp); /* else bogus init ack; drop it */ break; case CHUNK_SHUTDOWN_ACK: - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_ootb_shutdown_ack(sctps->sctps_gsctp, mp, ip_hdr_len); - sctp_process_sendq(sctps->sctps_gsctp); + sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst); return; case CHUNK_ERROR: case CHUNK_ABORT: case CHUNK_COOKIE_ACK: case CHUNK_SHUTDOWN_COMPLETE: - if (mctl_present) - freeb(first_mp); break; default: - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_send_abort(sctps->sctps_gsctp, sctph->sh_verf, 0, - NULL, 0, mp, 0, B_TRUE); + sctp_ootb_send_abort(sctph->sh_verf, 0, + NULL, 0, mp, 0, B_TRUE, ira, ipst); break; } - sctp_process_sendq(sctps->sctps_gsctp); freemsg(mp); } +/* + * Handle sctp packets. + * Note that we rele the sctp_t (the caller got a reference on it). 
+ */ void -sctp_input(conn_t *connp, ipha_t *ipha, mblk_t *mp, mblk_t *first_mp, - ill_t *recv_ill, boolean_t isv4, boolean_t mctl_present) +sctp_input(conn_t *connp, ipha_t *ipha, ip6_t *ip6h, mblk_t *mp, + ip_recv_attr_t *ira) { - sctp_t *sctp = CONN2SCTP(connp); - ip_stack_t *ipst = recv_ill->ill_ipst; + sctp_t *sctp = CONN2SCTP(connp); + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; + + secure = iraflags & IRAF_IPSEC_SECURE; /* * We check some fields in conn_t without holding a lock. * This should be fine. */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - SCTP_REFRELE(sctp); - return; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); return; - } else if (mctl_present) { - /* - * ip_process might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } } - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - int in_flags = 0; - - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); - } else { - mp = ip_add_info_v6(mp, recv_ill, - &(((ip6_t *)ipha)->ip6_dst)); - } - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + ira->ira_ill = ira->ira_rill = NULL; mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); - SCTP_REFRELE(sctp); - return; + goto done; } else { sctp->sctp_running = B_TRUE; mutex_exit(&sctp->sctp_lock); mutex_enter(&sctp->sctp_recvq_lock); if (sctp->sctp_recvq != NULL) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_TRUE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_TRUE, ira); mutex_exit(&sctp->sctp_recvq_lock); WAKE_SCTP(sctp); - SCTP_REFRELE(sctp); - return; + goto done; } } mutex_exit(&sctp->sctp_recvq_lock); - sctp_input_data(sctp, mp, (mctl_present ? 
first_mp : NULL)); + if (ira->ira_flags & IRAF_ICMP_ERROR) + sctp_icmp_error(sctp, mp); + else + sctp_input_data(sctp, mp, ira); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); + +done: SCTP_REFRELE(sctp); + ira->ira_ill = ill; + ira->ira_rill = rill; } static void @@ -3549,7 +3418,7 @@ sctp_process_abort(sctp_t *sctp, sctp_chunk_hdr_t *ch, int err) } void -sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) +sctp_input_data(sctp_t *sctp, mblk_t *mp, ip_recv_attr_t *ira) { sctp_chunk_hdr_t *ch; ssize_t mlen; @@ -3559,17 +3428,15 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctp_init_chunk_t *iack; uint32_t tsn; sctp_data_hdr_t *sdc; - ip6_pkt_t ipp; + ip_pkt_t ipp; in6_addr_t src; in6_addr_t dst; uint_t ifindex; sctp_hdr_t *sctph; - uint_t ip_hdr_len; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; mblk_t *dups = NULL; int recv_adaptation; boolean_t wake_eager = B_FALSE; - mblk_t *pinfo_mp; - ip_pktinfo_t *pinfo = NULL; in6_addr_t peer_src; int64_t now; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -3577,23 +3444,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) boolean_t hb_already = B_FALSE; cred_t *cr; pid_t cpid; + uchar_t *rptr; + conn_t *connp = sctp->sctp_connp; - if (DB_TYPE(mp) != M_DATA) { - ASSERT(DB_TYPE(mp) == M_CTL); - if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - pinfo = (ip_pktinfo_t *)mp->b_rptr; - pinfo_mp = mp; - mp = mp->b_cont; - } else { - if (ipsec_mp != NULL) - freeb(ipsec_mp); - sctp_icmp_error(sctp, mp); - return; - } - } ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(ira->ira_ill == NULL); if (mp->b_cont != NULL) { /* @@ -3602,32 +3457,72 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) */ if (pullupmsg(mp, -1) == 0) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - if (ipsec_mp != NULL) - freeb(ipsec_mp); - if (pinfo != NULL) - freeb(pinfo_mp); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); freemsg(mp); return; } } 
BUMP_LOCAL(sctp->sctp_ipkts); - sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len, - &ipp, pinfo); - if (pinfo != NULL) - freeb(pinfo_mp); + ifindex = ira->ira_ruifindex; + + rptr = mp->b_rptr; + + ipp.ipp_fields = 0; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_SCTP); + + /* Could have caused a pullup? */ + rptr = mp->b_rptr; + } + } + + sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]); + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)rptr; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst); + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)rptr; + src = ip6h->ip6_src; + dst = ip6h->ip6_dst; + } + mlen = mp->b_wptr - (uchar_t *)(sctph + 1); ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen); if (ch == NULL) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - if (ipsec_mp != NULL) - freeb(ipsec_mp); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); freemsg(mp); return; } if (!sctp_check_input(sctp, ch, mlen, 1)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); goto done; } /* @@ -3661,9 +3556,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) if (sctp->sctp_state > SCTPS_BOUND && sctp->sctp_state < SCTPS_ESTABLISHED) { /* treat as OOTB */ - sctp_ootb_shutdown_ack(sctp, mp, ip_hdr_len); - if (ipsec_mp != NULL) - freeb(ipsec_mp); + sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst); return; } /* else fallthru */ @@ -3717,7 +3610,7 @@ 
sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) tsn = sdc->sdh_tsn; sctp_send_abort(sctp, sctp->sctp_fvtag, SCTP_ERR_NO_USR_DATA, (char *)&tsn, - sizeof (tsn), mp, 0, B_FALSE); + sizeof (tsn), mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -3726,7 +3619,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) ASSERT(fp != NULL); sctp->sctp_lastdata = fp; - sctp_data_chunk(sctp, ch, mp, &dups, fp, &ipp); + sctp_data_chunk(sctp, ch, mp, &dups, fp, + &ipp, ira); gotdata = 1; /* Restart shutdown timer if shutting down */ if (sctp->sctp_state == SCTPS_SHUTDOWN_SENT) { @@ -3743,7 +3637,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctps->sctps_shutack_wait_bound) { sctp_send_abort(sctp, sctp->sctp_fvtag, 0, NULL, - 0, mp, 0, B_FALSE); + 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -3764,7 +3658,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = sctp_got_sack(sctp, ch); if (trysend < 0) { sctp_send_abort(sctp, sctph->sh_verf, - 0, NULL, 0, mp, 0, B_FALSE); + 0, NULL, 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -3820,11 +3714,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) goto done; } case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) != -1) { + sctph, &recv_adaptation, NULL, ira) != -1) { sctp_send_cookie_ack(sctp); sctp_assoc_event(sctp, SCTP_RESTART, 0, NULL); @@ -3841,7 +3735,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) int error; BUMP_LOCAL(sctp->sctp_ibchunks); - error = sctp_handle_error(sctp, sctph, ch, mp); + error = sctp_handle_error(sctp, sctph, ch, mp, + ira); if (error != 0) { sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); @@ 
-3864,7 +3759,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case CHUNK_FORWARD_TSN: ASSERT(fp != NULL); sctp->sctp_lastdata = fp; - sctp_process_forward_tsn(sctp, ch, fp, &ipp); + sctp_process_forward_tsn(sctp, ch, fp, + &ipp, ira); gotdata = 1; BUMP_LOCAL(sctp->sctp_ibchunks); break; @@ -3879,13 +3775,14 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case SCTPS_LISTEN: switch (ch->sch_id) { case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: { sctp_t *eager; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, &peer_src) == -1) { + sctph, &recv_adaptation, &peer_src, + ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); goto done; @@ -3900,11 +3797,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) goto done; eager = sctp_conn_request(sctp, mp, ifindex, - ip_hdr_len, iack, ipsec_mp); + ip_hdr_len, iack, ira); if (eager == NULL) { sctp_send_abort(sctp, sctph->sh_verf, SCTP_ERR_NO_RESOURCES, NULL, 0, mp, - 0, B_FALSE); + 0, B_FALSE, ira); goto done; } @@ -3933,9 +3830,6 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) BUMP_MIB(&sctps->sctps_mib, sctpPassiveEstab); if (mlen > ntohs(ch->sch_len)) { eager->sctp_cookie_mp = dupb(mp); - mblk_setcred(eager->sctp_cookie_mp, - CONN_CRED(eager->sctp_connp), - eager->sctp_cpid); /* * If no mem, just let * the peer retransmit. 
@@ -3986,7 +3880,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) default: BUMP_LOCAL(sctp->sctp_ibchunks); sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, - 0, mp, 0, B_TRUE); + 0, mp, 0, B_TRUE, ira); goto done; } break; @@ -3996,20 +3890,21 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case CHUNK_INIT_ACK: sctp_stop_faddr_timers(sctp); sctp_faddr_alive(sctp, sctp->sctp_current); - sctp_send_cookie_echo(sctp, ch, mp); + sctp_send_cookie_echo(sctp, ch, mp, ira); BUMP_LOCAL(sctp->sctp_ibchunks); break; case CHUNK_ABORT: sctp_process_abort(sctp, ch, ECONNREFUSED); goto done; case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) == -1) { + sctph, &recv_adaptation, NULL, ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); break; @@ -4053,7 +3948,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case SCTPS_COOKIE_ECHOED: switch (ch->sch_id) { case CHUNK_COOKIE_ACK: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (!SCTP_IS_DETACHED(sctp)) { sctp->sctp_ulp_connected( @@ -4084,10 +3980,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctp_process_abort(sctp, ch, ECONNREFUSED); goto done; case CHUNK_COOKIE: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) == -1) { + sctph, &recv_adaptation, NULL, ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); break; @@ -4122,7 +4019,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = 1; break; case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_ERROR: { sctp_parm_hdr_t *p; @@ -4165,7 +4062,7 
@@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) switch (ch->sch_id) { case CHUNK_ABORT: /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); sctp_process_abort(sctp, ch, 0); goto done; case CHUNK_SHUTDOWN_COMPLETE: @@ -4175,7 +4072,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) NULL); /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); sctp_clean_death(sctp, 0); goto done; case CHUNK_SHUTDOWN_ACK: @@ -4215,7 +4112,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = sctp_got_sack(sctp, ch); if (trysend < 0) { sctp_send_abort(sctp, sctph->sh_verf, - 0, NULL, 0, mp, 0, B_FALSE); + 0, NULL, 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -4287,8 +4184,6 @@ nomorechunks: done: if (dups != NULL) freeb(dups); - if (ipsec_mp != NULL) - freeb(ipsec_mp); freemsg(mp); if (sctp->sctp_err_chunks != NULL) @@ -4297,15 +4192,9 @@ done: if (wake_eager) { /* * sctp points to newly created control block, need to - * release it before exiting. Before releasing it and - * processing the sendq, need to grab a hold on it. - * Otherwise, another thread can close it while processing - * the sendq. + * release it before exiting. */ - SCTP_REFHOLD(sctp); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); - SCTP_REFRELE(sctp); } } @@ -4340,12 +4229,6 @@ sctp_recvd(sctp_t *sctp, int len) sctp->sctp_force_sack = 1; BUMP_MIB(&sctps->sctps_mib, sctpOutWinUpdate); (void) sctp_sack(sctp, NULL); - old = 1; - } else { - old = 0; } WAKE_SCTP(sctp); - if (old > 0) { - sctp_process_sendq(sctp); - } } diff --git a/usr/src/uts/common/inet/sctp/sctp_ioc.c b/usr/src/uts/common/inet/sctp/sctp_ioc.c index 7150c48c4b..5f5c2ee629 100644 --- a/usr/src/uts/common/inet/sctp/sctp_ioc.c +++ b/usr/src/uts/common/inet/sctp/sctp_ioc.c @@ -49,69 +49,7 @@ #include "sctp_impl.h" /* - * We need a stream q for sending packets to IP. 
This q should - * be set in strplumb() time. Once it is set, it will never - * be removed. Since it is done in strplumb() time, there is - * no need to have a lock on the default q. - */ -static void -sctp_def_q_set(queue_t *q, mblk_t *mp) -{ - conn_t *connp = (conn_t *)q->q_ptr; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - mblk_t *mp1; - hrtime_t t; - sctp_stack_t *sctps = connp->conn_netstack-> - netstack_sctp; - - if ((mp1 = mp->b_cont) == NULL) { - iocp->ioc_error = EINVAL; - ip0dbg(("sctp_def_q_set: no file descriptor\n")); - goto done; - } - - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_g_q != NULL) { - mutex_exit(&sctps->sctps_g_q_lock); - ip0dbg(("sctp_def_q_set: already set\n")); - iocp->ioc_error = EALREADY; - goto done; - } - - sctps->sctps_g_q = q; - mutex_exit(&sctps->sctps_g_q_lock); - sctps->sctps_gsctp = (sctp_t *)sctp_create(NULL, NULL, AF_INET6, - SCTP_CAN_BLOCK, NULL, NULL, connp->conn_cred); - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_gsctp == NULL) { - sctps->sctps_g_q = NULL; - mutex_exit(&sctps->sctps_g_q_lock); - iocp->ioc_error = ENOMEM; - goto done; - } - mutex_exit(&sctps->sctps_g_q_lock); - ASSERT(sctps->sctps_g_q_ref >= 1); - ASSERT(list_head(&sctps->sctps_g_list) == sctps->sctps_gsctp); - - /* - * As a good citizen of using /dev/urandom, add some entropy - * to the random number pool. - */ - t = gethrtime(); - (void) random_add_entropy((uint8_t *)&t, sizeof (t), 0); -done: - if (mp1 != NULL) { - freemsg(mp1); - mp->b_cont = NULL; - } - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); -} - - -/* - * sctp_wput_ioctl is called by sctp_wput_slow to handle all + * sctp_wput_ioctl is called by sctp_wput to handle all * M_IOCTL messages. 
*/ void @@ -119,7 +57,6 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp) { conn_t *connp = (conn_t *)q->q_ptr; struct iocblk *iocp; - cred_t *cr; if (connp == NULL) { ip0dbg(("sctp_wput_ioctl: null conn\n")); @@ -127,24 +64,7 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp) } iocp = (struct iocblk *)mp->b_rptr; - /* - * prefer credential from mblk over ioctl; - * see ip_sioctl_copyin_setup - */ - cr = msg_getcred(mp, NULL); - if (cr == NULL) - cr = iocp->ioc_cr; - switch (iocp->ioc_cmd) { - case SCTP_IOC_DEFAULT_Q: - /* Wants to be the default wq. */ - if (cr != NULL && secpolicy_ip_config(cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - goto err_ret; - } - sctp_def_q_set(q, mp); - return; - case ND_SET: /* sctp_nd_getset() -> nd_getset() does the checking. */ case ND_GET: @@ -244,6 +164,9 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) netstack_rele(ns); connp->conn_zoneid = zoneid; + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_rq = q; connp->conn_wq = WR(q); @@ -276,6 +199,12 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ASSERT(connp->conn_cred == NULL); connp->conn_cred = credp; crhold(connp->conn_cred); + connp->conn_cpid = curproc->p_pid; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* * Make the conn globally visible to walkers diff --git a/usr/src/uts/common/inet/sctp/sctp_notify.c b/usr/src/uts/common/inet/sctp/sctp_notify.c index 3ede878954..ea46e0bbd2 100644 --- a/usr/src/uts/common/inet/sctp/sctp_notify.c +++ b/usr/src/uts/common/inet/sctp/sctp_notify.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,6 +51,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) sctp_faddr_t *fp; int32_t rwnd = 0; int error; + conn_t *connp = sctp->sctp_connp; if ((mp = allocb(sizeof (*tudi) + sizeof (void *) + sizeof (struct sockaddr_in6), BPRI_HI)) == NULL) { @@ -82,7 +83,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) tudi->SRC_length = sizeof (*sin4); sin4 = (struct sockaddr_in *)(tudi + 1); sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr); mp->b_wptr = (uchar_t *)(sin4 + 1); } else { @@ -91,7 +92,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) tudi->SRC_length = sizeof (*sin6); sin6 = (struct sockaddr_in6 *)(tudi + 1); sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; mp->b_wptr = (uchar_t *)(sin6 + 1); } diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c index 322e4d461e..ee5eb445af 100644 --- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c +++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c @@ -43,6 +43,7 @@ #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> +#include <inet/proto_set.h> #include <inet/ipclassifier.h> #include <inet/ipsec_impl.h> @@ -60,68 +61,6 @@ static int sctp_getpeeraddrs(sctp_t *, void *, int *); -/* - * Copy the standard header into its new location, - * lay in the new options and then update the relevant - * fields in both sctp_t and the standard header. - * Returns 0 on success, errno otherwise. 
- */ -static int -sctp_opt_set_header(sctp_t *sctp, const void *ptr, uint_t len) -{ - uint8_t *ip_optp; - sctp_hdr_t *new_sctph; - - if ((len > SCTP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) - return (EINVAL); - - if (len > IP_MAX_OPT_LENGTH - sctp->sctp_v4label_len) - return (EINVAL); - - ip_optp = (uint8_t *)sctp->sctp_ipha + IP_SIMPLE_HDR_LENGTH; - - if (sctp->sctp_v4label_len > 0) { - int padlen; - uint8_t opt; - - /* convert list termination to no-ops as needed */ - padlen = sctp->sctp_v4label_len - ip_optp[IPOPT_OLEN]; - ip_optp += ip_optp[IPOPT_OLEN]; - opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; - while (--padlen >= 0) - *ip_optp++ = opt; - ASSERT(ip_optp == (uint8_t *)sctp->sctp_ipha + - IP_SIMPLE_HDR_LENGTH + sctp->sctp_v4label_len); - } - - /* - * Move the existing SCTP header out where it belongs. - */ - new_sctph = (sctp_hdr_t *)(ip_optp + len); - ovbcopy(sctp->sctp_sctph, new_sctph, sizeof (sctp_hdr_t)); - sctp->sctp_sctph = new_sctph; - - /* - * Insert the new user-supplied IP options. - */ - if (len > 0) - bcopy(ptr, ip_optp, len); - - len += sctp->sctp_v4label_len; - sctp->sctp_ip_hdr_len = len; - sctp->sctp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - sctp->sctp_hdr_len = len + sizeof (sctp_hdr_t); - - if (sctp->sctp_current) { - /* - * Could be setting options before setting up connection. 
- */ - sctp_set_ulp_prop(sctp); - } - return (0); -} - static int sctp_get_status(sctp_t *sctp, void *ptr) { @@ -132,6 +71,7 @@ sctp_get_status(sctp_t *sctp, void *ptr) struct sctp_paddrinfo *sp; mblk_t *meta, *mp; int i; + conn_t *connp = sctp->sctp_connp; sstat->sstat_state = sctp->sctp_state; sstat->sstat_rwnd = sctp->sctp_frwnd; @@ -146,13 +86,13 @@ sctp_get_status(sctp_t *sctp, void *ptr) if (fp->isv4) { sin = (struct sockaddr_in *)&sp->spinfo_address; sin->sin_family = AF_INET; - sin->sin_port = sctp->sctp_fport; + sin->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_INADDR(&fp->faddr, &sin->sin_addr); sp->spinfo_mtu = sctp->sctp_hdr_len; } else { sin6 = (struct sockaddr_in6 *)&sp->spinfo_address; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; sp->spinfo_mtu = sctp->sctp_hdr6_len; } @@ -261,18 +201,16 @@ sctp_get_rtoinfo(sctp_t *sctp, void *ptr) } static int -sctp_set_rtoinfo(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_rtoinfo(sctp_t *sctp, const void *invalp) { const struct sctp_rtoinfo *srto; boolean_t ispriv; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; - if (inlen < sizeof (*srto)) { - return (EINVAL); - } srto = invalp; - ispriv = secpolicy_ip_config(sctp->sctp_credp, B_TRUE) == 0; + ispriv = secpolicy_ip_config(connp->conn_cred, B_TRUE) == 0; /* * Bounds checking. 
Priviledged user can set the RTO initial @@ -334,17 +272,13 @@ sctp_get_assocparams(sctp_t *sctp, void *ptr) } static int -sctp_set_assocparams(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_assocparams(sctp_t *sctp, const void *invalp) { const struct sctp_assocparams *sap = invalp; uint32_t sum = 0; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; - if (inlen < sizeof (*sap)) { - return (EINVAL); - } - if (sap->sasoc_asocmaxrxt) { if (sctp->sctp_faddrs) { /* @@ -403,6 +337,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen) { const struct sctp_initmsg *si = invalp; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (sctp->sctp_state > SCTPS_LISTEN) { return (EINVAL); @@ -430,7 +365,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen) return (EINVAL); } if (si->sinit_max_init_timeo != 0 && - (secpolicy_ip_config(sctp->sctp_credp, B_TRUE) != 0 && + (secpolicy_ip_config(connp->conn_cred, B_TRUE) != 0 && (si->sinit_max_init_timeo < sctps->sctps_rto_maxg_low || si->sinit_max_init_timeo > sctps->sctps_rto_maxg_high))) { return (EINVAL); @@ -506,7 +441,7 @@ sctp_get_peer_addr_params(sctp_t *sctp, void *ptr) } static int -sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp) { const struct sctp_paddrparams *spp = invalp; sctp_faddr_t *fp, *fp2; @@ -515,10 +450,6 @@ sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen) int64_t now; sctp_stack_t *sctps = sctp->sctp_sctps; - if (inlen < sizeof (*spp)) { - return (EINVAL); - } - retval = sctp_find_peer_fp(sctp, &spp->spp_address, &fp); if (retval != 0) { return (retval); @@ -620,13 +551,10 @@ sctp_get_def_send_params(sctp_t *sctp, void *ptr) } static int -sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_def_send_params(sctp_t *sctp, const void *invalp) { const struct sctp_sndrcvinfo *sinfo = invalp; - if (inlen 
< sizeof (*sinfo)) { - return (EINVAL); - } if (sinfo->sinfo_stream >= sctp->sctp_num_ostr) { return (EINVAL); } @@ -641,16 +569,12 @@ sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen) } static int -sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_prim(sctp_t *sctp, const void *invalp) { const struct sctp_setpeerprim *pp = invalp; int retval; sctp_faddr_t *fp; - if (inlen < sizeof (*pp)) { - return (EINVAL); - } - retval = sctp_find_peer_fp(sctp, &pp->sspp_addr, &fp); if (retval) return (retval); @@ -670,6 +594,183 @@ sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen) return (0); } +/* + * Table of all known options handled on a SCTP protocol stack. + * + * Note: This table contains options processed by both SCTP and IP levels + * and is the superset of options that can be performed on a SCTP and IP + * stack. + */ +opdes_t sctp_opt_arr[] = { + +{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct linger), 0 }, + +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 + }, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 + }, +{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), + 0 }, +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), + 
0 }, +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, + +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, + +{ SCTP_ADAPTATION_LAYER, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_setadaptation), 0 }, +{ SCTP_ADD_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_ASSOCINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_assocparams), 0 }, +{ SCTP_AUTOCLOSE, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_DEFAULT_SEND_PARAM, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_sndrcvinfo), 0 }, +{ SCTP_DISABLE_FRAGMENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ SCTP_EVENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_event_subscribe), 0 }, +{ SCTP_GET_LADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_GET_NLADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_GET_NPADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_GET_PADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_GET_PEER_ADDR_INFO, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (struct sctp_paddrinfo), 0 }, +{ SCTP_INITMSG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_initmsg), 0 }, +{ SCTP_I_WANT_MAPPED_V4_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ SCTP_MAXSEG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_NODELAY, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_PEER_ADDR_PARAMS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_paddrparams), 0 }, +{ SCTP_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_setpeerprim), 0 }, +{ SCTP_PRSCTP, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ 
SCTP_GET_ASSOC_STATS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (sctp_assoc_stats_t), 0 }, +{ SCTP_REM_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_RTOINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_rtoinfo), 0 }, +{ SCTP_SET_PEER_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_setprim), 0 }, +{ SCTP_STATUS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (struct sctp_status), 0 }, +{ SCTP_UC_SWAP, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_uc_swap), 0 }, + +{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), + 40, -1 /* not initialized */ }, +{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), + 40, -1 /* not initialized */ }, + +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, + sizeof (int), -1 /* not initialized */ }, + +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, + sizeof (ipsec_req_t), -1 /* not initialized */ }, + +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 /* no ifindex */ }, + +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, + sizeof (int), 0 }, + +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, + sizeof (int), -1 /* not initialized */ }, + +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 /* no ifindex */ }, + +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, + sizeof (in_addr_t), -1 /* not initialized */ }, + +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, + sizeof (int), 0 }, + +{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_NODEFAULT|OP_VARLEN), + sizeof (struct in6_pktinfo), -1 /* not initialized */ }, +{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (sin6_t), -1 
/* not initialized */ }, +{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (int), -1 /* not initialized */ }, +{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, + +/* Enable receipt of ancillary data */ +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, + +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, + sizeof (ipsec_req_t), -1 /* not initialized */ }, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, +}; + +uint_t sctp_opt_arr_size = 
A_CNT(sctp_opt_arr); + /* Handy on off switch for socket option processing. */ #define ONOFF(x) ((x) == 0 ? 0 : 1) @@ -682,8 +783,12 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int *i1 = (int *)ptr; int retval = 0; int buflen = *optlen; - conn_t *connp = sctp->sctp_connp; - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; + conn_t *connp = sctp->sctp_connp; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; /* In most cases, the return buffer is just an int */ *optlen = sizeof (int32_t); @@ -695,83 +800,30 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) return (EINVAL); } - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)ptr; - - lgr->l_onoff = sctp->sctp_linger ? SO_LINGER : 0; - lgr->l_linger = TICK_TO_MSEC(sctp->sctp_lingertime); - *optlen = sizeof (struct linger); - break; - } - case SO_DEBUG: - *i1 = sctp->sctp_debug ? SO_DEBUG : 0; - break; - case SO_DONTROUTE: - *i1 = connp->conn_dontroute ? SO_DONTROUTE : 0; - break; - case SO_USELOOPBACK: - *i1 = connp->conn_loopback ? SO_USELOOPBACK : 0; - break; - case SO_BROADCAST: - *i1 = connp->conn_broadcast ? SO_BROADCAST : 0; - break; - case SO_REUSEADDR: - *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; - break; - case SO_DGRAM_ERRIND: - *i1 = sctp->sctp_dgram_errind ? 
SO_DGRAM_ERRIND : 0; - break; - case SO_SNDBUF: - *i1 = sctp->sctp_xmit_hiwater; - break; - case SO_RCVBUF: - *i1 = sctp->sctp_rwnd; - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_SCTP; - break; - case SO_DOMAIN: - *i1 = sctp->sctp_family; - break; - default: - retval = ENOPROTOOPT; - break; + /* + * Check that the level and name are supported by SCTP, and that + * the length and credentials are ok. + */ + retval = proto_opt_check(level, name, buflen, NULL, sctp_opt_arr, + sctp_opt_arr_size, B_FALSE, B_TRUE, connp->conn_cred); + if (retval != 0) { + WAKE_SCTP(sctp); + if (retval < 0) { + retval = proto_tlitosyserr(-retval); } - break; + return (retval); + } + switch (level) { case IPPROTO_SCTP: switch (name) { case SCTP_RTOINFO: - if (buflen < sizeof (struct sctp_rtoinfo)) { - retval = EINVAL; - break; - } *optlen = sctp_get_rtoinfo(sctp, ptr); break; case SCTP_ASSOCINFO: - if (buflen < sizeof (struct sctp_assocparams)) { - retval = EINVAL; - break; - } *optlen = sctp_get_assocparams(sctp, ptr); break; case SCTP_INITMSG: - if (buflen < sizeof (struct sctp_initmsg)) { - retval = EINVAL; - break; - } *optlen = sctp_get_initmsg(sctp, ptr); break; case SCTP_NODELAY: @@ -781,34 +833,18 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) *i1 = TICK_TO_SEC(sctp->sctp_autoclose); break; case SCTP_ADAPTATION_LAYER: - if (buflen < sizeof (struct sctp_setadaptation)) { - retval = EINVAL; - break; - } ((struct sctp_setadaptation *)ptr)->ssb_adaptation_ind = sctp->sctp_tx_adaptation_code; break; case SCTP_PEER_ADDR_PARAMS: - if (buflen < sizeof (struct sctp_paddrparams)) { - retval = EINVAL; - break; - } *optlen = sctp_get_peer_addr_params(sctp, ptr); break; case SCTP_DEFAULT_SEND_PARAM: - if (buflen < sizeof (struct 
sctp_sndrcvinfo)) { - retval = EINVAL; - break; - } *optlen = sctp_get_def_send_params(sctp, ptr); break; case SCTP_EVENTS: { struct sctp_event_subscribe *ev; - if (buflen < sizeof (struct sctp_event_subscribe)) { - retval = EINVAL; - break; - } ev = (struct sctp_event_subscribe *)ptr; ev->sctp_data_io_event = ONOFF(sctp->sctp_recvsndrcvinfo); @@ -830,17 +866,9 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) break; } case SCTP_STATUS: - if (buflen < sizeof (struct sctp_status)) { - retval = EINVAL; - break; - } *optlen = sctp_get_status(sctp, ptr); break; case SCTP_GET_PEER_ADDR_INFO: - if (buflen < sizeof (struct sctp_paddrinfo)) { - retval = EINVAL; - break; - } retval = sctp_get_paddrinfo(sctp, ptr, optlen); break; case SCTP_GET_NLADDRS: @@ -850,7 +878,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int addr_cnt; int addr_size; - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) addr_size = sizeof (struct sockaddr_in); else addr_size = sizeof (struct sockaddr_in6); @@ -874,7 +902,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int addr_cnt; int addr_size; - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) addr_size = sizeof (struct sockaddr_in); else addr_size = sizeof (struct sockaddr_in6); @@ -891,11 +919,6 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) case SCTP_GET_ASSOC_STATS: { sctp_assoc_stats_t *sas; - if (buflen < sizeof (sctp_assoc_stats_t)) { - retval = EINVAL; - break; - } - sas = (sctp_assoc_stats_t *)ptr; /* @@ -947,15 +970,15 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) case SCTP_I_WANT_MAPPED_V4_ADDR: case SCTP_MAXSEG: case SCTP_DISABLE_FRAGMENTS: - /* Not yet supported. */ default: + /* Not yet supported. 
*/ retval = ENOPROTOOPT; break; } - break; - + WAKE_SCTP(sctp); + return (retval); case IPPROTO_IP: - if (sctp->sctp_family != AF_INET) { + if (connp->conn_family != AF_INET) { retval = EINVAL; break; } @@ -972,231 +995,52 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) * ip_opt_get_user() adds the final destination * at the start. */ - char *opt_ptr; int opt_len; uchar_t obuf[SCTP_MAX_IP_OPTIONS_LENGTH + IP_ADDR_LEN]; - opt_ptr = (char *)sctp->sctp_ipha + - IP_SIMPLE_HDR_LENGTH; - opt_len = (char *)sctp->sctp_sctph - opt_ptr; - /* Caller ensures enough space */ - if (opt_len > 0) { - /* - * TODO: Do we have to handle getsockopt on an - * initiator as well? - */ - opt_len = ip_opt_get_user(sctp->sctp_ipha, - obuf); - ASSERT(opt_len <= sizeof (obuf)); - } else { - opt_len = 0; - } + opt_len = ip_opt_get_user(connp, obuf); + ASSERT(opt_len <= sizeof (obuf)); + if (buflen < opt_len) { /* Silently truncate */ opt_len = buflen; } *optlen = opt_len; bcopy(obuf, ptr, opt_len); - break; - } - case IP_TOS: - case T_IP_TOS: - *i1 = (int)sctp->sctp_ipha->ipha_type_of_service; - break; - case IP_TTL: - *i1 = (int)sctp->sctp_ipha->ipha_ttl; - break; - case IP_NEXTHOP: - if (connp->conn_nexthop_set) { - *(ipaddr_t *)ptr = connp->conn_nexthop_v4; - *optlen = sizeof (ipaddr_t); - } else { - *optlen = 0; - } - break; - default: - retval = ENOPROTOOPT; - break; - } - break; - case IPPROTO_IPV6: - if (sctp->sctp_family != AF_INET6) { - retval = EINVAL; - break; - } - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int) sctp->sctp_ip6h->ip6_hops; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVPKTINFO) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVHOPLIMIT) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - 
case IPV6_RECVHOPOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVHOPOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVDSTOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVRTHDR) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVRTDSTOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - struct in6_pktinfo *pkti; - - if (buflen < sizeof (struct in6_pktinfo)) { - retval = EINVAL; - break; - } - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - *optlen = sizeof (struct in6_pktinfo); - break; - } - case IPV6_NEXTHOP: { - sin6_t *sin6; - - if (buflen < sizeof (sin6_t)) { - retval = EINVAL; - break; - } - sin6 = (sin6_t *)ptr; - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - break; - *sin6 = sctp_sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - *optlen = sizeof (sin6_t); - break; + WAKE_SCTP(sctp); + return (0); } - case IPV6_HOPOPTS: { - int len; - - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - break; - len = ipp->ipp_hopoptslen - sctp->sctp_v6label_len; - if (len <= 0) - break; - if (buflen < len) { - retval = EINVAL; - break; - } - bcopy((char *)ipp->ipp_hopopts + - sctp->sctp_v6label_len, ptr, len); - if (sctp->sctp_v6label_len > 0) { - char *cptr = ptr; - - /* - * If the label length is greater than zero, - * then we need to hide the label from user. 
- * Make it look as though a normal Hop-By-Hop - * Options Header is present here. - */ - cptr[0] = ((char *)ipp->ipp_hopopts)[0]; - cptr[1] = (len + 7) / 8 - 1; - } - *optlen = len; - break; - } - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - break; - if (buflen < ipp->ipp_rtdstoptslen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - *optlen = ipp->ipp_rtdstoptslen; - break; - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - break; - if (buflen < ipp->ipp_rthdrlen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - *optlen = ipp->ipp_rthdrlen; - break; - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - break; - if (buflen < ipp->ipp_dstoptslen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - *optlen = ipp->ipp_dstoptslen; - break; - case IPV6_V6ONLY: - *i1 = sctp->sctp_connp->conn_ipv6_v6only; - break; default: - retval = ENOPROTOOPT; break; } break; - - default: - retval = ENOPROTOOPT; - break; } + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); WAKE_SCTP(sctp); - return (retval); + if (retval == -1) + return (EINVAL); + *optlen = retval; + return (0); } int sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, socklen_t inlen) { - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; int *i1 = (int *)invalp; boolean_t onoff; int retval = 0, addrcnt; conn_t *connp = sctp->sctp_connp; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; /* In all cases, the size of the option must be bigger than int */ if (inlen >= sizeof (int32_t)) { @@ -1211,74 +1055,42 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, return (EINVAL); } + /* + * Check that the 
level and name are supported by SCTP, and that + * the length an credentials are ok. + */ + retval = proto_opt_check(level, name, inlen, NULL, sctp_opt_arr, + sctp_opt_arr_size, B_TRUE, B_FALSE, connp->conn_cred); + if (retval != 0) { + if (retval < 0) { + retval = proto_tlitosyserr(-retval); + } + goto done; + } + + /* Note: both SCTP and TCP interpret l_linger as being in seconds */ switch (level) { case SOL_SOCKET: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } switch (name) { - case SO_LINGER: { - struct linger *lgr; - - if (inlen != sizeof (struct linger)) { - retval = EINVAL; - break; - } - lgr = (struct linger *)invalp; - if (lgr->l_onoff != 0) { - sctp->sctp_linger = 1; - sctp->sctp_lingertime = MSEC_TO_TICK( - lgr->l_linger); - } else { - sctp->sctp_linger = 0; - sctp->sctp_lingertime = 0; - } - break; - } - case SO_DEBUG: - sctp->sctp_debug = onoff; - break; - case SO_KEEPALIVE: - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK and SO_BROADCAST are - * only of interest to IP. 
- */ - connp->conn_dontroute = onoff; - break; - case SO_USELOOPBACK: - connp->conn_loopback = onoff; - break; - case SO_BROADCAST: - connp->conn_broadcast = onoff; - break; - case SO_REUSEADDR: - connp->conn_reuseaddr = onoff; - break; - case SO_DGRAM_ERRIND: - sctp->sctp_dgram_errind = onoff; - break; case SO_SNDBUF: if (*i1 > sctps->sctps_max_buf) { retval = ENOBUFS; - break; + goto done; } if (*i1 < 0) { retval = EINVAL; - break; + goto done; } - sctp->sctp_xmit_hiwater = *i1; - if (sctps->sctps_snd_lowat_fraction != 0) - sctp->sctp_xmit_lowater = - sctp->sctp_xmit_hiwater / + connp->conn_sndbuf = *i1; + if (sctps->sctps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / sctps->sctps_snd_lowat_fraction; - break; + } + goto done; case SO_RCVBUF: if (*i1 > sctps->sctps_max_buf) { retval = ENOBUFS; - break; + goto done; } /* Silently ignore zero */ if (*i1 != 0) { @@ -1294,12 +1106,16 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, *i1 = MAX(*i1, sctps->sctps_recv_hiwat_minmss * sctp->sctp_mss); - sctp->sctp_rwnd = *i1; + /* + * Note that sctp_rwnd is modified by the + * protocol and here we just whack it. + */ + connp->conn_rcvbuf = sctp->sctp_rwnd = *i1; sctp->sctp_irwnd = sctp->sctp_rwnd; sctp->sctp_pd_point = sctp->sctp_rwnd; sopp.sopp_flags = SOCKOPT_RCVHIWAT; - sopp.sopp_rxhiwat = *i1; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); } @@ -1307,60 +1123,29 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, * XXX should we return the rwnd here * and sctp_opt_get ? 
*/ - break; + goto done; case SO_ALLZONES: - if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG, - B_TRUE)) { - retval = EACCES; - break; - } if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - break; + goto done; } - sctp->sctp_allzones = onoff; break; case SO_MAC_EXEMPT: - if (secpolicy_net_mac_aware(sctp->sctp_credp) != 0) { - retval = EACCES; - break; - } - if (sctp->sctp_state >= SCTPS_BOUND) { - retval = EINVAL; - break; - } - connp->conn_mac_mode = onoff ? - CONN_MAC_AWARE : CONN_MAC_DEFAULT; - break; - case SO_MAC_IMPLICIT: - if (secpolicy_net_mac_implicit(sctp->sctp_credp) != 0) { - retval = EACCES; - break; - } if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - break; + goto done; } - connp->conn_mac_mode = onoff ? - CONN_MAC_AWARE : CONN_MAC_IMPLICIT; - break; - default: - retval = ENOPROTOOPT; break; } break; case IPPROTO_SCTP: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } switch (name) { case SCTP_RTOINFO: - retval = sctp_set_rtoinfo(sctp, invalp, inlen); + retval = sctp_set_rtoinfo(sctp, invalp); break; case SCTP_ASSOCINFO: - retval = sctp_set_assocparams(sctp, invalp, inlen); + retval = sctp_set_assocparams(sctp, invalp); break; case SCTP_INITMSG: retval = sctp_set_initmsg(sctp, invalp, inlen); @@ -1378,37 +1163,28 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, sctp_heartbeat_timer(sctp); break; case SCTP_SET_PEER_PRIMARY_ADDR: - retval = sctp_set_peerprim(sctp, invalp, inlen); + retval = sctp_set_peerprim(sctp, invalp); break; case SCTP_PRIMARY_ADDR: - retval = sctp_set_prim(sctp, invalp, inlen); + retval = sctp_set_prim(sctp, invalp); break; case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *ssb; - if (inlen < sizeof (struct sctp_setadaptation)) { - retval = EINVAL; - break; - } ssb = (struct sctp_setadaptation *)invalp; sctp->sctp_send_adaptation = 1; sctp->sctp_tx_adaptation_code = ssb->ssb_adaptation_ind; break; } case SCTP_PEER_ADDR_PARAMS: - retval = sctp_set_peer_addr_params(sctp, 
invalp, - inlen); + retval = sctp_set_peer_addr_params(sctp, invalp); break; case SCTP_DEFAULT_SEND_PARAM: - retval = sctp_set_def_send_params(sctp, invalp, inlen); + retval = sctp_set_def_send_params(sctp, invalp); break; case SCTP_EVENTS: { struct sctp_event_subscribe *ev; - if (inlen < sizeof (struct sctp_event_subscribe)) { - retval = EINVAL; - break; - } ev = (struct sctp_event_subscribe *)invalp; sctp->sctp_recvsndrcvinfo = ONOFF(ev->sctp_data_io_event); @@ -1438,15 +1214,15 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, retval = EINVAL; break; } - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { addrcnt = inlen / sizeof (struct sockaddr_in); } else { - ASSERT(sctp->sctp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); addrcnt = inlen / sizeof (struct sockaddr_in6); } if (name == SCTP_ADD_ADDR) { retval = sctp_bind_add(sctp, invalp, addrcnt, - B_TRUE, sctp->sctp_lport); + B_TRUE, connp->conn_lport); } else { retval = sctp_bind_del(sctp, invalp, addrcnt, B_TRUE); @@ -1458,10 +1234,6 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, /* * Change handle & upcalls. */ - if (inlen < sizeof (*us)) { - retval = EINVAL; - break; - } us = (struct sctp_uc_swap *)invalp; sctp->sctp_ulpd = us->sus_handle; sctp->sctp_upcalls = us->sus_upcalls; @@ -1474,33 +1246,17 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, case SCTP_MAXSEG: case SCTP_DISABLE_FRAGMENTS: /* Not yet supported. 
*/ - default: retval = ENOPROTOOPT; break; } - break; + goto done; case IPPROTO_IP: - if (sctp->sctp_family != AF_INET) { + if (connp->conn_family != AF_INET) { retval = ENOPROTOOPT; - break; - } - if ((name != IP_OPTIONS) && (inlen < sizeof (int32_t))) { - retval = EINVAL; - break; + goto done; } switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - retval = sctp_opt_set_header(sctp, invalp, inlen); - break; - case IP_TOS: - case T_IP_TOS: - sctp->sctp_ipha->ipha_type_of_service = (uchar_t)*i1; - break; - case IP_TTL: - sctp->sctp_ipha->ipha_ttl = (uchar_t)*i1; - break; case IP_SEC_OPT: /* * We should not allow policy setting after @@ -1508,319 +1264,30 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_LISTEN) { retval = EINVAL; - } else { - retval = ipsec_set_req(sctp->sctp_credp, - sctp->sctp_connp, (ipsec_req_t *)invalp); - } - break; - /* IP level options */ - case IP_UNSPEC_SRC: - connp->conn_unspec_src = onoff; - break; - case IP_NEXTHOP: { - ipaddr_t addr = *i1; - ipif_t *ipif = NULL; - ill_t *ill; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - - if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG, - B_TRUE) == 0) { - ipif = ipif_lookup_onlink_addr(addr, - connp->conn_zoneid, ipst); - if (ipif == NULL) { - retval = EHOSTUNREACH; - break; - } - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if ((ill->ill_state_flags & ILL_CONDEMNED) || - (ipif->ipif_state_flags & IPIF_CONDEMNED)) { - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - retval = EHOSTUNREACH; - break; - } - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - mutex_enter(&connp->conn_lock); - connp->conn_nexthop_v4 = addr; - connp->conn_nexthop_set = B_TRUE; - mutex_exit(&connp->conn_lock); + goto done; } break; } - default: - retval = ENOPROTOOPT; - break; - } break; - case IPPROTO_IPV6: { - if (sctp->sctp_family != AF_INET6) { - retval = ENOPROTOOPT; - break; + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) { + 
retval = EINVAL; + goto done; } switch (name) { - case IPV6_UNICAST_HOPS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - retval = EINVAL; - break; - } - if (*i1 == -1) { - ipp->ipp_unicast_hops = - sctps->sctps_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - } else { - ipp->ipp_unicast_hops = (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - retval = sctp_build_hdrs(sctp); - break; - case IPV6_UNSPEC_SRC: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - connp->conn_unspec_src = onoff; - break; case IPV6_RECVPKTINFO: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVPKTINFO; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVPKTINFO; /* Send it with the next msg */ sctp->sctp_recvifindex = 0; - connp->conn_ip_recvpktinfo = onoff; + break; + case IPV6_RECVTCLASS: + /* Force it to be sent up with the next msg */ + sctp->sctp_recvtclass = 0xffffffffU; break; case IPV6_RECVHOPLIMIT: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVHOPLIMIT; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVHOPLIMIT; + /* Force it to be sent up with the next msg */ sctp->sctp_recvhops = 0xffffffffU; - connp->conn_ipv6_recvhoplimit = onoff; - break; - case IPV6_RECVHOPOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVHOPOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVHOPOPTS; - connp->conn_ipv6_recvhopopts = onoff; - break; - case IPV6_RECVDSTOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVDSTOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVDSTOPTS; - connp->conn_ipv6_recvdstopts = onoff; - break; - case 
IPV6_RECVRTHDR: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVRTHDR; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVRTHDR; - connp->conn_ipv6_recvrthdr = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVRTDSTOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVRTDSTOPTS; - connp->conn_ipv6_recvrtdstopts = onoff; - break; - case IPV6_PKTINFO: - if (inlen != 0 && - inlen != sizeof (struct in6_pktinfo)) { - retval = EINVAL; - break; - } - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX |IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - /* XXX Need to check if the index exists */ - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - retval = sctp_build_hdrs(sctp); - break; - case IPV6_NEXTHOP: { - struct sockaddr_in6 *sin6; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - - if (inlen != 0 && inlen != sizeof (sin6_t)) { - retval = EINVAL; - break; - } - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - sin6 = (struct sockaddr_in6 *)invalp; - if (sin6->sin6_family != AF_INET6) { - retval = EAFNOSUPPORT; - break; - } - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - retval = EADDRNOTAVAIL; - break; - } - ipp->ipp_nexthop = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - ire_t *ire; - - ire = ire_route_lookup_v6( - &sin6->sin6_addr, NULL, NULL, 0, - NULL, NULL, ALL_ZONES, NULL, - MATCH_IRE_DEFAULT, ipst); - if (ire == NULL) { - retval = 
EHOSTUNREACH; - break; - } - ire_refrele(ire); - ipp->ipp_fields |= IPPF_NEXTHOP; - } - } - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, sctp->sctp_v6label_len); - if (retval != 0) - break; - if (ipp->ipp_hopoptslen == 0) - ipp->ipp_fields &= ~IPPF_HOPOPTS; - else - ipp->ipp_fields |= IPPF_HOPOPTS; - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (retval != 0) - break; - if (ipp->ipp_rtdstoptslen == 0) - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - else - ipp->ipp_fields |= IPPF_RTDSTOPTS; - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (retval != 0) - break; - if (ipp->ipp_dstoptslen == 0) - ipp->ipp_fields &= ~IPPF_DSTOPTS; - else - ipp->ipp_fields |= IPPF_DSTOPTS; - retval = sctp_build_hdrs(sctp); break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (retval != 0) - break; - if (ipp->ipp_rthdrlen == 0) - ipp->ipp_fields &= ~IPPF_RTHDR; - else - ipp->ipp_fields |= IPPF_RTHDR; - retval 
= sctp_build_hdrs(sctp); - break; - } case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -1828,9 +1295,7 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_LISTEN) { retval = EINVAL; - } else { - retval = ipsec_set_req(sctp->sctp_credp, - sctp->sctp_connp, (ipsec_req_t *)invalp); + goto done; } break; case IPV6_V6ONLY: @@ -1840,21 +1305,44 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - } else { - sctp->sctp_connp->conn_ipv6_v6only = onoff; + goto done; } break; - default: - retval = ENOPROTOOPT; - break; } break; } - default: - retval = ENOPROTOOPT; - break; - } + retval = conn_opt_set(&coas, level, name, inlen, (uchar_t *)invalp, + B_FALSE, connp->conn_cred); + if (retval != 0) + goto done; + + if (coas.coa_changed & COA_ROUTE_CHANGED) { + sctp_faddr_t *fp; + /* + * We recache the information which might pick a different + * source and redo IPsec as a result. + */ + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) + sctp_get_dest(sctp, fp); + } + if (coas.coa_changed & COA_HEADER_CHANGED) { + retval = sctp_build_hdrs(sctp, KM_NOSLEEP); + if (retval != 0) + goto done; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + sctps->sctps_wroff_xtra; + if (sctp->sctp_current != NULL) { + /* + * Could be setting options before setting up + * connection. 
+ */ + sctp_set_ulp_prop(sctp); + } + } +done: WAKE_SCTP(sctp); return (retval); } @@ -1871,18 +1359,19 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) int addrcnt = 1; sin_t *sin4; sin6_t *sin6; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); RUN_SCTP(sctp); - addr->sa_family = sctp->sctp_family; - switch (sctp->sctp_family) { + addr->sa_family = connp->conn_family; + switch (connp->conn_family) { case AF_INET: sin4 = (sin_t *)addr; if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { sin4->sin_addr.s_addr = INADDR_ANY; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; } else { err = sctp_getmyaddrs(sctp, sin4, &addrcnt); if (err != 0) { @@ -1897,7 +1386,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { bzero(&sin6->sin6_addr, sizeof (sin6->sin6_addr)); - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; } else { err = sctp_getmyaddrs(sctp, sin6, &addrcnt); if (err != 0) { @@ -1906,10 +1395,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) } } *addrlen = sizeof (struct sockaddr_in6); - sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; + /* Note that flowinfo is only returned for getpeername */ break; } WAKE_SCTP(sctp); @@ -1927,12 +1413,13 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) int err = 0; int addrcnt = 1; sin6_t *sin6; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); RUN_SCTP(sctp); - addr->sa_family = sctp->sctp_family; - switch (sctp->sctp_family) { + addr->sa_family = connp->conn_family; + switch (connp->conn_family) { case AF_INET: err = sctp_getpeeraddrs(sctp, addr, &addrcnt); if (err != 0) { @@ -1949,9 +1436,6 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) break; } *addrlen 
= sizeof (struct sockaddr_in6); - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; break; } WAKE_SCTP(sctp); @@ -1973,13 +1457,14 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) int cnt; sctp_faddr_t *fp = sctp->sctp_faddrs; in6_addr_t addr; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); if (sctp->sctp_faddrs == NULL) return (ENOTCONN); - family = sctp->sctp_family; + family = connp->conn_family; max = *addrcnt; /* If we want only one, give the primary */ @@ -1989,15 +1474,26 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) case AF_INET: sin4 = paddrs; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; case AF_INET6: sin6 = paddrs; sin6->sin6_addr = addr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&addr) && + sctp->sctp_primary != NULL && + (sctp->sctp_primary->ixa->ixa_flags & + IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + sctp->sctp_primary->ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } + sin6->__sin6_src_id = 0; break; } return (0); @@ -2010,14 +1506,21 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) ASSERT(IN6_IS_ADDR_V4MAPPED(&addr)); sin4 = (struct sockaddr_in *)paddrs + cnt; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; case AF_INET6: sin6 = (struct sockaddr_in6 *)paddrs + cnt; sin6->sin6_addr = addr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&addr) && + (fp->ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = fp->ixa->ixa_scopeid; + else + sin6->sin6_scope_id = 0; 
+ sin6->__sin6_src_id = 0; break; } } diff --git a/usr/src/uts/common/inet/sctp/sctp_output.c b/usr/src/uts/common/inet/sctp/sctp_output.c index c16a1166fa..1a50097260 100644 --- a/usr/src/uts/common/inet/sctp/sctp_output.c +++ b/usr/src/uts/common/inet/sctp/sctp_output.c @@ -38,6 +38,7 @@ #include <inet/common.h> #include <inet/mi.h> #include <inet/ip.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> @@ -140,6 +141,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) sctp_msg_hdr_t *sctp_msg_hdr; uint32_t msg_len = 0; uint32_t timetolive = sctp->sctp_def_timetolive; + conn_t *connp = sctp->sctp_connp; ASSERT(DB_TYPE(mproto) == M_PROTO); @@ -228,7 +230,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) RUN_SCTP(sctp); sctp_user_abort(sctp, mp); freemsg(mproto); - goto process_sendq; + goto done2; } if (mp == NULL) goto done; @@ -292,15 +294,14 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) /* * Notify sockfs if the tx queue is full. */ - if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) { + if (SCTP_TXQ_LEN(sctp) >= connp->conn_sndbuf) { sctp->sctp_txq_full = 1; sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE); } if (sctp->sctp_state == SCTPS_ESTABLISHED) sctp_output(sctp, UINT_MAX); -process_sendq: +done2: WAKE_SCTP(sctp); - sctp_process_sendq(sctp); return (0); unlock_done: WAKE_SCTP(sctp); @@ -569,7 +570,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, int *error) { int hdrlen; - char *hdr; + uchar_t *hdr; int isv4 = fp->isv4; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -584,17 +585,19 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, hdr = sctp->sctp_iphc6; } /* - * A null fp->ire could mean that the address is 'down'. Similarly, + * A reject|blackhole could mean that the address is 'down'. 
Similarly, * it is possible that the address went down, we tried to send an * heartbeat and ended up setting fp->saddr as unspec because we * didn't have any usable source address. In either case - * sctp_get_ire() will try find an IRE, if available, and set + * sctp_get_dest() will try find an IRE, if available, and set * the source address, if needed. If we still don't have any * usable source address, fp->state will be SCTP_FADDRS_UNREACH and * we return EHOSTUNREACH. */ - if (fp->ire == NULL || SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) { - sctp_get_ire(sctp, fp); + ASSERT(fp->ixa->ixa_ire != NULL); + if ((fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) { + sctp_get_dest(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { if (error != NULL) *error = EHOSTUNREACH; @@ -603,8 +606,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, } /* Copy in IP header. */ if ((mp->b_rptr - mp->b_datap->db_base) < - (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2 || - !IS_P2ALIGNED(DB_BASE(mp), sizeof (ire_t *))) { + (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2) { mblk_t *nmp; /* @@ -612,8 +614,8 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, * data was moved into chunks, or during retransmission, * or things like snoop is running. 
*/ - nmp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + sacklen, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + nmp = allocb(sctps->sctps_wroff_xtra + hdrlen + sacklen, + BPRI_MED); if (nmp == NULL) { if (error != NULL) *error = ENOMEM; @@ -625,7 +627,6 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, mp = nmp; } else { mp->b_rptr -= (hdrlen + sacklen); - mblk_setcred(mp, CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); } bcopy(hdr, mp->b_rptr, hdrlen); if (sacklen) { @@ -644,26 +645,16 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, iph->ipha_src = INADDR_ANY; } } else { - ((ip6_t *)(mp->b_rptr))->ip6_dst = fp->faddr; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ip6h->ip6_dst = fp->faddr; if (!IN6_IS_ADDR_UNSPECIFIED(&fp->saddr)) { - ((ip6_t *)(mp->b_rptr))->ip6_src = fp->saddr; + ip6h->ip6_src = fp->saddr; } else if (sctp->sctp_bound_to_all) { - V6_SET_ZERO(((ip6_t *)(mp->b_rptr))->ip6_src); + ip6h->ip6_src = ipv6_all_zeros; } } } - /* - * IP will not free this IRE if it is condemned. SCTP needs to - * free it. 
- */ - if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } - - /* Stash the conn and ire ptr info for IP */ - SCTP_STASH_IPINFO(mp, fp->ire); - return (mp); } @@ -985,8 +976,9 @@ sctp_fast_rexmit(sctp_t *sctp) iph->ipha_fragment_offset_and_flags = 0; } - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_active = fp->lastactive = lbolt64; } @@ -1280,8 +1272,9 @@ sctp_output(sctp_t *sctp, uint_t num_pkt) seglen - xtralen, ntohl(sdc->sdh_tsn), ntohs(sdc->sdh_ssn), (void *)fp, sctp->sctp_frwnd, cansend, sctp->sctp_lastack_rxd)); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* arm rto timer (if not set) */ if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); @@ -1415,8 +1408,7 @@ sctp_make_ftsn_chunk(sctp_t *sctp, sctp_faddr_t *fp, sctp_ftsn_set_t *sets, xtralen = sctp->sctp_hdr_len + sctps->sctps_wroff_xtra; else xtralen = sctp->sctp_hdr6_len + sctps->sctps_wroff_xtra; - ftsn_mp = allocb_cred(xtralen + seglen, CONN_CRED(sctp->sctp_connp), - sctp->sctp_cpid); + ftsn_mp = allocb(xtralen + seglen, BPRI_MED); if (ftsn_mp == NULL) return (NULL); ftsn_mp->b_rptr += xtralen; @@ -1804,8 +1796,9 @@ out: pkt = sctp_rexmit_packet(sctp, &meta, &mp, fp, &pkt_len); if (pkt != NULL) { ASSERT(pkt_len <= fp->sfa_pmss); - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } else { SCTP_KSTAT(sctps, sctp_ss_rexmit_failed); } @@ -2022,8 +2015,9 @@ done_bundle: sctp->sctp_rexmitting = B_TRUE; sctp->sctp_rxt_nxttsn = first_ua_tsn; sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1; - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + 
sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* * Restart the oldfp timer with exponential backoff and @@ -2305,8 +2299,9 @@ found_msg: */ iph->ipha_fragment_offset_and_flags = 0; } - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* Check and see if there is more chunk to be retransmitted. */ if (tot_wnd <= pkt_len || tot_wnd - pkt_len < fp->sfa_pmss || diff --git a/usr/src/uts/common/inet/sctp/sctp_param.c b/usr/src/uts/common/inet/sctp/sctp_param.c index 5d5ed19676..26365c5a06 100644 --- a/usr/src/uts/common/inet/sctp/sctp_param.c +++ b/usr/src/uts/common/inet/sctp/sctp_param.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stream.h> #include <sys/socket.h> #include <sys/ddi.h> @@ -72,11 +70,8 @@ /* * sctp_wroff_xtra is the extra space in front of SCTP/IP header for link * layer header. It has to be a multiple of 4. - * Also there has to be enough space to stash in information passed between - * IP and SCTP. */ -sctpparam_t lcl_sctp_wroff_xtra_param = { sizeof (conn_t *) + sizeof (ire_t *), - 256, 32, "sctp_wroff_xtra" }; +sctpparam_t lcl_sctp_wroff_xtra_param = { 0, 256, 32, "sctp_wroff_xtra" }; /* * All of these are alterable, within the min/max values given, at run time. 
@@ -343,7 +338,7 @@ sctp_nd_init(sctp_stack_t *sctps) bcopy(lcl_sctp_param_arr, pa, sizeof (lcl_sctp_param_arr)); sctps->sctps_params = pa; return (sctp_param_register(&sctps->sctps_g_nd, pa, - A_CNT(lcl_sctp_param_arr), sctps)); + A_CNT(lcl_sctp_param_arr), sctps)); } int diff --git a/usr/src/uts/common/inet/sctp/sctp_shutdown.c b/usr/src/uts/common/inet/sctp/sctp_shutdown.c index b58016eb15..ff835a60c0 100644 --- a/usr/src/uts/common/inet/sctp/sctp_shutdown.c +++ b/usr/src/uts/common/inet/sctp/sctp_shutdown.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <netinet/in.h> #include <netinet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> @@ -129,12 +130,12 @@ sctp_send_shutdown(sctp_t *sctp, int rexmit) /* Link the shutdown chunk in after the IP/SCTP header */ - sctp_set_iplen(sctp, sendmp); - BUMP_LOCAL(sctp->sctp_obchunks); /* Send the shutdown and restart the timer */ - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); done: sctp->sctp_state = SCTPS_SHUTDOWN_SENT; @@ -211,11 +212,11 @@ sctp_shutdown_received(sctp_t *sctp, sctp_chunk_hdr_t *sch, boolean_t crwsd, } } - sctp_set_iplen(sctp, samp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, samp); + sctp_set_iplen(sctp, samp, fp->ixa); + (void) conn_ip_output(samp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); dotimer: sctp->sctp_state = SCTPS_SHUTDOWN_ACK_SENT; @@ -232,7 +233,7 @@ sctp_shutdown_complete(sctp_t *sctp) sctp_chunk_hdr_t *scch; sctp_stack_t *sctps = sctp->sctp_sctps; - scmp = sctp_make_mp(sctp, NULL, sizeof (*scch)); + scmp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*scch)); if (scmp == NULL) { /* XXX use timer approach */ SCTP_KSTAT(sctps, sctp_send_shutdown_comp_failed); 
@@ -246,11 +247,11 @@ sctp_shutdown_complete(sctp_t *sctp) scmp->b_wptr += sizeof (*scch); - sctp_set_iplen(sctp, scmp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, scmp); + sctp_set_iplen(sctp, scmp, sctp->sctp_current->ixa); + (void) conn_ip_output(scmp, sctp->sctp_current->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* @@ -259,91 +260,99 @@ sctp_shutdown_complete(sctp_t *sctp) * and instead must draw all necessary info from the incoming packet. */ void -sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len) +sctp_ootb_shutdown_ack(mblk_t *mp, uint_t ip_hdr_len, ip_recv_attr_t *ira, + ip_stack_t *ipst) { boolean_t isv4; - ipha_t *inip4h; - ip6_t *inip6h; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; sctp_hdr_t *insctph; sctp_chunk_hdr_t *scch; int i; uint16_t port; mblk_t *mp1; - sctp_stack_t *sctps = gsctp->sctp_sctps; + netstack_t *ns = ipst->ips_netstack; + sctp_stack_t *sctps = ns->netstack_sctp; + ip_xmit_attr_t ixas; - isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); + bzero(&ixas, sizeof (ixas)); - /* - * The gsctp should contain the minimal IP header. So the - * incoming mblk should be able to hold the new SCTP packet. - */ - ASSERT(MBLKL(inmp) >= sizeof (*insctph) + sizeof (*scch) + - (isv4 ? gsctp->sctp_ip_hdr_len : gsctp->sctp_ip_hdr6_len)); + isv4 = (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + + ASSERT(MBLKL(mp) >= sizeof (*insctph) + sizeof (*scch) + + (isv4 ? sizeof (ipha_t) : sizeof (ip6_t))); /* * Check to see if we can reuse the incoming mblk. There should - * not be other reference and the db_base of the mblk should be - * properly aligned. Since this packet comes from below, + * not be other reference. Since this packet comes from below, * there should be enough header space to fill in what the lower - * layers want to add. And we will not stash anything there. + * layers want to add. 
*/ - if (!IS_P2ALIGNED(DB_BASE(inmp), sizeof (ire_t *)) || - DB_REF(inmp) != 1) { - mp1 = allocb(MBLKL(inmp) + sctps->sctps_wroff_xtra, BPRI_MED); + if (DB_REF(mp) != 1) { + mp1 = allocb(MBLKL(mp) + sctps->sctps_wroff_xtra, BPRI_MED); if (mp1 == NULL) { - freeb(inmp); + freeb(mp); return; } mp1->b_rptr += sctps->sctps_wroff_xtra; - mp1->b_wptr = mp1->b_rptr + MBLKL(inmp); - bcopy(inmp->b_rptr, mp1->b_rptr, MBLKL(inmp)); - freeb(inmp); - inmp = mp1; + mp1->b_wptr = mp1->b_rptr + MBLKL(mp); + bcopy(mp->b_rptr, mp1->b_rptr, MBLKL(mp)); + freeb(mp); + mp = mp1; } else { - ASSERT(DB_CKSUMFLAGS(inmp) == 0); + DB_CKSUMFLAGS(mp) = 0; } + ixas.ixa_pktlen = ip_hdr_len + sizeof (*insctph) + sizeof (*scch); + ixas.ixa_ip_hdr_length = ip_hdr_len; /* * We follow the logic in tcp_xmit_early_reset() in that we skip - * reversing source route (i.e. relpace all IP options with EOL). + * reversing source route (i.e. replace all IP options with EOL). */ if (isv4) { ipaddr_t v4addr; - inip4h = (ipha_t *)inmp->b_rptr; + ipha = (ipha_t *)mp->b_rptr; for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) - inmp->b_rptr[i] = IPOPT_EOL; + mp->b_rptr[i] = IPOPT_EOL; /* Swap addresses */ - inip4h->ipha_length = htons(ip_hdr_len + sizeof (*insctph) + - sizeof (*scch)); - v4addr = inip4h->ipha_src; - inip4h->ipha_src = inip4h->ipha_dst; - inip4h->ipha_dst = v4addr; - inip4h->ipha_ident = 0; - inip4h->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + ipha->ipha_length = htons(ixas.ixa_pktlen); + v4addr = ipha->ipha_src; + ipha->ipha_src = ipha->ipha_dst; + ipha->ipha_dst = v4addr; + ipha->ipha_ident = 0; + ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; } else { in6_addr_t v6addr; - inip6h = (ip6_t *)inmp->b_rptr; + ip6h = (ip6_t *)mp->b_rptr; /* Remove any extension headers assuming partial overlay */ if (ip_hdr_len > IPV6_HDR_LEN) { uint8_t *to; - to = inmp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; - ovbcopy(inip6h, to, IPV6_HDR_LEN); - inmp->b_rptr += 
ip_hdr_len - IPV6_HDR_LEN; + to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; + ovbcopy(ip6h, to, IPV6_HDR_LEN); + mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; ip_hdr_len = IPV6_HDR_LEN; - inip6h = (ip6_t *)inmp->b_rptr; - inip6h->ip6_nxt = IPPROTO_SCTP; + ip6h = (ip6_t *)mp->b_rptr; + ip6h->ip6_nxt = IPPROTO_SCTP; + } + ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN); + v6addr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = v6addr; + ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + ixas.ixa_scopeid = ira->ira_ruifindex; } - inip6h->ip6_plen = htons(ip_hdr_len + sizeof (*insctph) + - sizeof (*scch) - IPV6_HDR_LEN); - v6addr = inip6h->ip6_src; - inip6h->ip6_src = inip6h->ip6_dst; - inip6h->ip6_dst = v6addr; - inip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; } - insctph = (sctp_hdr_t *)(inmp->b_rptr + ip_hdr_len); + + insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len); /* Swap ports. Verification tag is reused. */ port = insctph->sh_sport; @@ -359,9 +368,29 @@ sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len) /* Set the T-bit */ SCTP_SET_TBIT(scch); - BUMP_LOCAL(gsctp->sctp_obchunks); - /* Nothing to stash... */ - SCTP_STASH_IPINFO(inmp, (ire_t *)NULL); + ixas.ixa_protocol = IPPROTO_SCTP; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that was out of the blue. + */ + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; + } + } else { + /* + * This is in clear. The message we are building + * here should go out in clear, independent of our policy. 
+ */ + ixas.ixa_flags |= IXAF_NO_IPSEC; + } - sctp_add_sendq(gsctp, inmp); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } diff --git a/usr/src/uts/common/inet/sctp/sctp_snmp.c b/usr/src/uts/common/inet/sctp/sctp_snmp.c index f859cd6ba5..f1e7deceae 100644 --- a/usr/src/uts/common/inet/sctp/sctp_snmp.c +++ b/usr/src/uts/common/inet/sctp/sctp_snmp.c @@ -78,9 +78,9 @@ sctp_kstat_update(kstat_t *kp, int rw) * individual set of statistics. */ SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0); - sctp = sctps->sctps_gsctp; sctp_prev = NULL; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned) { @@ -471,8 +471,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps) SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0); idx = 0; - sctp = sctps->sctps_gsctp; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned) { @@ -541,8 +541,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps) sctp->sctp_reassmsgs = 0; sce.sctpAssocId = ntohl(sctp->sctp_lvtag); - sce.sctpAssocLocalPort = ntohs(sctp->sctp_lport); - sce.sctpAssocRemPort = ntohs(sctp->sctp_fport); + sce.sctpAssocLocalPort = ntohs(sctp->sctp_connp->conn_lport); + sce.sctpAssocRemPort = ntohs(sctp->sctp_connp->conn_fport); RUN_SCTP(sctp); if (sctp->sctp_primary != NULL) { @@ -659,11 +659,10 @@ done: needattr = B_TRUE; break; } - if (connp->conn_fully_bound && - connp->conn_effective_cred != NULL) { + if (sctp->sctp_connp->conn_ixa->ixa_tsl != NULL) { ts_label_t *tsl; - tsl = crgetlabel(connp->conn_effective_cred); + tsl = sctp->sctp_connp->conn_ixa->ixa_tsl; mlp.tme_flags |= MIB2_TMEF_IS_LABELED; mlp.tme_doi = label2doi(tsl); mlp.tme_label = *label2bslabel(tsl); diff --git a/usr/src/uts/common/inet/sctp/sctp_stack.h b/usr/src/uts/common/inet/sctp/sctp_stack.h index d467b38a17..e9ad5cf9c7 
100644 --- a/usr/src/uts/common/inet/sctp/sctp_stack.h +++ b/usr/src/uts/common/inet/sctp/sctp_stack.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_SCTP_SCTP_STACK_H #define _INET_SCTP_SCTP_STACK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/netstack.h> #include <sys/taskq.h> @@ -76,17 +74,6 @@ struct sctp_stack { mib2_sctp_t sctps_mib; - /* Protected by sctps_g_q_lock */ - queue_t *sctps_g_q; - uint_t sctps_g_q_ref; /* Number of sctp_t's that use it */ - kmutex_t sctps_g_q_lock; - kcondvar_t sctps_g_q_cv; - kthread_t *sctps_g_q_creator; - struct __ldi_handle *sctps_g_q_lh; - cred_t *sctps_g_q_cr; /* For _inactive close call */ - /* The default sctp_t for responding out of the blue packets. */ - struct sctp_s *sctps_gsctp; - /* Protected by sctps_g_lock */ struct list sctps_g_list; /* SCTP instance data chain */ kmutex_t sctps_g_lock; diff --git a/usr/src/uts/common/inet/sctp/sctp_timer.c b/usr/src/uts/common/inet/sctp/sctp_timer.c index c6fd4a5c71..24b46ad6f0 100644 --- a/usr/src/uts/common/inet/sctp/sctp_timer.c +++ b/usr/src/uts/common/inet/sctp/sctp_timer.c @@ -220,7 +220,6 @@ sctp_timer_fire(sctp_tb_t *sctp_tb) sctp_timer_call(sctp, mp); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } SCTP_REFRELE(sctp); } @@ -429,7 +428,7 @@ sctp_heartbeat_timer(sctp_t *sctp) for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { /* * If the peer is unreachable because there is no available - * source address, call sctp_get_ire() to see if it is + * source address, call sctp_get_dest() to see if it is * reachable now. If it is OK, the state will become * unconfirmed. And the following code to handle unconfirmed * address will be executed. If it is still not OK, @@ -438,7 +437,7 @@ sctp_heartbeat_timer(sctp_t *sctp) * is disable, this retry may go on forever. 
*/ if (fp->state == SCTP_FADDRS_UNREACH) { - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { if (fp->hb_enabled && ++fp->strikes > fp->max_retr && @@ -642,15 +641,14 @@ rxmit_init: * address list won't be modified (it would have been done * the first time around). */ - mp = sctp_init_mp(sctp); + mp = sctp_init_mp(sctp, fp); if (mp != NULL) { BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans); - sctp_add_sendq(sctp, mp); + (void) conn_ip_output(mp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } break; - case SCTPS_COOKIE_ECHOED: { - ipha_t *iph; - + case SCTPS_COOKIE_ECHOED: BUMP_LOCAL(sctp->sctp_T1expire); if (sctp->sctp_cookie_mp == NULL) { sctp->sctp_state = SCTPS_COOKIE_WAIT; @@ -659,14 +657,10 @@ rxmit_init: mp = dupmsg(sctp->sctp_cookie_mp); if (mp == NULL) break; - iph = (ipha_t *)mp->b_rptr; - /* Reset the IP ident. */ - if (IPH_HDR_VERSION(iph) == IPV4_VERSION) - iph->ipha_ident = 0; - sctp_add_sendq(sctp, mp); + (void) conn_ip_output(mp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans); break; - } case SCTPS_SHUTDOWN_SENT: BUMP_LOCAL(sctp->sctp_T2expire); sctp_send_shutdown(sctp, 1); diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h index 7b20d3fd2b..9e4c2ef7ec 100644 --- a/usr/src/uts/common/inet/sctp_ip.h +++ b/usr/src/uts/common/inet/sctp_ip.h @@ -35,40 +35,24 @@ extern "C" { #define SCTP_COMMON_HDR_LENGTH 12 /* SCTP common header length */ /* SCTP routines for IP to call. 
*/ -extern void ip_fanout_sctp(mblk_t *, ill_t *, ipha_t *, uint32_t, - uint_t, boolean_t, boolean_t, zoneid_t); +extern void ip_fanout_sctp(mblk_t *, ipha_t *, ip6_t *, uint32_t, + ip_recv_attr_t *); extern void sctp_ddi_g_init(void); extern void sctp_ddi_g_destroy(void); extern conn_t *sctp_find_conn(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, sctp_stack_t *); + zoneid_t, iaflags_t, sctp_stack_t *); extern conn_t *sctp_fanout(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, mblk_t *, sctp_stack_t *); + ip_recv_attr_t *, mblk_t *, sctp_stack_t *); -extern void sctp_input(conn_t *, ipha_t *, mblk_t *, mblk_t *, ill_t *, - boolean_t, boolean_t); +extern void sctp_input(conn_t *, ipha_t *, ip6_t *, mblk_t *, ip_recv_attr_t *); extern void sctp_wput(queue_t *, mblk_t *); -extern void sctp_ootb_input(mblk_t *, ill_t *, zoneid_t, boolean_t); +extern void sctp_ootb_input(mblk_t *, ip_recv_attr_t *, ip_stack_t *); extern void sctp_hash_init(sctp_stack_t *); extern void sctp_hash_destroy(sctp_stack_t *); extern uint32_t sctp_cksum(mblk_t *, int); extern mblk_t *sctp_snmp_get_mib2(queue_t *, mblk_t *, sctp_stack_t *); extern void sctp_free(conn_t *); -#define SCTP_STASH_IPINFO(mp, ire) \ -{ \ - unsigned char *stp; \ - stp = DB_BASE((mp)); \ - ASSERT(stp + sizeof (ire_t *) < (mp)->b_rptr); \ - *(ire_t **)stp = (ire); \ -} - -#define SCTP_EXTRACT_IPINFO(mp, ire) \ -{ \ - unsigned char *stp; \ - stp = (mp)->b_datap->db_base; \ - (ire) = *(ire_t **)stp; \ -} - /* * SCTP maintains a list of ILLs/IPIFs, these functions are provided by * SCTP to keep its interface list up to date. @@ -87,16 +71,8 @@ extern void sctp_ill_reindex(ill_t *, uint_t); #define SCTP_IPIF_UPDATE 6 /* IP routines for SCTP to call. */ -extern void ip_fanout_sctp_raw(mblk_t *, ill_t *, ipha_t *, boolean_t, - uint32_t, boolean_t, uint_t, boolean_t, zoneid_t); -extern void sctp_ire_cache_flush(ipif_t *); - -/* - * Private (and possibly temporary) ioctls. 
It is a large number - * to avoid conflict with other ioctls, which are normally smaller - * than 2^16. - */ -#define SCTP_IOC_DEFAULT_Q (('S' << 16) | 1024) +extern void ip_fanout_sctp_raw(mblk_t *, ipha_t *, ip6_t *, uint32_t, + ip_recv_attr_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/sctp_itf.h b/usr/src/uts/common/inet/sctp_itf.h index 9ce69fdaf0..2ae6d3669f 100644 --- a/usr/src/uts/common/inet/sctp_itf.h +++ b/usr/src/uts/common/inet/sctp_itf.h @@ -83,9 +83,9 @@ extern int sctp_bindx(struct sctp_s *conn, const void *addrs, int addrcnt, int flags); extern void sctp_close(struct sctp_s *conn); extern int sctp_connect(struct sctp_s *conn, const struct sockaddr *dst, - socklen_t addrlen); + socklen_t addrlen, cred_t *cr, pid_t pid); extern struct sctp_s *sctp_create(void *newhandle, struct sctp_s *parent, - int family, int flags, struct sock_upcalls_s *su, + int family, int type, int flags, struct sock_upcalls_s *su, sctp_sockbuf_limits_t *sbl, cred_t *cr); extern int sctp_disconnect(struct sctp_s *conn); extern int sctp_get_opt(struct sctp_s *conn, int level, int opt, void *opts, diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c index 7da9f92dde..4df7e33501 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.c +++ b/usr/src/uts/common/inet/sockmods/socksctp.c @@ -207,7 +207,7 @@ sosctp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags) upcalls = &sosctp_assoc_upcalls; } so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL, - so->so_family, SCTP_CAN_BLOCK, upcalls, &sbl, cr); + so->so_family, so->so_type, SCTP_CAN_BLOCK, upcalls, &sbl, cr); if (so->so_proto_handle == NULL) return (ENOMEM); @@ -350,6 +350,7 @@ sosctp_connect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { int error = 0; + pid_t pid = curproc->p_pid; ASSERT(so->so_type == SOCK_STREAM); @@ -404,7 +405,7 @@ sosctp_connect(struct sonode 
*so, const struct sockaddr *name, mutex_exit(&so->so_lock); error = sctp_connect((struct sctp_s *)so->so_proto_handle, - name, namelen); + name, namelen, cr, pid); mutex_enter(&so->so_lock); if (error == 0) { @@ -662,7 +663,7 @@ done: int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, - struct uio *uiop, int flags, cred_t *cr) + struct uio *uiop, int flags) { ssize_t size; int error; @@ -683,8 +684,7 @@ sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, * packets, each mblk will have the extra space before * data to accommodate what SCTP wants to put in there. */ - while ((mp = allocb_cred(size + wroff, cr, - curproc->p_pid)) == NULL) { + while ((mp = allocb(size + wroff, BPRI_MED)) == NULL) { if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || (flags & MSG_DONTWAIT)) { return (EAGAIN); @@ -887,7 +887,7 @@ sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, /* Copy in the message. */ if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff, - uiop, flags, cr)) != 0) { + uiop, flags)) != 0) { goto error_ret; } error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0); @@ -1091,7 +1091,7 @@ sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, /* Copy in the message. 
*/ if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize, - ssa->ssa_wroff, uiop, flags, cr)) != 0) { + ssa->ssa_wroff, uiop, flags)) != 0) { goto lock_rele; } error = sctp_sendmsg((struct sctp_s *)ssa->ssa_conn, mctl, 0); diff --git a/usr/src/uts/common/inet/sockmods/socksctp.h b/usr/src/uts/common/inet/sockmods/socksctp.h index b02622c994..2ac7058821 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.h +++ b/usr/src/uts/common/inet/sockmods/socksctp.h @@ -116,7 +116,7 @@ extern void sosctp_assoc_isdisconnected(struct sctp_soassoc *ssa, int error); extern int sosctp_waitconnected(struct sonode *so, int fmode); extern int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, - int wroff, struct uio *uiop, int flags, cred_t *cr); + int wroff, struct uio *uiop, int flags); /* * Data structure types. diff --git a/usr/src/uts/common/inet/sockmods/socksctpsubr.c b/usr/src/uts/common/inet/sockmods/socksctpsubr.c index 4a4cb08007..a647cbe4f2 100644 --- a/usr/src/uts/common/inet/sockmods/socksctpsubr.c +++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c @@ -367,6 +367,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, sctp_assoc_t id; int error; struct cmsghdr *cmsg; + pid_t pid = curproc->p_pid; ASSERT(MUTEX_HELD(&so->so_lock)); @@ -407,7 +408,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, ssa->ssa_wroff = ss->ss_wroff; ssa->ssa_wrsize = ss->ss_wrsize; ssa->ssa_conn = sctp_create(ssa, (struct sctp_s *)so->so_proto_handle, - so->so_family, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, cr); + so->so_family, so->so_type, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, + &sbl, cr); mutex_enter(&so->so_lock); ss->ss_assocs[id].ssi_assoc = ssa; @@ -435,7 +437,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, goto ret_err; } - if ((error = sctp_connect(ssa->ssa_conn, name, namelen)) != 0) + if ((error = sctp_connect(ssa->ssa_conn, name, namelen, cr, pid)) != 0) goto 
ret_err; mutex_enter(&so->so_lock); diff --git a/usr/src/uts/common/inet/spdsock.h b/usr/src/uts/common/inet/spdsock.h index 7622e56a45..64c63cdd71 100644 --- a/usr/src/uts/common/inet/spdsock.h +++ b/usr/src/uts/common/inet/spdsock.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -110,7 +110,7 @@ extern uint_t spdsock_max_optsize; extern int spdsock_opt_get(queue_t *, int, int, uchar_t *); extern int spdsock_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index e46293d820..db11ef79ae 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -39,8 +39,8 @@ * parallelization (on a per H/W execution pipeline basis) with at * most one queuing. * - * The modules needing protection typically calls squeue_enter() or - * squeue_enter_chain() routine as soon as a thread enter the module + * The modules needing protection typically calls SQUEUE_ENTER_ONE() or + * SQUEUE_ENTER() macro as soon as a thread enter the module * from either direction. For each packet, the processing function * and argument is stored in the mblk itself. When the packet is ready * to be processed, the squeue retrieves the stored function and calls @@ -406,11 +406,15 @@ squeue_worker_wakeup(squeue_t *sqp) * and drain in the entering thread context. If process_flag is * SQ_FILL, then we just queue the mblk and return (after signaling * the worker thread if no one else is processing the squeue). + * + * The ira argument can be used when the count is one. + * For a chain the caller needs to prepend any needed mblks from + * ip_recv_attr_to_mblk(). 
*/ /* ARGSUSED */ void squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, - int process_flag, uint8_t tag) + ip_recv_attr_t *ira, int process_flag, uint8_t tag) { conn_t *connp; sqproc_t proc; @@ -421,6 +425,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, ASSERT(tail != NULL); ASSERT(cnt > 0); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); + ASSERT(ira == NULL || cnt == 1); mutex_enter(&sqp->sq_lock); @@ -467,7 +472,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; @@ -475,7 +480,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, - connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); @@ -499,6 +504,33 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { + if (ira != NULL) { + mblk_t *attrmp; + + ASSERT(cnt == 1); + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + mutex_exit(&sqp->sq_lock); + ip_drop_input("squeue: " + "ip_recv_attr_to_mblk", + mp, NULL); + /* Caller already set b_prev/b_next */ + mp->b_prev = mp->b_next = NULL; + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + /* Move connp and func to new */ + attrmp->b_queue = mp->b_queue; + mp->b_queue = NULL; + attrmp->b_prev = mp->b_prev; + mp->b_prev = NULL; + + ASSERT(mp == tail); + tail = mp = attrmp; + } + ENQUEUE_CHAIN(sqp, mp, tail, cnt); #ifdef DEBUG mp->b_tag = tag; @@ -564,14 +596,14 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, connp->conn_on_sqp = B_TRUE; 
DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, - connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } mutex_enter(&sqp->sq_lock); @@ -589,7 +621,31 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif + if (ira != NULL) { + mblk_t *attrmp; + ASSERT(cnt == 1); + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + mutex_exit(&sqp->sq_lock); + ip_drop_input("squeue: ip_recv_attr_to_mblk", + mp, NULL); + /* Caller already set b_prev/b_next */ + mp->b_prev = mp->b_next = NULL; + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + /* Move connp and func to new */ + attrmp->b_queue = mp->b_queue; + mp->b_queue = NULL; + attrmp->b_prev = mp->b_prev; + mp->b_prev = NULL; + + ASSERT(mp == tail); + tail = mp = attrmp; + } ENQUEUE_CHAIN(sqp, mp, tail, cnt); if (!(sqp->sq_state & SQS_PROC)) { squeue_worker_wakeup(sqp); @@ -653,6 +709,7 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) hrtime_t now; boolean_t did_wakeup = B_FALSE; boolean_t sq_poll_capable; + ip_recv_attr_t *ira, iras; sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; again: @@ -697,6 +754,31 @@ again: connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; + /* Is there an ip_recv_attr_t to handle? 
*/ + if (ip_recv_attr_is_mblk(mp)) { + mblk_t *attrmp = mp; + + ASSERT(attrmp->b_cont != NULL); + + mp = attrmp->b_cont; + attrmp->b_cont = NULL; + ASSERT(mp->b_queue == NULL); + ASSERT(mp->b_prev == NULL); + + if (!ip_recv_attr_from_mblk(attrmp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", + mp, NULL); + ira_cleanup(&iras, B_TRUE); + CONN_DEC_REF(connp); + continue; + } + ira = &iras; + } else { + ira = NULL; + } + + /* * Handle squeue switching. More details in the * block comment at the top of the file @@ -707,15 +789,17 @@ again: connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; CONN_DEC_REF(connp); } else { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } + if (ira != NULL) + ira_cleanup(ira, B_TRUE); } SQUEUE_DBG_CLEAR(sqp); @@ -991,9 +1075,13 @@ poll_again: &tail, &cnt); } mutex_enter(lock); - if (mp != NULL) + if (mp != NULL) { + /* + * The ip_accept function has already added an + * ip_recv_attr_t mblk if that is needed. 
+ */ ENQUEUE_CHAIN(sqp, mp, tail, cnt); - + } ASSERT((sqp->sq_state & (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); @@ -1263,7 +1351,7 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p) /* ARGSUSED */ void -squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2) +squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; squeue_t *sqp = connp->conn_sqp; diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 8442c4f384..321d0756fc 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -36,7 +36,6 @@ extern "C" { #include <netinet/tcp.h> #include <sys/socket.h> #include <sys/socket_proto.h> -#include <sys/multidata.h> #include <sys/md5.h> #include <inet/common.h> #include <inet/ip.h> @@ -47,12 +46,6 @@ extern "C" { #include <inet/tcp_sack.h> #include <inet/kssl/ksslapi.h> -/* - * Private (and possibly temporary) ioctl used by configuration code - * to lock in the "default" stream for detached closes. - */ -#define TCP_IOC_DEFAULT_Q (('T' << 8) + 51) - /* TCP states */ #define TCPS_CLOSED -6 #define TCPS_IDLE -5 /* idle (opened, but not bound) */ @@ -73,7 +66,7 @@ extern "C" { /* * Internal flags used in conjunction with the packet header flags. - * Used in tcp_rput_data to keep track of what needs to be done. + * Used in tcp_input_data to keep track of what needs to be done. 
*/ #define TH_LIMIT_XMIT 0x0400 /* Limited xmit is needed */ #define TH_XMIT_NEEDED 0x0800 /* Window opened - send queued data */ @@ -108,11 +101,12 @@ typedef struct tcphdr_s { uint8_t th_urp[2]; /* Urgent pointer */ } tcph_t; -#define TCP_HDR_LENGTH(tcph) (((tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2)) +#define TCP_HDR_LENGTH(tcph) \ + ((((tcph_t *)tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2)) #define TCP_MAX_COMBINED_HEADER_LENGTH (60 + 60) /* Maxed out ip + tcp */ #define TCP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH) #define TCP_MAX_HDR_LENGTH 60 -#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcph_t)) +#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcpha_t)) #define TCP_MIN_HEADER_LENGTH 20 #define TCP_MAXWIN 65535 #define TCP_PORT_LEN sizeof (in_port_t) @@ -122,7 +116,7 @@ typedef struct tcphdr_s { #define TCPIP_HDR_LENGTH(mp, n) \ (n) = IPH_HDR_LENGTH((mp)->b_rptr), \ - (n) += TCP_HDR_LENGTH((tcph_t *)&(mp)->b_rptr[(n)]) + (n) += TCP_HDR_LENGTH((tcpha_t *)&(mp)->b_rptr[(n)]) /* TCP Protocol header (used if the header is known to be 32-bit aligned) */ typedef struct tcphdra_s { @@ -173,9 +167,6 @@ typedef struct tcp_s { uint32_t tcp_rnxt; /* Seq we expect to recv next */ uint32_t tcp_rwnd; - queue_t *tcp_rq; /* Our upstream neighbor (client) */ - queue_t *tcp_wq; /* Our downstream neighbor */ - /* Fields arranged in approximate access order along main paths */ mblk_t *tcp_xmit_head; /* Head of rexmit list */ mblk_t *tcp_xmit_last; /* last valid data seen by tcp_wput */ @@ -207,46 +198,16 @@ typedef struct tcp_s { int64_t tcp_last_recv_time; /* Last time we receive a segment. */ uint32_t tcp_init_cwnd; /* Initial cwnd (start/restart) */ - /* - * Following socket options are set by sockfs outside the squeue - * and we want to separate these bit fields from the other bit fields - * set by TCP to avoid grabbing locks. sockfs ensures that only one - * thread in sockfs can set a socket option at a time on a conn_t. 
- * However TCP may read these options concurrently. The linger option - * needs atomicity since tcp_lingertime also needs to be in sync. - * However TCP uses it only during close, and by then no socket option - * can come down. So we don't need any locks, instead just separating - * the sockfs settable bit fields from the other bit fields is - * sufficient. - */ - uint32_t - tcp_debug : 1, /* SO_DEBUG "socket" option. */ - tcp_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - tcp_broadcast : 1, /* SO_BROADCAST "socket" option. */ - tcp_useloopback : 1, /* SO_USELOOPBACK "socket" option. */ - - tcp_oobinline : 1, /* SO_OOBINLINE "socket" option. */ - tcp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - tcp_linger : 1, /* SO_LINGER turned on */ - tcp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - - tcp_junk_to_bit_31 : 24; - /* Following manipulated by TCP under squeue protection */ uint32_t tcp_urp_last_valid : 1, /* Is tcp_urp_last valid? */ - tcp_hard_binding : 1, /* If we've started a full bind */ - tcp_hard_bound : 1, /* If we've done a full bind with IP */ + tcp_hard_binding : 1, /* TCP_DETACHED_NONEAGER */ tcp_fin_acked : 1, /* Has our FIN been acked? */ - tcp_fin_rcvd : 1, /* Have we seen a FIN? */ + tcp_fin_sent : 1, /* Have we sent our FIN yet? */ tcp_ordrel_done : 1, /* Have we sent the ord_rel upstream? 
*/ tcp_detached : 1, /* If we're detached from a stream */ - - tcp_bind_pending : 1, /* Client is waiting for bind ack */ - tcp_unbind_pending : 1, /* Client sent T_UNBIND_REQ */ - tcp_ka_enabled: 1, /* Connection KeepAlive Timer needed */ tcp_zero_win_probe: 1, /* Zero win probing is in progress */ tcp_loopback: 1, /* src and dst are the same machine */ @@ -258,44 +219,40 @@ typedef struct tcp_s { tcp_active_open: 1, /* This is a active open */ tcp_rexmit : 1, /* TCP is retransmitting */ tcp_snd_sack_ok : 1, /* Can use SACK for this connection */ - tcp_empty_flag : 1, /* Empty flag for future use */ - - tcp_recvdstaddr : 1, /* return T_EXTCONN_IND with dst addr */ tcp_hwcksum : 1, /* The NIC is capable of hwcksum */ - tcp_ip_forward_progress : 1, - tcp_anon_priv_bind : 1, + tcp_ip_forward_progress : 1, tcp_ecn_ok : 1, /* Can use ECN for this connection */ tcp_ecn_echo_on : 1, /* Need to do ECN echo */ tcp_ecn_cwr_sent : 1, /* ECN_CWR has been sent */ + tcp_cwr : 1, /* Cwnd has reduced recently */ - tcp_pad_to_bit31 : 4; + tcp_pad_to_bit31 : 11; + /* Following manipulated by TCP under squeue protection */ uint32_t - tcp_mdt : 1, /* Lower layer is capable of MDT */ tcp_snd_ts_ok : 1, tcp_snd_ws_ok : 1, - tcp_exclbind : 1, /* ``exclusive'' binding */ - - tcp_hdr_grown : 1, + tcp_reserved_port : 1, tcp_in_free_list : 1, - tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */ + tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */ tcp_snd_zcopy_aware : 1, /* client is zero-copy aware */ tcp_xmit_zc_clean : 1, /* the xmit list is free of zc-mblk */ tcp_wait_for_eagers : 1, /* Wait for eagers to disappear */ - tcp_accept_error : 1, /* Error during TLI accept */ + tcp_accept_error : 1, /* Error during TLI accept */ tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */ tcp_cork : 1, /* tcp_cork option */ tcp_tconnind_started : 1, /* conn_ind message is being sent */ + tcp_lso :1, /* Lower layer is capable of LSO */ - tcp_refuse :1, /* Connection needs refusing */ 
tcp_is_wnd_shrnk : 1, /* Window has shrunk */ - tcp_pad_to_bit_31 : 15; - uint32_t tcp_if_mtu; /* Outgoing interface MTU. */ + tcp_pad_to_bit_31 : 18; + + uint32_t tcp_initial_pmtu; /* Initial outgoing Path MTU. */ mblk_t *tcp_reass_head; /* Out of order reassembly list head */ mblk_t *tcp_reass_tail; /* Out of order reassembly list tail */ @@ -340,11 +297,6 @@ typedef struct tcp_s { struct tcp_s *tcp_listener; /* Our listener */ - size_t tcp_xmit_hiwater; /* Send buffer high water mark. */ - size_t tcp_xmit_lowater; /* Send buffer low water mark. */ - size_t tcp_recv_hiwater; /* Recv high water mark */ - size_t tcp_recv_lowater; /* Recv low water mark */ - uint32_t tcp_irs; /* Initial recv seq num */ uint32_t tcp_fss; /* Final/fin send seq num */ uint32_t tcp_urg; /* Urgent data seq num */ @@ -354,8 +306,6 @@ typedef struct tcp_s { clock_t tcp_first_ctimer_threshold; /* 1st threshold while connecting */ clock_t tcp_second_ctimer_threshold; /* 2nd ... while connecting */ - int tcp_lingertime; /* Close linger time (in seconds) */ - uint32_t tcp_urp_last; /* Last urp for which signal sent */ mblk_t *tcp_urp_mp; /* T_EXDATA_IND for urgent byte */ mblk_t *tcp_urp_mark_mp; /* zero-length marked/unmarked msg */ @@ -389,21 +339,14 @@ typedef struct tcp_s { int32_t tcp_client_errno; /* How the client screwed up */ - char *tcp_iphc; /* Buffer holding tcp/ip hdr template */ - int tcp_iphc_len; /* actual allocated buffer size */ - int32_t tcp_hdr_len; /* Byte len of combined TCP/IP hdr */ - ipha_t *tcp_ipha; /* IPv4 header in the buffer */ - ip6_t *tcp_ip6h; /* IPv6 header in the buffer */ - int tcp_ip_hdr_len; /* Byte len of our current IPvx hdr */ - tcph_t *tcp_tcph; /* tcp header within combined hdr */ - int32_t tcp_tcp_hdr_len; /* tcp header len within combined */ - /* Saved peer headers in the case of re-fusion */ - ipha_t tcp_saved_ipha; - ip6_t tcp_saved_ip6h; - tcph_t tcp_saved_tcph; - - uint32_t tcp_sum; /* checksum to compensate for source */ - /* routed packets. 
Host byte order */ + /* + * The header template lives in conn_ht_iphc allocated by tcp_build_hdrs + * We maintain three pointers into conn_ht_iphc. + */ + ipha_t *tcp_ipha; /* IPv4 header in conn_ht_iphc */ + ip6_t *tcp_ip6h; /* IPv6 header in conn_ht_iphc */ + tcpha_t *tcp_tcpha; /* TCP header in conn_ht_iphc */ + uint16_t tcp_last_sent_len; /* Record length for nagle */ uint16_t tcp_dupack_cnt; /* # of consequtive duplicate acks */ @@ -413,75 +356,20 @@ typedef struct tcp_s { t_uscalar_t tcp_acceptor_id; /* ACCEPTOR_id */ int tcp_ipsec_overhead; - /* - * Address family that app wishes returned addrsses to be in. - * Currently taken from address family used in T_BIND_REQ, but - * should really come from family used in original socket() call. - * Value can be AF_INET or AF_INET6. - */ - uint_t tcp_family; - /* - * used for a quick test to determine if any ancillary bits are - * set - */ - uint_t tcp_ipv6_recvancillary; /* Flags */ -#define TCP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO option */ -#define TCP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT option */ -#define TCP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS option */ -#define TCP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS option */ -#define TCP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR option */ -#define TCP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS option */ -#define TCP_IPV6_RECVTCLASS 0x40 /* IPV6_RECVTCLASS option */ -#define TCP_OLD_IPV6_RECVDSTOPTS 0x80 /* old IPV6_RECVDSTOPTS option */ uint_t tcp_recvifindex; /* Last received IPV6_RCVPKTINFO */ uint_t tcp_recvhops; /* Last received IPV6_RECVHOPLIMIT */ uint_t tcp_recvtclass; /* Last received IPV6_RECVTCLASS */ ip6_hbh_t *tcp_hopopts; /* Last received IPV6_RECVHOPOPTS */ ip6_dest_t *tcp_dstopts; /* Last received IPV6_RECVDSTOPTS */ - ip6_dest_t *tcp_rtdstopts; /* Last recvd IPV6_RECVRTHDRDSTOPTS */ + ip6_dest_t *tcp_rthdrdstopts; /* Last recv IPV6_RECVRTHDRDSTOPTS */ ip6_rthdr_t *tcp_rthdr; /* Last received IPV6_RECVRTHDR */ uint_t tcp_hopoptslen; 
uint_t tcp_dstoptslen; - uint_t tcp_rtdstoptslen; + uint_t tcp_rthdrdstoptslen; uint_t tcp_rthdrlen; mblk_t *tcp_timercache; - cred_t *tcp_cred; /* Credentials when this was opened */ - pid_t tcp_cpid; /* Process id when this was opened */ - uint64_t tcp_open_time; /* time when this was opened */ - - - union { - struct { - uchar_t v4_ttl; - /* Dup of tcp_ipha.iph_type_of_service */ - uchar_t v4_tos; /* Dup of tcp_ipha.iph_ttl */ - } v4_hdr_info; - struct { - uint_t v6_vcf; /* Dup of tcp_ip6h.ip6h_vcf */ - uchar_t v6_hops; /* Dup of tcp_ip6h.ip6h_hops */ - } v6_hdr_info; - } tcp_hdr_info; -#define tcp_ttl tcp_hdr_info.v4_hdr_info.v4_ttl -#define tcp_tos tcp_hdr_info.v4_hdr_info.v4_tos -#define tcp_ip6_vcf tcp_hdr_info.v6_hdr_info.v6_vcf -#define tcp_ip6_hops tcp_hdr_info.v6_hdr_info.v6_hops - - ushort_t tcp_ipversion; - uint_t tcp_bound_if; /* IPV6_BOUND_IF */ - -#define tcp_lport tcp_connp->conn_lport -#define tcp_fport tcp_connp->conn_fport -#define tcp_ports tcp_connp->conn_ports - -#define tcp_remote tcp_connp->conn_rem -#define tcp_ip_src tcp_connp->conn_src - -#define tcp_remote_v6 tcp_connp->conn_remv6 -#define tcp_ip_src_v6 tcp_connp->conn_srcv6 -#define tcp_bound_source_v6 tcp_connp->conn_bound_source_v6 -#define tcp_bound_source tcp_connp->conn_bound_source kmutex_t tcp_closelock; kcondvar_t tcp_closecv; @@ -497,36 +385,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; - boolean_t tcp_ire_ill_check_done; - uint_t tcp_maxpsz; - - /* - * used for Multidata Transmit - */ - uint_t tcp_mdt_hdr_head; /* leading header fragment extra space */ - uint_t tcp_mdt_hdr_tail; /* trailing header fragment extra space */ - int tcp_mdt_max_pld; /* maximum payload buffers per Multidata */ + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ uint32_t tcp_ofo_fin_seq; /* Recv out of order FIN seq num */ uint32_t tcp_cwr_snd_max; - uint_t tcp_drop_opt_ack_cnt; /* # tcp 
generated optmgmt */ - ip6_pkt_t tcp_sticky_ipp; /* Sticky options */ -#define tcp_ipp_fields tcp_sticky_ipp.ipp_fields /* valid fields */ -#define tcp_ipp_ifindex tcp_sticky_ipp.ipp_ifindex /* pktinfo ifindex */ -#define tcp_ipp_addr tcp_sticky_ipp.ipp_addr /* pktinfo src/dst addr */ -#define tcp_ipp_hoplimit tcp_sticky_ipp.ipp_hoplimit -#define tcp_ipp_hopoptslen tcp_sticky_ipp.ipp_hopoptslen -#define tcp_ipp_rtdstoptslen tcp_sticky_ipp.ipp_rtdstoptslen -#define tcp_ipp_rthdrlen tcp_sticky_ipp.ipp_rthdrlen -#define tcp_ipp_dstoptslen tcp_sticky_ipp.ipp_dstoptslen -#define tcp_ipp_hopopts tcp_sticky_ipp.ipp_hopopts -#define tcp_ipp_rtdstopts tcp_sticky_ipp.ipp_rtdstopts -#define tcp_ipp_rthdr tcp_sticky_ipp.ipp_rthdr -#define tcp_ipp_dstopts tcp_sticky_ipp.ipp_dstopts -#define tcp_ipp_nexthop tcp_sticky_ipp.ipp_nexthop -#define tcp_ipp_use_min_mtu tcp_sticky_ipp.ipp_use_min_mtu + struct tcp_s *tcp_saved_listener; /* saved value of listener */ uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */ @@ -562,7 +427,6 @@ typedef struct tcp_s { boolean_t tcp_kssl_inhandshake; /* during SSL handshake */ kssl_ent_t tcp_kssl_ent; /* SSL table entry */ kssl_ctx_t tcp_kssl_ctx; /* SSL session */ - uint_t tcp_label_len; /* length of cached label */ /* * tcp_closemp_used is protected by listener's tcp_eager_lock @@ -620,47 +484,17 @@ typedef struct tcp_s { #define TCP_DEBUG_GETPCSTACK(buffer, depth) #endif -/* - * Track a reference count on the tcps in order to know when - * the tcps_g_q can be removed. As long as there is any - * tcp_t, other that the tcps_g_q itself, in the tcp_stack_t we - * need to keep tcps_g_q around so that a closing connection can - * switch to using tcps_g_q as part of it closing. - */ -#define TCPS_REFHOLD(tcps) { \ - atomic_add_32(&(tcps)->tcps_refcnt, 1); \ - ASSERT((tcps)->tcps_refcnt != 0); \ - DTRACE_PROBE1(tcps__refhold, tcp_stack_t, tcps); \ -} - -/* - * Decrement the reference count on the tcp_stack_t. 
- * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - */ -#define TCPS_REFRELE(tcps) { \ - ASSERT((tcps)->tcps_refcnt != 0); \ - membar_exit(); \ - DTRACE_PROBE1(tcps__refrele, tcp_stack_t, tcps); \ - if (atomic_add_32_nv(&(tcps)->tcps_refcnt, -1) == 0 && \ - (tcps)->tcps_g_q != NULL) { \ - /* Only tcps_g_q left */ \ - tcp_g_q_inactive(tcps); \ - } \ -} - extern void tcp_free(tcp_t *tcp); extern void tcp_ddi_g_init(void); extern void tcp_ddi_g_destroy(void); -extern void tcp_g_q_inactive(tcp_stack_t *); -extern void tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, - zoneid_t zoneid, tcp_stack_t *, conn_t *connp); -extern void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); -extern void tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2); -extern void tcp_input(void *arg, mblk_t *mp, void *arg2); -extern void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); +extern void tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *, + ip_stack_t *, conn_t *); +extern void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +extern void tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +extern void tcp_input_data(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); extern void *tcp_get_conn(void *arg, tcp_stack_t *); extern void tcp_time_wait_collector(void *arg); extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *); @@ -668,7 +502,6 @@ extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len); extern mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); -extern void tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2); /* * The TCP Fanout structure. 
@@ -706,6 +539,15 @@ typedef struct cl_tcp_info_s { } cl_tcp_info_t; /* + * Hook functions to enable cluster networking + * On non-clustered systems these vectors must always be NULL. + */ +extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, void *); +extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, void *); + +/* * Contracted Consolidation Private ioctl for aborting TCP connections. * In order to keep the offsets and size of the structure the same between * a 32-bit application and a 64-bit amd64 kernel, we use a #pragma @@ -729,25 +571,6 @@ typedef struct tcp_ioc_abort_conn_s { #pragma pack() #endif -#if (defined(_KERNEL) || defined(_KMEMUSER)) -extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp); -#endif - -#if (defined(_KERNEL)) -#define TCP_XRE_EVENT_IP_FANOUT_TCP 1 - -/* - * This is a private structure used to pass data to an squeue function during - * tcp's listener reset sending path. - */ -typedef struct tcp_xmit_reset_event { - int tcp_xre_event; - int tcp_xre_iphdrlen; - zoneid_t tcp_xre_zoneid; - tcp_stack_t *tcp_xre_tcps; -} tcp_xmit_reset_event_t; -#endif - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index c9a941eab2..0e1ef43cfb 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -46,8 +46,6 @@ #include <sys/ethernet.h> #include <sys/cpuvar.h> #include <sys/dlpi.h> -#include <sys/multidata.h> -#include <sys/multidata_impl.h> #include <sys/pattr.h> #include <sys/policy.h> #include <sys/priv.h> @@ -87,7 +85,6 @@ #include <inet/tcp_impl.h> #include <inet/udp_impl.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/ipdrop.h> #include <inet/ipclassifier.h> @@ -95,6 +92,7 @@ #include <inet/ip_ftable.h> #include <inet/ip_if.h> #include <inet/ipp_common.h> +#include <inet/ip_rts.h> #include <inet/ip_netinfo.h> #include <sys/squeue_impl.h> #include 
<sys/squeue.h> @@ -111,7 +109,7 @@ * * The entire tcp state is contained in tcp_t and conn_t structure * which are allocated in tandem using ipcl_conn_create() and passing - * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect + * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect * the references on the tcp_t. The tcp_t structure is never compressed * and packets always land on the correct TCP perimeter from the time * eager is created till the time tcp_t dies (as such the old mentat @@ -172,8 +170,8 @@ * * This is a more interesting case because of various races involved in * establishing a eager in its own perimeter. Read the meta comment on - * top of tcp_conn_request(). But briefly, the squeue is picked by - * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU. + * top of tcp_input_listener(). But briefly, the squeue is picked by + * ip_fanout based on the ring or the sender (if loopback). * * Closing a connection: * @@ -198,20 +196,13 @@ * * Special provisions and fast paths: * - * We make special provision for (AF_INET, SOCK_STREAM) sockets which - * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP - * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles - * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY - * check to send packets directly to tcp_rput_data via squeue. Everyone - * else comes through tcp_input() on the read side. - * - * We also make special provisions for sockfs by marking tcp_issocket + * We make special provisions for sockfs by marking tcp_issocket * whenever we have only sockfs on top of TCP. This allows us to skip * putting the tcp in acceptor hash since a sockfs listener can never * become acceptor and also avoid allocating a tcp_t for acceptor STREAM * since eager has already been allocated and the accept now happens * on acceptor STREAM. There is a big blob of comment on top of - * tcp_conn_request explaining the new accept. 
When socket is POP'd, + * tcp_input_listener explaining the new accept. When socket is POP'd, * sockfs sends us an ioctl to mark the fact and we go back to old * behaviour. Once tcp_issocket is unset, its never set for the * life of that connection. @@ -224,13 +215,6 @@ * only exception is tcp_xmit_listeners_reset() which is called * directly from IP and needs to policy check to see if TH_RST * can be sent out. - * - * PFHooks notes : - * - * For mdt case, one meta buffer contains multiple packets. Mblks for every - * packet are assembled and passed to the hooks. When packets are blocked, - * or boundary of any packet is changed, the mdt processing is stopped, and - * packets of the meta buffer are send to the IP path one by one. */ /* @@ -244,7 +228,7 @@ int tcp_squeue_flag; /* * This controls how tiny a write must be before we try to copy it - * into the the mblk on the tail of the transmit queue. Not much + * into the mblk on the tail of the transmit queue. Not much * speedup is observed for values larger than sixteen. Zero will * disable the optimisation. */ @@ -333,16 +317,6 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; tcp_g_stat_t tcp_g_statistics; kstat_t *tcp_g_kstat; -/* - * Call either ip_output or ip_output_v6. This replaces putnext() calls on the - * tcp write side. - */ -#define CALL_IP_WPUT(connp, q, mp) { \ - ASSERT(((q)->q_flag & QREADR) == 0); \ - TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ - connp->conn_send(connp, (mp), (q), IP_WPUT); \ -} - /* Macros for timestamp comparisons */ #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) @@ -354,7 +328,7 @@ kstat_t *tcp_g_kstat; * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); * a per-connection component which grows by 125000 for every new connection; * and an "extra" component that grows by a random amount centered - * approximately on 64000. 
This causes the the ISS generator to cycle every + * approximately on 64000. This causes the ISS generator to cycle every * 4.89 hours if no TCP connections are made, and faster if connections are * made. * @@ -381,8 +355,13 @@ static sin6_t sin6_null; /* Zero address for quick clears */ */ #define TCP_OLD_URP_INTERPRETATION 1 +/* + * Since tcp_listener is not cleared atomically with tcp_detached + * being cleared we need this extra bit to tell a detached connection + * apart from one that is in the process of being accepted. + */ #define TCP_IS_DETACHED_NONEAGER(tcp) \ - (TCP_IS_DETACHED(tcp) && \ + (TCP_IS_DETACHED(tcp) && \ (!(tcp)->tcp_hard_binding)) /* @@ -495,7 +474,6 @@ typedef struct tcp_timer_s { static kmem_cache_t *tcp_timercache; kmem_cache_t *tcp_sack_info_cache; -kmem_cache_t *tcp_iphc_cache; /* * For scalability, we must not run a timer for every TCP connection @@ -592,17 +570,6 @@ typedef struct tcp_opt_s { } tcp_opt_t; /* - * TCP option struct passing information b/w lisenter and eager. - */ -struct tcp_options { - uint_t to_flags; - ssize_t to_boundif; /* IPV6_BOUND_IF */ -}; - -#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ -#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ - -/* * RFC1323-recommended phrasing of TSTAMP option, for easier parsing */ @@ -673,43 +640,53 @@ typedef struct tcpt_s { /* * Functions called directly via squeue having a prototype of edesc_t. 
*/ -void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); -void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); -void tcp_input(void *arg, mblk_t *mp, void *arg2); -void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); -static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); -static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); -static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); -static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); +void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_input_data(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_close_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); /* Prototype for TCP functions */ static void tcp_random_init(void); int tcp_random(void); static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); 
-static int tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager); -static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); +static int tcp_set_destination(tcp_t *tcp); static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, boolean_t user_specified); static void tcp_closei_local(tcp_t *tcp); static void tcp_close_detached(tcp_t *tcp); -static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, - mblk_t *idmp, mblk_t **defermp); +static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, + mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira); static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, - in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); -static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, - in_port_t dstport, uint32_t flowinfo, uint_t srcid, - uint32_t scope_id, cred_t *cr, pid_t pid); + in_port_t dstport, uint_t srcid); +static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, + in_port_t dstport, uint32_t flowinfo, + uint_t srcid, uint32_t scope_id); static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); -static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); static char *tcp_display(tcp_t *tcp, char *, char); static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); @@ -735,34 +712,16 @@ static void tcp_acceptor_hash_remove(tcp_t *tcp); static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); static void tcp_info_req(tcp_t *tcp, mblk_t *mp); static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); -static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); -void tcp_g_q_setup(tcp_stack_t *); -void tcp_g_q_create(tcp_stack_t *); -void tcp_g_q_destroy(tcp_stack_t *); -static int tcp_header_init_ipv4(tcp_t *tcp); -static int 
tcp_header_init_ipv6(tcp_t *tcp); -int tcp_init(tcp_t *tcp, queue_t *q); -static int tcp_init_values(tcp_t *tcp); -static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); -static void tcp_ip_ire_mark_advice(tcp_t *tcp); +static void tcp_init_values(tcp_t *tcp); static void tcp_ip_notify(tcp_t *tcp); -static mblk_t *tcp_ire_mp(mblk_t **mpp); static void tcp_iss_init(tcp_t *tcp); static void tcp_keepalive_killer(void *arg); -static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); -static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); +static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt); +static void tcp_mss_set(tcp_t *tcp, uint32_t size); static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, int *t_errorp, int *sys_errorp); static boolean_t tcp_allow_connopt_set(int level, int name); int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, - mblk_t *mblk); -static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); -static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, - uchar_t *ptr, uint_t len); static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *); @@ -785,9 +744,9 @@ static uint_t tcp_rcv_drain(tcp_t *tcp); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_ss_rexmit(tcp_t *tcp); -static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); -static void tcp_process_options(tcp_t *, tcph_t *); -static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); +static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + 
ip_recv_attr_t *); +static void tcp_process_options(tcp_t *, tcpha_t *); static void tcp_rsrv(queue_t *q); static int tcp_snmp_state(tcp_t *tcp); static void tcp_timer(void *arg); @@ -801,16 +760,10 @@ void tcp_tpi_accept(queue_t *q, mblk_t *mp); static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); -static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, +static int tcp_send(tcp_t *tcp, const int mss, + const int total_hdr_len, const int tcp_hdr_len, const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); -static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, - const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); + int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time); static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk); static void tcp_wsrv(queue_t *q); @@ -818,38 +771,36 @@ static int tcp_xmit_end(tcp_t *tcp); static void tcp_ack_timer(void *arg); static mblk_t *tcp_ack_mp(tcp_t *tcp); static void tcp_xmit_early_reset(char *str, mblk_t *mp, - uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, - zoneid_t zoneid, tcp_stack_t *, conn_t *connp); + uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *, + ip_stack_t *, conn_t *); static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl); -static int setmaxps(queue_t *q, int maxpsz); static void tcp_set_rto(tcp_t *, time_t); -static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, - boolean_t, boolean_t); -static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, - boolean_t ipsec_mctl); +static void tcp_icmp_input(void *, mblk_t *, void *, 
ip_recv_attr_t *); +static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); +static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); static int tcp_build_hdrs(tcp_t *); static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, - uint32_t seg_seq, uint32_t seg_ack, int seg_len, - tcph_t *tcph); -boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); -static mblk_t *tcp_mdt_info_mp(mblk_t *); -static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); -static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, - const boolean_t, const uint32_t, const uint32_t, - const uint32_t, const uint32_t, tcp_stack_t *); -static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, - const uint_t, const uint_t, boolean_t *); -static mblk_t *tcp_lso_info_mp(mblk_t *); -static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); -static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); + uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, + ip_recv_attr_t *ira); +boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp); +static boolean_t tcp_zcopy_check(tcp_t *); +static void tcp_zcopy_notify(tcp_t *); +static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); +static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); +static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only); +static void tcp_update_zcopy(tcp_t *tcp); +static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); +static void tcp_rexmit_after_error(tcp_t *tcp); +static void tcp_send_data(tcp_t *, mblk_t *); extern mblk_t *tcp_timermp_alloc(int); extern void tcp_timermp_free(tcp_t *); static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); static void tcp_stop_lingering(tcp_t *tcp); static void tcp_close_linger_timeout(void *arg); static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); -static void tcp_stack_shutdown(netstackid_t stackid, void 
*arg); static void tcp_stack_fini(netstackid_t stackid, void *arg); static void *tcp_g_kstat_init(tcp_g_stat_t *); static void tcp_g_kstat_fini(kstat_t *); @@ -858,11 +809,10 @@ static void tcp_kstat_fini(netstackid_t, kstat_t *); static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); static void tcp_kstat2_fini(netstackid_t, kstat_t *); static int tcp_kstat_update(kstat_t *kp, int rw); -void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); -static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp); -static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp); +static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); +static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); static int tcp_squeue_switch(int); static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); @@ -872,21 +822,17 @@ static int tcp_tpi_close(queue_t *, int); static int tcp_tpi_close_accept(queue_t *); static void tcp_squeue_add(squeue_t *); -static boolean_t tcp_zcopy_check(tcp_t *); -static void tcp_zcopy_notify(tcp_t *); -static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); -static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); -static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); +static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); -extern void tcp_kssl_input(tcp_t *, mblk_t *); +extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); -void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); -void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); +void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); +void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, sock_upper_handle_t, cred_t *); static int 
tcp_listen(sock_lower_handle_t, int, cred_t *); -static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t); static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *, boolean_t); static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, @@ -922,7 +868,8 @@ static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); */ static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); -static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); +static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, @@ -988,12 +935,6 @@ struct streamtab tcpinfov6 = { sock_downcalls_t sock_tcp_downcalls; -/* - * Have to ensure that tcp_g_q_close is not done by an - * interrupt thread. - */ -static taskq_t *tcp_taskq; - /* Setable only in /etc/system. Move to ndd? 
*/ boolean_t tcp_icmp_source_quench = B_FALSE; @@ -1042,8 +983,8 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { #define PARAM_MAX (~(uint32_t)0) /* Max size IP datagram is 64k - 1 */ -#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) -#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) +#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t))) +#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t))) /* Max of the above */ #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 @@ -1128,29 +1069,10 @@ static tcpparam_t lcl_tcp_param_arr[] = { { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, + { 0, 1, 0, "tcp_dev_flow_ctl"}, }; /* END CSTYLED */ -/* - * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of - * each header fragment in the header buffer. Each parameter value has - * to be a multiple of 4 (32-bit aligned). - */ -static tcpparam_t lcl_tcp_mdt_head_param = - { 32, 256, 32, "tcp_mdt_hdr_head_min" }; -static tcpparam_t lcl_tcp_mdt_tail_param = - { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; -#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val -#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val - -/* - * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out - * the maximum number of payload buffers associated per Multidata. - */ -static tcpparam_t lcl_tcp_mdt_max_pbufs_param = - { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; -#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val - /* Round up the value to the nearest mss. */ #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) @@ -1162,7 +1084,7 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = * point ECT(0) for TCP as described in RFC 2481. 
*/ #define SET_ECT(tcp, iph) \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ + if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ /* We need to clear the code point first. */ \ ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ @@ -1183,23 +1105,12 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = #define IS_VMLOANED_MBLK(mp) \ (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) - -/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ -boolean_t tcp_mdt_chain = B_TRUE; - -/* - * MDT threshold in the form of effective send MSS multiplier; we take - * the MDT path if the amount of unsent data exceeds the threshold value - * (default threshold is 1*SMSS). - */ -uint_t tcp_mdt_smss_threshold = 1; - uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ /* * Forces all connections to obey the value of the tcps_maxpsz_multiplier * tunable settable via NDD. Otherwise, the per-connection behavior is - * determined dynamically during tcp_adapt_ire(), which is the default. + * determined dynamically during tcp_set_destination(), which is the default. 
*/ boolean_t tcp_static_maxpsz = B_FALSE; @@ -1273,84 +1184,73 @@ int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; - void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, sa_family_t addr_family, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; -/* - * The following are defined in ip.c - */ -extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - void *args); -extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - uint8_t *faddrp, void *args); - /* * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) */ -#define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \ +#define CL_INET_CONNECT(connp, is_outgoing, err) { \ (err) = 0; \ if (cl_inet_connect2 != NULL) { \ /* \ * Running in cluster mode - register active connection \ * information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ipha->ipha_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip6h->ip6_src)) { \ + &(connp)->conn_laddr_v6)) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - 
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ } -#define CL_INET_DISCONNECT(connp, tcp) { \ +#define CL_INET_DISCONNECT(connp) { \ if (cl_inet_disconnect != NULL) { \ /* \ * Running in cluster mode - deregister active \ * connection information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ip_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ip_src)), \ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip_src_v6)) { \ + &(connp)->conn_laddr_v6)) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ @@ -1367,11 +1267,6 @@ int cl_tcp_walk_list(netstackid_t stack_id, static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, tcp_stack_t *tcps); -#define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \ - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \ - iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ - ip6_t *, ip6h, int, 0); - static void 
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) { @@ -1540,7 +1435,7 @@ tcp_time_wait_append(tcp_t *tcp) /* ARGSUSED */ void -tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) +tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -1551,11 +1446,11 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) return; } - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); ASSERT(!tcp->tcp_listener); TCP_STAT(tcps, tcp_time_wait_reap); @@ -1579,10 +1474,17 @@ tcp_ipsec_cleanup(tcp_t *tcp) ASSERT(connp->conn_flags & IPCL_TCPCONN); if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, - connp->conn_netstack); + IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } if (connp->conn_policy != NULL) { IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); connp->conn_policy = NULL; @@ -1598,9 +1500,6 @@ void tcp_cleanup(tcp_t *tcp) { mblk_t *mp; - char *tcp_iphc; - int tcp_iphc_len; - int tcp_hdr_grown; tcp_sack_info_t *tcp_sack_info; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; @@ -1611,6 +1510,22 @@ tcp_cleanup(tcp_t *tcp) /* Cleanup that which needs the netstack first */ tcp_ipsec_cleanup(tcp); + ixa_cleanup(connp->conn_ixa); + + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, 
connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; + } + + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); tcp_free(tcp); @@ -1626,8 +1541,6 @@ tcp_cleanup(tcp_t *tcp) } tcp->tcp_kssl_pending = B_FALSE; - conn_delete_ire(connp, NULL); - /* * Since we will bzero the entire structure, we need to * remove it and reinsert it in global hash list. We @@ -1639,46 +1552,36 @@ tcp_cleanup(tcp_t *tcp) */ ipcl_globalhash_remove(connp); - /* - * Now it is safe to decrement the reference counts. - * This might be the last reference on the netstack and TCPS - * in which case it will cause the tcp_g_q_close and - * the freeing of the IP Instance. - */ - connp->conn_netstack = NULL; - netstack_rele(ns); - ASSERT(tcps != NULL); - tcp->tcp_tcps = NULL; - TCPS_REFRELE(tcps); - /* Save some state */ mp = tcp->tcp_timercache; tcp_sack_info = tcp->tcp_sack_info; - tcp_iphc = tcp->tcp_iphc; - tcp_iphc_len = tcp->tcp_iphc_len; - tcp_hdr_grown = tcp->tcp_hdr_grown; tcp_rsrv_mp = tcp->tcp_rsrv_mp; if (connp->conn_cred != NULL) { crfree(connp->conn_cred); connp->conn_cred = NULL; } - if (connp->conn_effective_cred != NULL) { - crfree(connp->conn_effective_cred); - connp->conn_effective_cred = NULL; - } ipcl_conn_cleanup(connp); connp->conn_flags = IPCL_TCPCONN; + + /* + * Now it is safe to decrement the reference counts. + * This might be the last reference on the netstack + * in which case it will cause the freeing of the IP Instance. 
+ */ + connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; + netstack_rele(ns); + ASSERT(tcps != NULL); + tcp->tcp_tcps = NULL; + bzero(tcp, sizeof (tcp_t)); /* restore the state */ tcp->tcp_timercache = mp; tcp->tcp_sack_info = tcp_sack_info; - tcp->tcp_iphc = tcp_iphc; - tcp->tcp_iphc_len = tcp_iphc_len; - tcp->tcp_hdr_grown = tcp_hdr_grown; tcp->tcp_rsrv_mp = tcp_rsrv_mp; tcp->tcp_connp = connp; @@ -1686,7 +1589,7 @@ tcp_cleanup(tcp_t *tcp) ASSERT(connp->conn_tcp == tcp); ASSERT(connp->conn_flags & IPCL_TCPCONN); connp->conn_state_flags = CONN_INCIPIENT; - ASSERT(connp->conn_ulp == IPPROTO_TCP); + ASSERT(connp->conn_proto == IPPROTO_TCP); ASSERT(connp->conn_ref == 1); } @@ -1777,11 +1680,7 @@ tcp_time_wait_collector(void *arg) /* * Set the CONDEMNED flag now itself so that * the refcnt cannot increase due to any - * walker. But we have still not cleaned up - * conn_ire_cache. This is still ok since - * we are going to clean it up in tcp_cleanup - * immediately and any interface unplumb - * thread will wait till the ire is blown away + * walker. 
*/ connp->conn_state_flags |= CONN_CONDEMNED; mutex_exit(lock); @@ -1809,7 +1708,7 @@ tcp_time_wait_collector(void *arg) mutex_exit( &tcp_time_wait->tcp_time_wait_lock); tcp_bind_hash_remove(tcp); - conn_delete_ire(tcp->tcp_connp, NULL); + ixa_cleanup(tcp->tcp_connp->conn_ixa); tcp_ipsec_cleanup(tcp); CONN_DEC_REF(tcp->tcp_connp); } @@ -1839,7 +1738,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } } else { @@ -1867,7 +1766,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } mutex_enter(&tcp_time_wait->tcp_time_wait_lock); @@ -1886,24 +1785,23 @@ tcp_time_wait_collector(void *arg) /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES - * on the acceptor STREAM and processed in tcp_wput_accept(). - * Read the block comment on top of tcp_conn_request(). + * on the acceptor STREAM and processed in tcp_accept_common(). + * Read the block comment on top of tcp_input_listener(). */ static void tcp_tli_accept(tcp_t *listener, mblk_t *mp) { - tcp_t *acceptor; - tcp_t *eager; - tcp_t *tcp; + tcp_t *acceptor; + tcp_t *eager; + tcp_t *tcp; struct T_conn_res *tcr; t_uscalar_t acceptor_id; t_scalar_t seqnum; - mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ - struct tcp_options *tcpopt; - mblk_t *ok_mp; - mblk_t *mp1; + mblk_t *discon_mp = NULL; + mblk_t *ok_mp; + mblk_t *mp1; tcp_stack_t *tcps = listener->tcp_tcps; - int error; + conn_t *econnp; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { tcp_err_ack(listener, mp, TPROTO, 0); @@ -1922,8 +1820,8 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * fanout hash lock is held. 
* This prevents any thread from entering the acceptor queue from * below (since it has not been hard bound yet i.e. any inbound - * packets will arrive on the listener or default tcp queue and - * go through tcp_lookup). + * packets will arrive on the listener conn_t and + * go through the classifier). * The CONN_INC_REF will prevent the acceptor from closing. * * XXX It is still possible for a tli application to send down data @@ -1974,7 +1872,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) } else { acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); if (acceptor == NULL) { - if (listener->tcp_debug) { + if (listener->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_accept: did not find acceptor 0x%x\n", @@ -2013,7 +1911,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * Rendezvous with an eager connection request packet hanging off * 'tcp' that has the 'seqnum' tag. We tagged the detached open * tcp structure when the connection packet arrived in - * tcp_conn_request(). + * tcp_input_listener(). */ seqnum = tcr->SEQ_number; eager = listener; @@ -2047,37 +1945,26 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ ASSERT(eager->tcp_connp->conn_ref >= 1); - /* Pre allocate the stroptions mblk also */ - opt_mp = allocb(MAX(sizeof (struct tcp_options), - sizeof (struct T_conn_res)), BPRI_HI); - if (opt_mp == NULL) { + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); + if (discon_mp == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } - DB_TYPE(opt_mp) = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } + econnp = eager->tcp_connp; - /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ + /* Hold a copy of mp, in case reallocb fails */ if ((mp1 = copymsg(mp)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } @@ -2093,7 +1980,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) { int extra; - extra = (eager->tcp_family == AF_INET) ? + extra = (econnp->conn_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t); /* @@ -2104,7 +1991,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); /* Original mp has been freed by now, so use mp1 */ tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); return; @@ -2114,38 +2001,32 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) switch (extra) { case sizeof (sin_t): { - sin_t *sin = (sin_t *)ok_mp->b_wptr; + sin_t *sin = (sin_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = - eager->tcp_ipha->ipha_src; - break; - } + ok_mp->b_wptr += extra; + sin->sin_family = AF_INET; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; + break; + } case sizeof (sin6_t): { - sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; + sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { - sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - 
ASSERT(eager->tcp_ip6h != NULL); - sin6->sin6_flowinfo = - eager->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = - eager->tcp_ip6h->ip6_src; - } + ok_mp->b_wptr += extra; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + sin6->sin6_flowinfo = econnp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; - break; } + sin6->__sin6_src_id = 0; + break; + } default: break; } @@ -2158,15 +2039,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * the tcp_accept_swap is done since it would be dangerous to * let the application start using the new fd prior to the swap. */ - error = tcp_accept_swap(listener, acceptor, eager); - if (error != 0) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - freemsg(ok_mp); - /* Original mp has been freed by now, so use mp1 */ - tcp_err_ack(listener, mp1, TSYSERR, error); - return; - } + tcp_accept_swap(listener, acceptor, eager); /* * tcp_accept_swap unlinks eager from listener but does not drop @@ -2244,7 +2117,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) /* We no longer need mp1, since all options processing has passed */ freemsg(mp1); - putnext(listener->tcp_rq, ok_mp); + putnext(listener->tcp_connp->conn_rq, ok_mp); mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -2305,7 +2178,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) listener->tcp_eager_last_q = tcp; tcp->tcp_eager_next_q = NULL; mutex_exit(&listener->tcp_eager_lock); - putnext(tcp->tcp_rq, conn_ind); + putnext(tcp->tcp_connp->conn_rq, conn_ind); } else { mutex_exit(&listener->tcp_eager_lock); } @@ -2318,26 +2191,20 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ finish: ASSERT(acceptor->tcp_detached); - ASSERT(tcps->tcps_g_q != 
NULL); + acceptor->tcp_connp->conn_rq = NULL; ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); - acceptor->tcp_rq = tcps->tcps_g_q; - acceptor->tcp_wq = WR(tcps->tcps_g_q); + acceptor->tcp_connp->conn_wq = NULL; (void) tcp_clean_death(acceptor, 0, 2); CONN_DEC_REF(acceptor->tcp_connp); /* - * In case we already received a FIN we have to make tcp_rput send - * the ordrel_ind. This will also send up a window update if the window - * has opened up. - * - * In the normal case of a successful connection acceptance - * we give the O_T_BIND_REQ to the read side put procedure as an - * indication that this was just accepted. This tells tcp_rput to - * pass up any data queued in tcp_rcv_list. + * We pass discon_mp to tcp_accept_finish to get on the right squeue. * - * In the fringe case where options sent with T_CONN_RES failed and - * we required, we would be indicating a T_DISCON_IND to blow - * away this connection. + * It will update the setting for sockfs/stream head and also take + * care of any data that arrived before accept() wad called. + * In case we already received a FIN then tcp_accept_finish will send up + * the ordrel. It will also send up a window update if the window + * has opened up. */ /* @@ -2346,7 +2213,7 @@ finish: * and is well know but nothing can be done short of major rewrite * to fix it. Now it is possible to take care of it by assigning TLI/XTI * eager same squeue as listener (we can distinguish non socket - * listeners at the time of handling a SYN in tcp_conn_request) + * listeners at the time of handling a SYN in tcp_input_listener) * and do most of the work that tcp_accept_finish does here itself * and then get behind the acceptor squeue to access the acceptor * queue. 
@@ -2354,52 +2221,38 @@ finish: /* * We already have a ref on tcp so no need to do one before squeue_enter */ - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, + tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, + SQTAG_TCP_ACCEPT_FINISH); } /* * Swap information between the eager and acceptor for a TLI/XTI client. * The sockfs accept is done on the acceptor stream and control goes - * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not + * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not * called. In either case, both the eager and listener are in their own * perimeter (squeue) and the code has to deal with potential race. * - * See the block comment on top of tcp_accept() and tcp_wput_accept(). + * See the block comment on top of tcp_accept() and tcp_tli_accept(). */ -static int +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) { conn_t *econnp, *aconnp; - cred_t *effective_cred = NULL; - ASSERT(eager->tcp_rq == listener->tcp_rq); + ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); ASSERT(eager->tcp_detached && !acceptor->tcp_detached); - ASSERT(!eager->tcp_hard_bound); ASSERT(!TCP_IS_SOCKET(acceptor)); ASSERT(!TCP_IS_SOCKET(eager)); ASSERT(!TCP_IS_SOCKET(listener)); - econnp = eager->tcp_connp; - aconnp = acceptor->tcp_connp; - /* * Trusted Extensions may need to use a security label that is * different from the acceptor's label on MLP and MAC-Exempt * sockets. If this is the case, the required security label - * already exists in econnp->conn_effective_cred. Use this label - * to generate a new effective cred for the acceptor. - * - * We allow for potential application level retry attempts by - * checking for transient errors before modifying eager. + * already exists in econnp->conn_ixa->ixa_tsl. 
Since we make the + * acceptor stream refer to econnp we atomatically get that label. */ - if (is_system_labeled() && - aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) { - effective_cred = copycred_from_tslabel(aconnp->conn_cred, - crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP); - if (effective_cred == NULL) - return (ENOMEM); - } acceptor->tcp_detached = B_TRUE; /* @@ -2416,18 +2269,20 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) ASSERT(eager->tcp_eager_next_q0 == NULL && eager->tcp_eager_prev_q0 == NULL); mutex_exit(&listener->tcp_eager_lock); - eager->tcp_rq = acceptor->tcp_rq; - eager->tcp_wq = acceptor->tcp_wq; - eager->tcp_rq->q_ptr = econnp; - eager->tcp_wq->q_ptr = econnp; + econnp = eager->tcp_connp; + aconnp = acceptor->tcp_connp; + econnp->conn_rq = aconnp->conn_rq; + econnp->conn_wq = aconnp->conn_wq; + econnp->conn_rq->q_ptr = econnp; + econnp->conn_wq->q_ptr = econnp; /* * In the TLI/XTI loopback case, we are inside the listener's squeue, * which might be a different squeue from our peer TCP instance. * For TCP Fusion, the peer expects that whenever tcp_detached is * clear, our TCP queues point to the acceptor's queues. Thus, use - * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq + * membar_producer() to ensure that the assignments of conn_rq/conn_wq * above reach global visibility prior to the clearing of tcp_detached. 
*/ membar_producer(); @@ -2439,419 +2294,187 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) econnp->conn_minor_arena = aconnp->conn_minor_arena; ASSERT(econnp->conn_minor_arena != NULL); - if (eager->tcp_cred != NULL) - crfree(eager->tcp_cred); - eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = effective_cred; + if (econnp->conn_cred != NULL) + crfree(econnp->conn_cred); + econnp->conn_cred = aconnp->conn_cred; aconnp->conn_cred = NULL; - ASSERT(aconnp->conn_effective_cred == NULL); - + econnp->conn_cpid = aconnp->conn_cpid; ASSERT(econnp->conn_netstack == aconnp->conn_netstack); ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); econnp->conn_zoneid = aconnp->conn_zoneid; econnp->conn_allzones = aconnp->conn_allzones; + econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; + econnp->conn_mac_mode = aconnp->conn_mac_mode; + econnp->conn_zone_is_global = aconnp->conn_zone_is_global; aconnp->conn_mac_mode = CONN_MAC_DEFAULT; /* Do the IPC initialization */ CONN_INC_REF(econnp); - econnp->conn_multicast_loop = aconnp->conn_multicast_loop; - econnp->conn_af_isv6 = aconnp->conn_af_isv6; - econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; + econnp->conn_family = aconnp->conn_family; + econnp->conn_ipversion = aconnp->conn_ipversion; /* Done with old IPC. Drop its ref on its connp */ CONN_DEC_REF(aconnp); - return (0); } /* * Adapt to the information, such as rtt and rtt_sd, provided from the - * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. + * DCE and IRE maintained by IP. * * Checks for multicast and broadcast destination address. - * Returns zero on failure; non-zero if ok. + * Returns zero if ok; an errno on failure. * * Note that the MSS calculation here is based on the info given in - * the IRE. We do not do any calculation based on TCP options. 
They - * will be handled in tcp_rput_other() and tcp_rput_data() when TCP - * knows which options to use. + * the DCE and IRE. We do not do any calculation based on TCP options. They + * will be handled in tcp_input_data() when TCP knows which options to use. * * Note on how TCP gets its parameters for a connection. * * When a tcp_t structure is allocated, it gets all the default parameters. - * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, + * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, * spipe, rpipe, ... from the route metrics. Route metric overrides the * default. * - * An incoming SYN with a multicast or broadcast destination address, is dropped - * in 1 of 2 places. - * - * 1. If the packet was received over the wire it is dropped in - * ip_rput_process_broadcast() - * - * 2. If the packet was received through internal IP loopback, i.e. the packet - * was generated and received on the same machine, it is dropped in - * ip_wput_local() + * An incoming SYN with a multicast or broadcast destination address is dropped + * in ip_fanout_v4/v6. * * An incoming SYN with a multicast or broadcast source address is always - * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to + * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in + * conn_connect. + * The same logic in tcp_set_destination also serves to * reject an attempt to connect to a broadcast or multicast (destination) * address. 
*/ static int -tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) +tcp_set_destination(tcp_t *tcp) { - ire_t *ire; - ire_t *sire = NULL; - iulp_t *ire_uinfo = NULL; uint32_t mss_max; uint32_t mss; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); conn_t *connp = tcp->tcp_connp; - boolean_t ire_cacheable = B_FALSE; - zoneid_t zoneid = connp->conn_zoneid; - int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_SECATTR; - ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); - ill_t *ill = NULL; - boolean_t incoming = (ire_mp == NULL); tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(connp->conn_ire_cache == NULL); - - if (tcp->tcp_ipversion == IPV4_VERSION) { + iulp_t uinfo; + int error; + uint32_t flags; - if (CLASSD(tcp->tcp_connp->conn_rem)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return (0); - } - /* - * If IP_NEXTHOP is set, then look for an IRE_CACHE - * for the destination with the nexthop as gateway. - * ire_ctable_lookup() is used because this particular - * ire, if it exists, will be marked private. - * If that is not available, use the interface ire - * for the nexthop. - * - * TSol: tcp_update_label will detect label mismatches based - * only on the destination's label, but that would not - * detect label mismatches based on the security attributes - * of routes or next hop gateway. Hence we need to pass the - * label to ire_ftable_lookup below in order to locate the - * right prefix (and/or) ire cache. Similarly we also need - * pass the label to the ire_cache_lookup below to locate - * the right ire that also matches on the label. 
- */ - if (tcp->tcp_connp->conn_nexthop_set) { - ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, - tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, - tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, - ipst); - if (ire == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_nexthop_v4, - 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, - tsl, match_flags, ipst); - if (ire == NULL) - return (0); - } else { - ire_uinfo = &ire->ire_uinfo; - } - } else { - ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, - zoneid, tsl, ipst); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + flags = IPDF_LSO | IPDF_ZCOPY; + /* + * Make sure we have a dce for the destination to avoid dce_ident + * contention for connected sockets. + */ + flags |= IPDF_UNIQUE_DCE; - } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_rem, - 0, 0, 0, NULL, &sire, zoneid, 0, - tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT), ipst); - if (ire == NULL) - return (0); - ire_uinfo = (sire != NULL) ? - &sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *) - ire_mp->b_rptr)->ire_uinfo; - } - } - } - ASSERT(ire != NULL); + if (!tcps->tcps_ignore_path_mtu) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; - if ((ire->ire_src_addr == INADDR_ANY) || - (ire->ire_type & IRE_BROADCAST)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 
- */ - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return (0); - } - - if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { - ipaddr_t src_addr; + /* Use conn_lock to satify ASSERT; tcp is already serialized */ + mutex_enter(&connp->conn_lock); + error = conn_connect(connp, &uinfo, flags); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - /* - * ip_bind_connected() has stored the correct source - * address in conn_src. - */ - src_addr = tcp->tcp_connp->conn_src; - tcp->tcp_ipha->ipha_src = src_addr; - /* - * Copy of the src addr. in tcp_t is needed - * for the lookup funcs. - */ - IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); - } - /* - * Set the fragment bit so that IP will tell us if the MTU - * should change. IP tells us the latest setting of - * ip_path_mtu_discovery through ire_frag_flag. - */ - if (ipst->ips_ip_path_mtu_discovery) { - tcp->tcp_ipha->ipha_fragment_offset_and_flags = - htons(IPH_DF); - } - /* - * If ire_uinfo is NULL, this is the IRE_INTERFACE case - * for IP_NEXTHOP. No cache ire has been found for the - * destination and we are working with the nexthop's - * interface ire. Since we need to forward all packets - * to the nexthop first, we "blindly" set tcp_localnet - * to false, eventhough the destination may also be - * onlink. - */ - if (ire_uinfo == NULL) - tcp->tcp_localnet = 0; - else - tcp->tcp_localnet = (ire->ire_gateway_addr == 0); - } else { - /* - * For incoming connection ire_mp = NULL - * For outgoing connection ire_mp != NULL - * Technically we should check conn_incoming_ill - * when ire_mp is NULL and conn_outgoing_ill when - * ire_mp is non-NULL. But this is performance - * critical path and for IPV*_BOUND_IF, outgoing - * and incoming ill are always set to the same value. 
- */ - ill_t *dst_ill = NULL; - ipif_t *dst_ipif = NULL; + error = tcp_build_hdrs(tcp); + if (error != 0) + return (error); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); + tcp->tcp_localnet = uinfo.iulp_localnet; - if (connp->conn_outgoing_ill != NULL) { - /* Outgoing or incoming path */ - int err; + if (uinfo.iulp_rtt != 0) { + clock_t rto; - dst_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { - ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); - return (0); - } - match_flags |= MATCH_IRE_ILL; - dst_ipif = dst_ill->ill_ipif; - } - ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, - 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); + tcp->tcp_rtt_sa = uinfo.iulp_rtt; + tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; + rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcps->tcps_rexmit_interval_extra + + (tcp->tcp_rtt_sa >> 5); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + if (rto > tcps->tcps_rexmit_interval_max) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_max; + } else if (rto < tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup_v6( - &tcp->tcp_connp->conn_remv6, - 0, 0, 0, dst_ipif, &sire, zoneid, - 0, tsl, match_flags, ipst); - if (ire == NULL) { - if (dst_ill != NULL) - ill_refrele(dst_ill); - return (0); - } - ire_uinfo = (sire != NULL) ? 
&sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *)ire_mp->b_rptr)->ire_uinfo; - } - } - if (dst_ill != NULL) - ill_refrele(dst_ill); - - ASSERT(ire != NULL); - ASSERT(ire_uinfo != NULL); - - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). - */ - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return (0); + tcp->tcp_rto = rto; } - - if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - in6_addr_t src_addr; - - /* - * ip_bind_connected_v6() has stored the correct source - * address per IPv6 addr. selection policy in - * conn_src_v6. - */ - src_addr = tcp->tcp_connp->conn_srcv6; - - tcp->tcp_ip6h->ip6_src = src_addr; - /* - * Copy of the src addr. in tcp_t is needed - * for the lookup funcs. - */ - tcp->tcp_ip_src_v6 = src_addr; - ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, - &connp->conn_srcv6)); + } + if (uinfo.iulp_ssthresh != 0) + tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; + else + tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; + if (uinfo.iulp_spipe > 0) { + connp->conn_sndbuf = MIN(uinfo.iulp_spipe, + tcps->tcps_max_buf); + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / + tcps->tcps_snd_lowat_fraction; } - tcp->tcp_localnet = - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); + (void) tcp_maxpsz_set(tcp, B_TRUE); } - /* - * This allows applications to fail quickly when connections are made - * to dead hosts. Hosts can be labeled dead by adding a reject route - * with both the RTF_REJECT and RTF_PRIVATE flags set. + * Note that up till now, acceptor always inherits receive + * window from the listener. But if there is a metrics + * associated with a host, we should use that instead of + * inheriting it from listener. 
Thus we need to pass this + * info back to the caller. */ - if ((ire->ire_flags & RTF_REJECT) && - (ire->ire_flags & RTF_PRIVATE)) - goto error; + if (uinfo.iulp_rpipe > 0) { + tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, + tcps->tcps_max_buf); + } + + if (uinfo.iulp_rtomax > 0) { + tcp->tcp_second_timer_threshold = + uinfo.iulp_rtomax; + } /* - * Make use of the cached rtt and rtt_sd values to calculate the - * initial RTO. Note that they are already initialized in - * tcp_init_values(). - * If ire_uinfo is NULL, i.e., we do not have a cache ire for - * IP_NEXTHOP, but instead are using the interface ire for the - * nexthop, then we do not use the ire_uinfo from that ire to - * do any initializations. + * Use the metric option settings, iulp_tstamp_ok and + * iulp_wscale_ok, only for active open. What this means + * is that if the other side uses timestamp or window + * scale option, TCP will also use those options. That + * is for passive open. If the application sets a + * large window, window scale is enabled regardless of + * the value in iulp_wscale_ok. This is the behavior + * since 2.6. So we keep it. + * The only case left in passive open processing is the + * check for SACK. + * For ECN, it should probably be like SACK. But the + * current value is binary, so we treat it like the other + * cases. The metric only controls active open.For passive + * open, the ndd param, tcp_ecn_permitted, controls the + * behavior. 
*/ - if (ire_uinfo != NULL) { - if (ire_uinfo->iulp_rtt != 0) { - clock_t rto; - - tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; - tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5); - - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } - } - if (ire_uinfo->iulp_ssthresh != 0) - tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; - else - tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; - if (ire_uinfo->iulp_spipe > 0) { - tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, - tcps->tcps_max_buf); - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcps->tcps_snd_lowat_fraction; - (void) tcp_maxpsz_set(tcp, B_TRUE); - } + if (!tcp_detached) { /* - * Note that up till now, acceptor always inherits receive - * window from the listener. But if there is a metrics - * associated with a host, we should use that instead of - * inheriting it from listener. Thus we need to pass this - * info back to the caller. + * The if check means that the following can only + * be turned on by the metrics only IRE, but not off. */ - if (ire_uinfo->iulp_rpipe > 0) { - tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, - tcps->tcps_max_buf); - } - - if (ire_uinfo->iulp_rtomax > 0) { - tcp->tcp_second_timer_threshold = - ire_uinfo->iulp_rtomax; - } - + if (uinfo.iulp_tstamp_ok) + tcp->tcp_snd_ts_ok = B_TRUE; + if (uinfo.iulp_wscale_ok) + tcp->tcp_snd_ws_ok = B_TRUE; + if (uinfo.iulp_sack == 2) + tcp->tcp_snd_sack_ok = B_TRUE; + if (uinfo.iulp_ecn_ok) + tcp->tcp_ecn_ok = B_TRUE; + } else { /* - * Use the metric option settings, iulp_tstamp_ok and - * iulp_wscale_ok, only for active open. 
What this means - * is that if the other side uses timestamp or window - * scale option, TCP will also use those options. That - * is for passive open. If the application sets a - * large window, window scale is enabled regardless of - * the value in iulp_wscale_ok. This is the behavior - * since 2.6. So we keep it. - * The only case left in passive open processing is the - * check for SACK. - * For ECN, it should probably be like SACK. But the - * current value is binary, so we treat it like the other - * cases. The metric only controls active open.For passive - * open, the ndd param, tcp_ecn_permitted, controls the - * behavior. + * Passive open. + * + * As above, the if check means that SACK can only be + * turned on by the metric only IRE. */ - if (!tcp_detached) { - /* - * The if check means that the following can only - * be turned on by the metrics only IRE, but not off. - */ - if (ire_uinfo->iulp_tstamp_ok) - tcp->tcp_snd_ts_ok = B_TRUE; - if (ire_uinfo->iulp_wscale_ok) - tcp->tcp_snd_ws_ok = B_TRUE; - if (ire_uinfo->iulp_sack == 2) - tcp->tcp_snd_sack_ok = B_TRUE; - if (ire_uinfo->iulp_ecn_ok) - tcp->tcp_ecn_ok = B_TRUE; - } else { - /* - * Passive open. - * - * As above, the if check means that SACK can only be - * turned on by the metric only IRE. - */ - if (ire_uinfo->iulp_sack > 0) { - tcp->tcp_snd_sack_ok = B_TRUE; - } + if (uinfo.iulp_sack > 0) { + tcp->tcp_snd_sack_ok = B_TRUE; } } - /* - * XXX: Note that currently, ire_max_frag can be as small as 68 + * XXX Note that currently, iulp_mtu can be as small as 68 * because of PMTUd. So tcp_mss may go to negative if combined * length of all those options exceeds 28 bytes. But because * of the tcp_mss_min check below, we may not have a problem if @@ -2864,31 +2487,15 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) * We do not deal with that now. All those problems related to * PMTUd will be fixed later. 
*/ - ASSERT(ire->ire_max_frag != 0); - mss = tcp->tcp_if_mtu = ire->ire_max_frag; - if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { - if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { - mss = MIN(mss, IPV6_MIN_MTU); - } - } + ASSERT(uinfo.iulp_mtu != 0); + mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; /* Sanity check for MSS value. */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; - if (tcp->tcp_ipversion == IPV6_VERSION && - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - /* - * After receiving an ICMPv6 "packet too big" message with a - * MTU < 1280, and for multirouted IPv6 packets, the IP layer - * will insert a 8-byte fragment header in every packet; we - * reduce the MSS by that amount here. - */ - mss -= sizeof (ip6_frag_t); - } - if (tcp->tcp_ipsec_overhead == 0) tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); @@ -2903,71 +2510,28 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_mss = mss; /* + * Update the tcp connection with LSO capability. + */ + tcp_update_lso(tcp, connp->conn_ixa); + + /* * Initialize the ISS here now that we have the full connection ID. * The RFC 1948 method of initial sequence number generation requires * knowledge of the full connection ID before setting the ISS. */ - tcp_iss_init(tcp); - if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) - tcp->tcp_loopback = B_TRUE; - - if (sire != NULL) - IRE_REFRELE(sire); - - /* - * If we got an IRE_CACHE and an ILL, go through their properties; - * otherwise, this is deferred until later when we have an IRE_CACHE. - */ - if (tcp->tcp_loopback || - (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { - /* - * For incoming, see if this tcp may be MDT-capable. For - * outgoing, this process has been taken care of through - * tcp_rput_other. 
- */ - tcp_ire_ill_check(tcp, ire, ill, incoming); - tcp->tcp_ire_ill_check_done = B_TRUE; - } + tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); - mutex_enter(&connp->conn_lock); /* * Make sure that conn is not marked incipient * for incoming connections. A blind * removal of incipient flag is cheaper than * check and removal. */ + mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; - - /* - * Must not cache forwarding table routes - * or recache an IRE after the conn_t has - * had conn_ire_cache cleared and is flagged - * unusable, (see the CONN_CACHE_IRE() macro). - */ - if (ire_cacheable && CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = ire; - IRE_UNTRACE_REF(ire); - rw_exit(&ire->ire_bucket->irb_lock); - mutex_exit(&connp->conn_lock); - return (1); - } - rw_exit(&ire->ire_bucket->irb_lock); - } mutex_exit(&connp->conn_lock); - - if (ire->ire_mp == NULL) - ire_refrele(ire); - return (1); - -error: - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); return (0); } @@ -3001,7 +2565,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); @@ -3010,7 +2574,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) return; } /* Make sure the largest address fits */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); if (mp1 == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; @@ -3024,7 +2588,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if 
(tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -3033,7 +2597,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) len = sizeof (sin_t); mp->b_wptr = (uchar_t *)&sin[1]; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -3055,7 +2619,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad address length, %d", tbr->ADDR_length); @@ -3080,16 +2644,16 @@ done: /* * Update port information as sockfs/tpi needs it for checking */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = (sin_t *)sa; - sin->sin_port = tcp->tcp_lport; + sin->sin_port = connp->conn_lport; } else { sin6 = (sin6_t *)sa; - sin6->sin6_port = tcp->tcp_lport; + sin6->sin6_port = connp->conn_lport; } mp->b_datap->db_type = M_PCPROTO; tbr->PRIM_type = T_BIND_ACK; - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); } } @@ -3139,7 +2703,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * Set loopmax appropriately so that one does not look * forever in the case all of the anonymous ports are in use. */ - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { /* * loopmax = * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 @@ -3175,7 +2739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, mutex_enter(&tbf->tf_lock); for (ltcp = tbf->tf_tcp; ltcp != NULL; ltcp = ltcp->tcp_bind_hash) { - if (lport == ltcp->tcp_lport) + if (lport == ltcp->tcp_connp->conn_lport) break; } @@ -3191,7 +2755,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. 
*/ - if (!IPCL_BIND_ZONE_MATCH(ltcp->tcp_connp, connp)) + if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) continue; /* @@ -3227,7 +2791,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * added. * * if (ltcp->tcp_state == TCPS_LISTEN || - * !reuseaddr || !ltcp->tcp_reuseaddr) { + * !reuseaddr || !lconnp->conn_reuseaddr) { * ... * } * @@ -3243,17 +2807,18 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ not_socket = !(TCP_IS_SOCKET(ltcp) && TCP_IS_SOCKET(tcp)); - exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; + exclbind = lconnp->conn_exclbind || + connp->conn_exclbind; if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || (connp->conn_mac_mode != CONN_MAC_DEFAULT) || (exclbind && (not_socket || ltcp->tcp_state <= TCPS_ESTABLISHED))) { if (V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) || + lconnp->conn_bound_addr_v6) || V6_OR_V4_INADDR_ANY(*laddr) || IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) { + &lconnp->conn_bound_addr_v6)) { break; } continue; @@ -3266,7 +2831,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * specific port. We use the same autoassigned port * number space for IPv4 and IPv6 sockets. 
*/ - if (tcp->tcp_ipversion != ltcp->tcp_ipversion && + if (connp->conn_ipversion != lconnp->conn_ipversion && bind_to_req_port_only) continue; @@ -3281,9 +2846,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (quick_connect && (ltcp->tcp_state > TCPS_LISTEN) && - ((tcp->tcp_fport != ltcp->tcp_fport) || - !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, - <cp->tcp_remote_v6))) + ((connp->conn_fport != lconnp->conn_fport) || + !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &lconnp->conn_faddr_v6))) continue; if (!reuseaddr) { @@ -3299,9 +2864,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) && + lconnp->conn_bound_addr_v6) && !IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) + &lconnp->conn_bound_addr_v6)) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -3327,7 +2892,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * SO_REUSEADDR setting, so we break. */ if (IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6) && + &lconnp->conn_bound_addr_v6) && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; @@ -3343,11 +2908,10 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * number. */ tcp->tcp_state = TCPS_BOUND; - tcp->tcp_lport = htons(port); - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_lport = htons(port); ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( - tcp->tcp_lport)] == tbf); + connp->conn_lport)] == tbf); tcp_bind_hash_insert(tbf, tcp, 1); mutex_exit(&tbf->tf_lock); @@ -3364,12 +2928,12 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * is updated. After the update, it may or may not * be in the valid range. 
*/ - if (!tcp->tcp_anon_priv_bind) + if (!connp->conn_anon_priv_bind) tcps->tcps_next_port_to_try = port + 1; return (port); } - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = tcp_get_next_priv_port(tcp); } else { if (count == 0 && user_specified) { @@ -3402,12 +2966,13 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * tcp_clean_death / tcp_close_detached must not be called more than once * on a tcp. Thus every function that potentially calls tcp_clean_death * must check for the tcp state before calling tcp_clean_death. - * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, + * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, * tcp_timer_handler, all check for the tcp state. */ /* ARGSUSED */ void -tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) +tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { tcp_t *tcp = ((conn_t *)arg)->conn_tcp; @@ -3449,11 +3014,11 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } ASSERT(tcp != NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); if (TCP_IS_DETACHED(tcp)) { if (tcp->tcp_hard_binding) { @@ -3483,7 +3048,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) TCP_STAT(tcps, tcp_clean_death_nondetached); - q = tcp->tcp_rq; + q = connp->conn_rq; /* Trash all inbound data */ if (!IPCL_IS_NONSTR(connp)) { @@ -3506,7 +3071,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) */ (void) putnextctl1(q, M_FLUSH, FLUSHR); } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_clean_death: discon err 
%d", err); } @@ -3519,7 +3084,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) if (mp != NULL) { putnext(q, mp); } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_clean_death, sending M_ERROR"); @@ -3552,6 +3117,7 @@ tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -3568,15 +3134,14 @@ tcp_stop_lingering(tcp_t *tcp) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is cleared. */ tcp_timers_stop(tcp); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); @@ -3595,16 +3160,14 @@ tcp_stop_lingering(tcp_t *tcp) } } else { tcp_closei_local(tcp); - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(connp); } finish: /* Signal closing thread that it can complete close */ mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); @@ -3636,9 +3199,9 @@ tcp_close_common(conn_t *connp, int flags) ASSERT(connp->conn_ref >= 2); /* - * Mark the conn as closing. ill_pending_mp_add will not + * Mark the conn as closing. ipsq_pending_mp_add will not * add any mp to the pending mp list, after this conn has - * started closing. Same for sq_pending_mp_add + * started closing. 
*/ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CLOSING; @@ -3664,7 +3227,7 @@ tcp_close_common(conn_t *connp, int flags) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, - tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); + NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); while (!tcp->tcp_closed) { @@ -3684,13 +3247,13 @@ tcp_close_common(conn_t *connp, int flags) * thread is higher priority than the squeue worker * thread and is bound to the same cpu. */ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0) { + if (connp->conn_linger && connp->conn_lingertime > 0) { mutex_exit(&tcp->tcp_closelock); /* Entering squeue, bump ref count. */ CONN_INC_REF(connp); bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, - tcp_linger_interrupted, connp, + tcp_linger_interrupted, connp, NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); } @@ -3703,8 +3266,8 @@ tcp_close_common(conn_t *connp, int flags) /* * In the case of listener streams that have eagers in the q or q0 - * we wait for the eagers to drop their reference to us. tcp_rq and - * tcp_wq of the eagers point to our queues. By waiting for the + * we wait for the eagers to drop their reference to us. conn_rq and + * conn_wq of the eagers point to our queues. By waiting for the * refcnt to drop to 1, we are sure that the eagers have cleaned * up their queue pointers and also dropped their references to us. */ @@ -3716,13 +3279,12 @@ tcp_close_common(conn_t *connp, int flags) mutex_exit(&connp->conn_lock); } /* - * ioctl cleanup. The mp is queued in the - * ill_pending_mp or in the sq_pending_mp. + * ioctl cleanup. The mp is queued in the ipx_pending_mp. 
*/ if (conn_ioctl_cleanup_reqd) conn_ioctl_cleanup(connp); - tcp->tcp_cpid = -1; + connp->conn_cpid = NOPID; } static int @@ -3799,7 +3361,7 @@ tcp_tpi_close_accept(queue_t *q) /* ARGSUSED */ static void -tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) +tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -3828,7 +3390,7 @@ tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) /* ARGSUSED */ static void -tcp_close_output(void *arg, mblk_t *mp, void *arg2) +tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { char *msg; conn_t *connp = (conn_t *)arg; @@ -3847,10 +3409,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } mutex_exit(&tcp->tcp_eager_lock); - connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - - connp->conn_lso_ok = B_FALSE; tcp->tcp_lso = B_FALSE; msg = NULL; @@ -3879,12 +3437,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If SO_LINGER has set a zero linger time, abort the * connection with a reset. */ - if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { + if (connp->conn_linger && connp->conn_lingertime == 0) { msg = "tcp_close, zero lingertime"; break; } - ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); /* * Abort connection if there is unread data queued. */ @@ -3893,9 +3450,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) break; } /* - * tcp_hard_bound is now cleared thus all packets go through - * tcp_lookup. This fact is used by tcp_detach below. - * * We have done a qwait() above which could have possibly * drained more messages in turn causing transition to a * different state. Check whether we have to do the rest @@ -3915,7 +3469,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If lingering on close then wait until the fin is acked, * the SO_LINGER time passes, or a reset is sent/received. 
*/ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && + if (connp->conn_linger && connp->conn_lingertime > 0 && !(tcp->tcp_fin_acked) && tcp->tcp_state >= TCPS_ESTABLISHED) { if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { @@ -3926,7 +3480,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_linger_tid = TCP_TIMER(tcp, tcp_close_linger_timeout, - tcp->tcp_lingertime * hz); + connp->conn_lingertime * hz); /* tcp_close_linger_timeout will finish close */ if (tcp->tcp_linger_tid == 0) @@ -3944,8 +3498,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* - * Make sure that no other thread will access the tcp_rq of - * this instance (through lookups etc.) as tcp_rq will go + * Make sure that no other thread will access the conn_rq of + * this instance (through lookups etc.) as conn_rq will go * away shortly. */ tcp_acceptor_hash_remove(tcp); @@ -3962,8 +3516,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is set to NULL. */ tcp_timers_stop(tcp); @@ -4004,18 +3558,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) ASSERT(connp->conn_ref >= 2); finish: - /* - * Although packets are always processed on the correct - * tcp's perimeter and access is serialized via squeue's, - * IP still needs a queue when sending packets in time_wait - * state so use WR(tcps_g_q) till ip_output() can be - * changed to deal with just connp. For read side, we - * could have set tcp_rq to NULL but there are some cases - * in tcp_rput_data() from early days of this code which - * do a putnext without checking if tcp is closed. Those - * need to be identified before both tcp_rq and tcp_wq - * can be set to NULL and tcps_g_q can disappear forever. 
- */ mutex_enter(&tcp->tcp_closelock); /* * Don't change the queues in the case of a listener that has @@ -4024,13 +3566,8 @@ finish: */ if (!tcp->tcp_wait_for_eagers) { tcp->tcp_detached = B_TRUE; - /* - * When default queue is closing we set tcps_g_q to NULL - * after the close is done. - */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; } /* Signal tcp_close() to finish closing. */ @@ -4112,8 +3649,7 @@ tcp_timers_stop(tcp_t *tcp) static void tcp_closei_local(tcp_t *tcp) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; if (!TCP_IS_SOCKET(tcp)) @@ -4138,7 +3674,7 @@ tcp_closei_local(tcp_t *tcp) * this point, eager will be closed but we * leave it in listeners eager list so that * if listener decides to close without doing - * accept, we can clean this up. In tcp_wput_accept + * accept, we can clean this up. In tcp_tli_accept * we take care of the case of accept on closed * eager. 
*/ @@ -4150,9 +3686,9 @@ tcp_closei_local(tcp_t *tcp) * listener queue, after we have released our * reference on the listener */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(tcp->tcp_detached); + connp->conn_rq = NULL; + connp->conn_wq = NULL; CONN_DEC_REF(listener->tcp_connp); } else { mutex_exit(&listener->tcp_eager_lock); @@ -4185,20 +3721,16 @@ tcp_closei_local(tcp_t *tcp) */ if (tcp->tcp_state == TCPS_TIME_WAIT) (void) tcp_time_wait_remove(tcp, NULL); - CL_INET_DISCONNECT(connp, tcp); + CL_INET_DISCONNECT(connp); ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); /* - * Delete the cached ire in conn_ire_cache and also mark - * the conn as CONDEMNED + * Mark the conn as CONDEMNED */ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CONDEMNED; - ire = connp->conn_ire_cache; - connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); - if (ire != NULL) - IRE_REFRELE_NOTR(ire); /* Need to cleanup any pending ioctls */ ASSERT(tcp->tcp_time_wait_next == NULL); @@ -4227,14 +3759,14 @@ tcp_closei_local(tcp_t *tcp) void tcp_free(tcp_t *tcp) { - mblk_t *mp; - ip6_pkt_t *ipp; + mblk_t *mp; + conn_t *connp = tcp->tcp_connp; ASSERT(tcp != NULL); ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); - tcp->tcp_rq = NULL; - tcp->tcp_wq = NULL; + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp_close_mpp(&tcp->tcp_xmit_head); tcp_close_mpp(&tcp->tcp_reass_head); @@ -4281,12 +3813,12 @@ tcp_free(tcp_t *tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr 
= NULL; @@ -4294,18 +3826,6 @@ tcp_free(tcp_t *tcp) } ASSERT(tcp->tcp_rthdrlen == 0); - ipp = &tcp->tcp_sticky_ipp; - if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | - IPPF_RTHDR)) - ip6_pkt_free(ipp); - - /* - * Free memory associated with the tcp/ip header template. - */ - - if (tcp->tcp_iphc != NULL) - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - /* * Following is really a blowing away a union. * It happens to have exactly two members of identical size @@ -4317,17 +3837,19 @@ tcp_free(tcp_t *tcp) /* * Put a connection confirmation message upstream built from the - * address information within 'iph' and 'tcph'. Report our success or failure. + * address/flowid information with the conn and iph. Report our success or + * failure. */ static boolean_t -tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, - mblk_t **defermp) +tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, + mblk_t **defermp, ip_recv_attr_t *ira) { sin_t sin; sin6_t sin6; mblk_t *mp; char *optp = NULL; int optlen = 0; + conn_t *connp = tcp->tcp_connp; if (defermp != NULL) *defermp = NULL; @@ -4352,20 +3874,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, } if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { - ipha_t *ipha = (ipha_t *)iphdr; /* packet is IPv4 */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; mp = mi_tpi_conn_con(NULL, (char *)&sin, (int)sizeof (sin_t), optp, optlen); } else { sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; mp = mi_tpi_conn_con(NULL, (char *)&sin6, (int)sizeof (sin6_t), optp, optlen); @@ 
-4375,10 +3896,10 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, ip6_t *ip6h = (ip6_t *)iphdr; ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; mp = mi_tpi_conn_con(NULL, (char *)&sin6, @@ -4393,16 +3914,16 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, if (defermp == NULL) { conn_t *connp = tcp->tcp_connp; if (IPCL_IS_NONSTR(connp)) { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, tcp->tcp_connid, cr, - cpid); + (connp->conn_upper_handle, tcp->tcp_connid, + ira->ira_cred, ira->ira_cpid); freemsg(mp); } else { - putnext(tcp->tcp_rq, mp); + if (ira->ira_cred != NULL) { + /* So that getpeerucred works for TPI sockfs */ + mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); + } + putnext(connp->conn_rq, mp); } } else { *defermp = mp; @@ -4456,7 +3977,7 @@ tcp_drop_q0(tcp_t *tcp) */ MAKE_UNDROPPABLE(eager); - if (tcp->tcp_debug) { + if (tcp->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, @@ -4469,18 +3990,19 @@ tcp_drop_q0(tcp_t *tcp) /* Put a reference on the conn as we are enqueueing it in the sqeue */ CONN_INC_REF(eager->tcp_connp); - /* Mark the IRE created for this SYN request temporary */ - tcp_ip_ire_mark_advice(eager); SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, + tcp_clean_death_wrapper, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_DROP_Q0); return (B_TRUE); } -int +/* + * Handle a SYN on an AF_INET6 socket; can 
be either IPv4 or IPv6 + */ +static mblk_t * tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp) + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; @@ -4488,36 +4010,30 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, ipha_t *ipha; ip6_t *ip6h; sin6_t sin6; - in6_addr_t v6dst; - int err; - int ifindex = 0; + uint_t ifindex = ira->ira_ruifindex; tcp_stack_t *tcps = tcp->tcp_tcps; - if (ipvers == IPV4_VERSION) { + if (ira->ira_flags & IRAF_IS_IPV4) { ipha = (ipha_t *)mp->b_rptr; - connp->conn_send = ip_output; - connp->conn_recv = tcp_input; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; - sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, - lconnp->conn_zoneid, tcps->tcps_netstack); - if (tcp->tcp_recvdstaddr) { + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); + + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &sin6d.sin6_addr); - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; + sin6d.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; sin6d.sin6_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), @@ -4534,24 +4050,18 @@ 
tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { ip6h = (ip6_t *)mp->b_rptr; - connp->conn_send = ip_output_v6; - connp->conn_recv = tcp_input; - - connp->conn_bound_source_v6 = ip6h->ip6_dst; - connp->conn_srcv6 = ip6h->ip6_dst; - connp->conn_remv6 = ip6h->ip6_src; - - /* db_cksumstuff is set at ip_fanout_tcp_v6 */ - ifindex = (int)DB_CKSUMSTUFF(mp); - DB_CKSUMSTUFF(mp) = 0; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_laddr_v6 = ip6h->ip6_dst; + connp->conn_faddr_v6 = ip6h->ip6_src; + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - lconnp->conn_zoneid, tcps->tcps_netstack); + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { /* Pass up the scope_id of remote addr */ @@ -4559,13 +4069,16 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { sin6.sin6_scope_id = 0; } - if (tcp->tcp_recvdstaddr) { + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - sin6.sin6_addr = ip6h->ip6_dst; - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; - sin6d.sin6_family = AF_INET; + sin6.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; + sin6d.sin6_family = AF_INET6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) + sin6d.sin6_scope_id = ifindex; + tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), (char *)&tcp, (t_scalar_t)sizeof (intptr_t), @@ -4579,194 +4092,40 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } } - if (tpi_mp == NULL) - return (ENOMEM); - - connp->conn_fport = *(uint16_t *)tcph->th_lport; - 
connp->conn_lport = *(uint16_t *)tcph->th_fport; - connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); - connp->conn_fully_bound = B_FALSE; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; - tcp->tcp_mss = tcps->tcps_mss_def_ipv6; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - if (ipvers == IPV4_VERSION) { - if ((err = tcp_header_init_ipv4(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - ASSERT(tcp->tcp_ipha != NULL); - } else { - /* ifindex must be already set */ - ASSERT(ifindex != 0); - - if (ltcp->tcp_bound_if != 0) - tcp->tcp_bound_if = ltcp->tcp_bound_if; - else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - tcp->tcp_bound_if = ifindex; - - tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; - tcp->tcp_recvifindex = 0; - tcp->tcp_recvhops = 0xffffffffU; - ASSERT(tcp->tcp_ip6h != NULL); - } - - tcp->tcp_lport = ltcp->tcp_lport; - - if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { - if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { - /* - * Listener had options of some sort; eager inherits. - * Free up the eager template and allocate one - * of the right size. 
- */ - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, - KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - freemsg(tpi_mp); - return (ENOMEM); - } - tcp->tcp_iphc_len = ltcp->tcp_iphc_len; - tcp->tcp_hdr_grown = B_TRUE; - } - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; - tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; - - /* - * Copy the IP+TCP header template from listener to eager - */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - if (tcp->tcp_ipversion == IPV6_VERSION) { - if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == - IPPROTO_RAW) { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc + - sizeof (ip6i_t)); - } else { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc); - } - tcp->tcp_ipha = NULL; - } else { - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - } - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - } else { - /* - * only valid case when ipversion of listener and - * eager differ is when listener is IPv6 and - * eager is IPv4. - * Eager header template has been initialized to the - * maximum v4 header sizes, which includes space for - * TCP and IP options. 
- */ - ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && - (tcp->tcp_ipversion == IPV4_VERSION)); - ASSERT(tcp->tcp_iphc_len >= - TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - /* copy IP header fields individually */ - tcp->tcp_ipha->ipha_ttl = - ltcp->tcp_ip6h->ip6_hops; - bcopy(ltcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_lport, sizeof (ushort_t)); - } - - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, - sizeof (in_port_t)); - - if (ltcp->tcp_lport == 0) { - tcp->tcp_lport = *(in_port_t *)tcph->th_fport; - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, - sizeof (in_port_t)); - } - - if (tcp->tcp_ipversion == IPV4_VERSION) { - ASSERT(ipha != NULL); - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - } else { - ASSERT(ip6h != NULL); - tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; - tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; - } - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. 
- */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); + return (tpi_mp); } - -int -tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp) +/* Handle a SYN on an AF_INET socket */ +mblk_t * +tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; sin_t sin; mblk_t *tpi_mp = NULL; - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + ipha_t *ipha; + + ASSERT(ira->ira_flags & IRAF_IS_IPV4); + ipha = (ipha_t *)mp->b_rptr; + + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; - if (ltcp->tcp_recvdstaddr) { + if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { sin_t sind; sind = sin_null; - sind.sin_addr.s_addr = ipha->ipha_dst; - sind.sin_port = *(uint16_t *)tcph->th_fport; + sind.sin_addr.s_addr = connp->conn_laddr_v4; + sind.sin_port = connp->conn_lport; sind.sin_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sind, sizeof (sin_t), (char *)&tcp, @@ -4779,214 +4138,8 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, (t_scalar_t)ltcp->tcp_conn_req_seqnum); } - if (tpi_mp == NULL) { - return (ENOMEM); - } - - connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); - connp->conn_send = ip_output; - connp->conn_recv 
= tcp_input; - connp->conn_fully_bound = B_FALSE; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); - connp->conn_fport = *(uint16_t *)tcph->th_lport; - connp->conn_lport = *(uint16_t *)tcph->th_fport; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - /* - * Let's make sure that eager tcp template has enough space to - * copy IPv4 listener's tcp template. Since the conn_t structure is - * preserved and tcp_iphc_len is also preserved, an eager conn_t may - * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or - * more (in case of re-allocation of conn_t with tcp-IPv6 template with - * extension headers or with ip6i_t struct). Note that bcopy() below - * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ - * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
- */ - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); - - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ttl = ltcp->tcp_ttl; - tcp->tcp_tos = ltcp->tcp_tos; - - /* Copy the IP+TCP header template from listener to eager */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - - /* Initialize the IP addresses and Ports */ - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. - */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); -} - -/* - * sets up conn for ipsec. - * if the first mblk is M_CTL it is consumed and mpp is updated. - * in case of error mpp is freed. 
- */ -conn_t * -tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) -{ - conn_t *connp = tcp->tcp_connp; - conn_t *econnp; - squeue_t *new_sqp; - mblk_t *first_mp = *mpp; - mblk_t *mp = *mpp; - boolean_t mctl_present = B_FALSE; - uint_t ipvers; - - econnp = tcp_get_conn(sqp, tcp->tcp_tcps); - if (econnp == NULL) { - freemsg(first_mp); - return (NULL); - } - if (DB_TYPE(mp) == M_CTL) { - if (mp->b_cont == NULL || - mp->b_cont->b_datap->db_type != M_DATA) { - freemsg(first_mp); - return (NULL); - } - mp = mp->b_cont; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { - freemsg(first_mp); - return (NULL); - } - - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - mctl_present = B_TRUE; - } else { - ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); - mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - } - - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - - ASSERT(OK_32PTR(mp->b_rptr)); - ipvers = IPH_HDR_VERSION(mp->b_rptr); - if (ipvers == IPV4_VERSION) { - uint16_t *up; - uint32_t ports; - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; - up = (uint16_t *)((uchar_t *)ipha + - IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, - ipha->ipha_dst, ipha->ipha_src, ports); - } else { - uint16_t *up; - uint32_t ports; - uint16_t ip_hdr_len; - uint8_t *nexthdrp; - ip6_t *ip6h; - tcph_t *tcph; - - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_TCP) { - ip_hdr_len = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, - &nexthdrp) || *nexthdrp != IPPROTO_TCP) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - up = (uint16_t *)tcph->th_lport; - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, - ip6h->ip6_dst, ip6h->ip6_src, ports); - } - - /* - * The caller already ensured that there is a sqp present. 
- */ - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - - if (connp->conn_policy != NULL) { - ipsec_in_t *ii; - ii = (ipsec_in_t *)(first_mp->b_rptr); - ASSERT(ii->ipsec_in_policy == NULL); - IPPH_REFHOLD(connp->conn_policy); - ii->ipsec_in_policy = connp->conn_policy; - - first_mp->b_datap->db_type = IPSEC_POLICY_SET; - if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - } - - if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - - /* - * If we know we have some policy, pass the "IPSEC" - * options size TCP uses this adjust the MSS. - */ - econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); - if (mctl_present) { - freeb(first_mp); - *mpp = mp; - } - - return (econnp); + return (tpi_mp); } /* @@ -5002,10 +4155,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * connection sitting in the freelist. Obviously, this buys us * performance. * - * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request - * has multiple disadvantages - tying up the squeue during alloc, and the - * fact that IPSec policy initialization has to happen here which - * requires us sending a M_CTL and checking for it i.e. real ugliness. + * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener + * has multiple disadvantages - tying up the squeue during alloc. 
* But allocating the conn/tcp in IP land is also not the best since * we can't check the 'q' and 'q0' which are protected by squeue and * blindly allocate memory which might have to be freed here if we are @@ -5050,9 +4201,15 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) ns = tcps->tcps_netstack; netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); ipcl_globalhash_insert(connp); + + connp->conn_ixa->ixa_notify_cookie = tcp; + ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); + connp->conn_recv = tcp_input_data; + ASSERT(connp->conn_recvicmp == tcp_icmp_input); + ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); @@ -5075,62 +4232,20 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); - return ((void *)connp); -} + connp->conn_recv = tcp_input_data; + connp->conn_recvicmp = tcp_icmp_input; + connp->conn_verifyicmp = tcp_verifyicmp; -/* - * Update the cached label for the given tcp_t. This should be called once per - * connection, and before any packets are sent or tcp_process_options is - * invoked. Returns B_FALSE if the correct label could not be constructed. 
- */ -static boolean_t -tcp_update_label(tcp_t *tcp, const cred_t *cr) -{ - conn_t *connp = tcp->tcp_connp; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - uchar_t optbuf[IP_MAX_OPT_LENGTH]; - int added; - - if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - - added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { - tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; - added = tsol_prepend_option(optbuf, tcp->tcp_ipha, - tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *) - ((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - } - } else { - uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; - - if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - if (tsol_update_sticky(&tcp->tcp_sticky_ipp, - &tcp->tcp_label_len, optbuf) != 0) - return (B_FALSE); - if (tcp_build_hdrs(tcp) != 0) - return (B_FALSE); - } - - connp->conn_ulp_labeled = 1; + /* + * Register tcp_notify to listen to capability changes detected by IP. + * This upcall is made in the context of the call to conn_ip_output + * thus it is inside the squeue. + */ + connp->conn_ixa->ixa_notify = tcp_notify; + connp->conn_ixa->ixa_notify_cookie = tcp; - return (B_TRUE); + return ((void *)connp); } /* BEGIN CSTYLED */ @@ -5140,7 +4255,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * ======================= * * The eager is now established in its own perimeter as soon as SYN is - * received in tcp_conn_request(). When sockfs receives conn_ind, it + * received in tcp_input_listener(). When sockfs receives conn_ind, it * completes the accept processing on the acceptor STREAM. 
The sending * of conn_ind part is common for both sockfs listener and a TLI/XTI * listener but a TLI/XTI listener completes the accept processing @@ -5149,29 +4264,28 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Common control flow for 3 way handshake: * ---------------------------------------- * - * incoming SYN (listener perimeter) -> tcp_rput_data() - * -> tcp_conn_request() + * incoming SYN (listener perimeter) -> tcp_input_listener() * - * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() + * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() * * Sockfs ACCEPT Path: * ------------------- * - * open acceptor stream (tcp_open allocates tcp_wput_accept() + * open acceptor stream (tcp_open allocates tcp_tli_accept() * as STREAM entry point) * - * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() + * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() * - * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager + * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager * association (we are not behind eager's squeue but sockfs is protecting us * and no one knows about this stream yet. The STREAMS entry point q->q_info * is changed to point at tcp_wput(). * - * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to + * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to * listener (done on listener's perimeter). * - * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish + * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish * accept. * * TLI/XTI client ACCEPT path: @@ -5179,8 +4293,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * * soaccept() sends T_CONN_RES on the listener STREAM. * - * tcp_accept() -> tcp_accept_swap() complete the processing and send - * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 
+ * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send + * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). * * Locks: * ====== @@ -5191,7 +4305,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Referencing: * ============ * - * 1) We start out in tcp_conn_request by eager placing a ref on + * 1) We start out in tcp_input_listener by eager placing a ref on * listener and listener adding eager to listeners->tcp_eager_next_q0. * * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before @@ -5249,51 +4363,71 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) /* * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. - * tcp_rput_data will not see any SYN packets. + * tcp_input_data will not see any packets for listeners since the listener + * has conn_recv set to tcp_input_listener. */ /* ARGSUSED */ void -tcp_conn_request(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; tcp_t *eager; - uint_t ipvers; - ipha_t *ipha; - ip6_t *ip6h; int err; conn_t *econnp = NULL; squeue_t *new_sqp; mblk_t *mp1; uint_t ip_hdr_len; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - cred_t *credp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; + tcp_stack_t *tcps = listener->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + uint_t flags; + mblk_t *tpi_mp; + uint_t ifindex = ira->ira_ruifindex; - if (tcp->tcp_state != TCPS_LISTEN) + ip_hdr_len = ira->ira_ip_hdr_length; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + flags = (unsigned int)tcpha->tha_flags & 0xFF; + + if (!(flags & TH_SYN)) { + if ((flags & TH_RST) || (flags & TH_URG)) { + freemsg(mp); + return; + } + if (flags & TH_ACK) { + /* Note this executes in listener's squeue */ + tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); + return; 
+ } + + freemsg(mp); + return; + } + + if (listener->tcp_state != TCPS_LISTEN) goto error2; - ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); + ASSERT(IPCL_IS_BOUND(lconnp)); - mutex_enter(&tcp->tcp_eager_lock); - if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { - mutex_exit(&tcp->tcp_eager_lock); + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { + mutex_exit(&listener->tcp_eager_lock); TCP_STAT(tcps, tcp_listendrop); BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_conn_request: listen backlog (max=%d) " + "tcp_input_listener: listen backlog (max=%d) " "overflow (%d pending) on %s", - tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); + listener->tcp_conn_req_max, + listener->tcp_conn_req_cnt_q, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } - if (tcp->tcp_conn_req_cnt_q0 >= - tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { + if (listener->tcp_conn_req_cnt_q0 >= + listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { /* * Q0 is full. Drop a pending half-open req from the queue * to make room for the new SYN req. Also mark the time we @@ -5303,83 +4437,127 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * be to set the "tcp_syn_defense" flag now. 
*/ TCP_STAT(tcps, tcp_listendropq0); - tcp->tcp_last_rcv_lbolt = lbolt64; - if (!tcp_drop_q0(tcp)) { - mutex_exit(&tcp->tcp_eager_lock); + listener->tcp_last_rcv_lbolt = lbolt64; + if (!tcp_drop_q0(listener)) { + mutex_exit(&listener->tcp_eager_lock); BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, - "tcp_conn_request: listen half-open queue " - "(max=%d) full (%d pending) on %s", + "tcp_input_listener: listen half-open " + "queue (max=%d) full (%d pending) on %s", tcps->tcps_conn_req_max_q0, - tcp->tcp_conn_req_cnt_q0, - tcp_display(tcp, NULL, + listener->tcp_conn_req_cnt_q0, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); /* - * IP adds STRUIO_EAGER and ensures that the received packet is - * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 - * link local address. If IPSec is enabled, db_struioflag has - * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); - * otherwise an error case if neither of them is set. + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - econnp = (conn_t *)tcp_get_conn(arg2, tcps); - if (econnp == NULL) - goto error2; - ASSERT(econnp->conn_netstack == connp->conn_netstack); - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { - /* - * mp is updated in tcp_get_ipsec_conn(). - */ - econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); - if (econnp == NULL) { - /* - * mp freed by tcp_get_ipsec_conn. 
- */ - return; - } - ASSERT(econnp->conn_netstack == connp->conn_netstack); - } else { + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; + + econnp = (conn_t *)tcp_get_conn(arg2, tcps); + if (econnp == NULL) goto error2; - } - ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(econnp->conn_netstack == lconnp->conn_netstack); + econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; + econnp->conn_ixa->ixa_sqp = new_sqp; + + econnp->conn_fport = tcpha->tha_lport; + econnp->conn_lport = tcpha->tha_fport; + + err = conn_inherit_parent(lconnp, econnp); + if (err != 0) + goto error3; - ipvers = IPH_HDR_VERSION(mp->b_rptr); - ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); ASSERT(OK_32PTR(mp->b_rptr)); - if (ipvers == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - ip_hdr_len = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } else { - ip6h = (ip6_t *)mp->b_rptr; - ip_hdr_len = ip_hdr_length_v6(mp, ip6h); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || + IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); - if (tcp->tcp_family == AF_INET) { - ASSERT(ipvers == IPV4_VERSION); - err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); + if (lconnp->conn_family == AF_INET) { + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); } else { - err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); + tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); } - if (err) + if (tpi_mp == NULL) goto error3; eager = econnp->conn_tcp; + eager->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(eager->tcp_connid); + + tcp_init_values(eager); + + ASSERT((econnp->conn_ixa->ixa_flags & + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); + + if (!tcps->tcps_dev_flow_ctl) + econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + + /* 
Prepare for diffing against previous packets */ + eager->tcp_recvifindex = 0; + eager->tcp_recvhops = 0xffffffffU; + + if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || + IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { + econnp->conn_incoming_ifindex = ifindex; + econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + econnp->conn_ixa->ixa_scopeid = ifindex; + } + } + + if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == + (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && + tcps->tcps_rev_src_routes) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ip_pkt_t *ipp = &econnp->conn_xmit_ipp; + + /* Source routing option copyover (reverse it) */ + err = ip_find_hdr_v4(ipha, ipp, B_TRUE); + if (err != 0) { + freemsg(tpi_mp); + goto error3; + } + ip_pkt_source_route_reverse_v4(ipp); + } + + ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!eager->tcp_tconnind_started); + /* + * If the SYN came with a credential, it's a loopback packet or a + * labeled packet; attach the credential to the TPI message. 
+ */ + if (ira->ira_cred != NULL) + mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); + + eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; + + /* Inherit the listener's SSL protection state */ + if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { + kssl_hold_ent(eager->tcp_kssl_ent); + eager->tcp_kssl_pending = B_TRUE; + } + + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + econnp->conn_flags |= IPCL_NONSTR; + } + ASSERT(eager->tcp_ordrel_mp == NULL); if (!IPCL_IS_NONSTR(econnp)) { @@ -5392,127 +4570,103 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) goto error3; } - /* Inherit various TCP parameters from the listener */ - eager->tcp_naglim = tcp->tcp_naglim; - eager->tcp_first_timer_threshold = tcp->tcp_first_timer_threshold; - eager->tcp_second_timer_threshold = tcp->tcp_second_timer_threshold; - - eager->tcp_first_ctimer_threshold = tcp->tcp_first_ctimer_threshold; - eager->tcp_second_ctimer_threshold = tcp->tcp_second_ctimer_threshold; - /* - * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. - * If it does not, the eager's receive window will be set to the - * listener's receive window later in this function. + * Now that the IP addresses and ports are setup in econnp we + * can do the IPsec policy work. */ - eager->tcp_rwnd = 0; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (lconnp->conn_policy != NULL) { + /* + * Inherit the policy from the listener; use + * actions from ira + */ + if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { + CONN_DEC_REF(econnp); + freemsg(mp); + goto error3; + } + } + } - /* - * Inherit listener's tcp_init_cwnd. Need to do this before - * calling tcp_process_options() where tcp_mss_set() is called - * to set the initial cwnd. 
- */ - eager->tcp_init_cwnd = tcp->tcp_init_cwnd; + /* Inherit various TCP parameters from the listener */ + eager->tcp_naglim = listener->tcp_naglim; + eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; + eager->tcp_second_timer_threshold = + listener->tcp_second_timer_threshold; + eager->tcp_first_ctimer_threshold = + listener->tcp_first_ctimer_threshold; + eager->tcp_second_ctimer_threshold = + listener->tcp_second_ctimer_threshold; /* - * Zones: tcp_adapt_ire() and tcp_send_data() both need the - * zone id before the accept is completed in tcp_wput_accept(). + * tcp_set_destination() may set tcp_rwnd according to the route + * metrics. If it does not, the eager's receive window will be set + * to the listener's receive window later in this function. */ - econnp->conn_zoneid = connp->conn_zoneid; - econnp->conn_allzones = connp->conn_allzones; - - /* Copy nexthop information from listener to eager */ - if (connp->conn_nexthop_set) { - econnp->conn_nexthop_set = connp->conn_nexthop_set; - econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; - } + eager->tcp_rwnd = 0; /* - * TSOL: tsol_input_proc() needs the eager's cred before the - * eager is accepted + * Inherit listener's tcp_init_cwnd. Need to do this before + * calling tcp_process_options() which set the initial cwnd. */ - econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; - crhold(credp); + eager->tcp_init_cwnd = listener->tcp_init_cwnd; - ASSERT(econnp->conn_effective_cred == NULL); if (is_system_labeled()) { - cred_t *cr; - ts_label_t *tsl; - - /* - * If this is an MLP connection or a MAC-Exempt connection - * with an unlabeled node, packets are to be - * exchanged using the security label of the received - * SYN packet instead of the server application's label. 
- */ - if ((cr = msg_getcred(mp, NULL)) != NULL && - (tsl = crgetlabel(cr)) != NULL && - (connp->conn_mlp_type != mlptSingle || - (connp->conn_mac_mode != CONN_MAC_AWARE && - (tsl->tsl_flags & TSLF_UNLABELED)))) { - if ((econnp->conn_effective_cred = - copycred_from_tslabel(econnp->conn_cred, - tsl, KM_NOSLEEP)) != NULL) { - DTRACE_PROBE2( - syn_accept_peerlabel, - conn_t *, econnp, cred_t *, - econnp->conn_effective_cred); - } else { - DTRACE_PROBE3( - tx__ip__log__error__set__eagercred__tcp, - char *, - "SYN mp(1) label on eager connp(2) failed", - mblk_t *, mp, conn_t *, econnp); - goto error3; - } + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + if ((lconnp->conn_mlp_type != mlptSingle || + lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * SYN packet instead of the server application's label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. + */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + DTRACE_PROBE2(mlp_syn_accept, conn_t *, + econnp, ts_label_t *, ixa->ixa_tsl) } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); DTRACE_PROBE2(syn_accept, conn_t *, - econnp, cred_t *, econnp->conn_cred) + econnp, ts_label_t *, ixa->ixa_tsl) } - /* - * Verify the destination is allowed to receive packets - * at the security label of the SYN-ACK we are generating. - * tsol_check_dest() may create a new effective cred for - * this connection with a modified label or label flags. 
+ * conn_connect() called from tcp_set_destination will verify + * the destination is allowed to receive packets at the + * security label of the SYN-ACK we are generating. As part of + * that, tsol_check_dest() may create a new effective label for + * this connection. + * Finally conn_connect() will call conn_update_label. + * All that remains for TCP to do is to call + * conn_build_hdr_template which is done as part of + * tcp_set_destination. */ - if (IN6_IS_ADDR_V4MAPPED(&econnp->conn_remv6)) { - uint32_t dst; - IN6_V4MAPPED_TO_IPADDR(&econnp->conn_remv6, dst); - err = tsol_check_dest(CONN_CRED(econnp), &dst, - IPV4_VERSION, B_FALSE, &cr); - } else { - err = tsol_check_dest(CONN_CRED(econnp), - &econnp->conn_remv6, IPV6_VERSION, - B_FALSE, &cr); - } - if (err != 0) - goto error3; - if (cr != NULL) { - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = cr; - } - - /* - * Generate the security label to be used in the text of - * this connection's outgoing packets. - */ - if (!tcp_update_label(eager, CONN_CRED(econnp))) { - DTRACE_PROBE3( - tx__ip__log__error__connrequest__tcp, - char *, "eager connp(1) label on SYN mp(2) failed", - conn_t *, econnp, mblk_t *, mp); - goto error3; - } } + /* + * Since we will clear tcp_listener before we clear tcp_detached + * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress + * so we can tell a TCP_DETACHED_NONEAGER apart. + */ eager->tcp_hard_binding = B_TRUE; tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ - TCP_BIND_HASH(eager->tcp_lport)], eager, 0); + TCP_BIND_HASH(econnp->conn_lport)], eager, 0); - CL_INET_CONNECT(connp, eager, B_FALSE, err); + CL_INET_CONNECT(econnp, B_FALSE, err); if (err != 0) { tcp_bind_hash_remove(eager); goto error3; @@ -5528,32 +4682,27 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) SOCK_CONNID_BUMP(eager->tcp_connid); /* - * There should be no ire in the mp as we are being called after - * receiving the SYN. 
- */ - ASSERT(tcp_ire_mp(&mp) == NULL); - - /* - * Adapt our mss, ttl, ... according to information provided in IRE. + * Adapt our mss, ttl, ... based on the remote address. */ - if (tcp_adapt_ire(eager, NULL) == 0) { + if (tcp_set_destination(eager) != 0) { + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); /* Undo the bind_hash_insert */ tcp_bind_hash_remove(eager); goto error3; } /* Process all TCP options. */ - tcp_process_options(eager, tcph); + tcp_process_options(eager, tcpha); /* Is the other end ECN capable? */ if (tcps->tcps_ecn_permitted >= 1 && - (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { + (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { eager->tcp_ecn_ok = B_TRUE; } /* - * listeners tcp_recv_hiwater should be the default window size or a + * The listener's conn_rcvbuf should be the default window size or a * window size changed via SO_RCVBUF option. First round up the * eager's tcp_rwnd to the nearest MSS. Then find out the window * scale option value if needed. Call tcp_rwnd_set() to finish the @@ -5563,7 +4712,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * we should not inherit receive window size from listener. */ eager->tcp_rwnd = MSS_ROUNDUP( - (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater: + (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : eager->tcp_rwnd), eager->tcp_mss); if (eager->tcp_snd_ws_ok) tcp_set_ws_value(eager); @@ -5575,77 +4724,46 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) */ (void) tcp_rwnd_set(eager, eager->tcp_rwnd); - /* - * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ - * via soaccept()->soinheritoptions() which essentially applies - * all the listener options to the new STREAM. The options that we - * need to take care of are: - * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, - * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, - * SO_SNDBUF, SO_RCVBUF. - * - * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 
- * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When - * tcp_maxpsz_set() gets called later from - * tcp_accept_finish(), the option takes effect. - * - */ - /* Set the TCP options */ - eager->tcp_recv_lowater = tcp->tcp_recv_lowater; - eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; - eager->tcp_dgram_errind = tcp->tcp_dgram_errind; - eager->tcp_oobinline = tcp->tcp_oobinline; - eager->tcp_reuseaddr = tcp->tcp_reuseaddr; - eager->tcp_broadcast = tcp->tcp_broadcast; - eager->tcp_useloopback = tcp->tcp_useloopback; - eager->tcp_dontroute = tcp->tcp_dontroute; - eager->tcp_debug = tcp->tcp_debug; - eager->tcp_linger = tcp->tcp_linger; - eager->tcp_lingertime = tcp->tcp_lingertime; - if (tcp->tcp_ka_enabled) - eager->tcp_ka_enabled = 1; - - ASSERT(eager->tcp_recv_hiwater != 0 && - eager->tcp_recv_hiwater == eager->tcp_rwnd); - - /* Set the IP options */ - econnp->conn_broadcast = connp->conn_broadcast; - econnp->conn_loopback = connp->conn_loopback; - econnp->conn_dontroute = connp->conn_dontroute; - econnp->conn_reuseaddr = connp->conn_reuseaddr; + ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && + eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); + + ASSERT(econnp->conn_rcvbuf != 0 && + econnp->conn_rcvbuf == eager->tcp_rwnd); /* Put a ref on the listener for the eager. 
*/ - CONN_INC_REF(connp); - mutex_enter(&tcp->tcp_eager_lock); - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; - eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; - tcp->tcp_eager_next_q0 = eager; - eager->tcp_eager_prev_q0 = tcp; + CONN_INC_REF(lconnp); + mutex_enter(&listener->tcp_eager_lock); + listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; + eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; + listener->tcp_eager_next_q0 = eager; + eager->tcp_eager_prev_q0 = listener; /* Set tcp_listener before adding it to tcp_conn_fanout */ - eager->tcp_listener = tcp; - eager->tcp_saved_listener = tcp; + eager->tcp_listener = listener; + eager->tcp_saved_listener = listener; /* * Tag this detached tcp vector for later retrieval * by our listener client in tcp_accept(). */ - eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; - tcp->tcp_conn_req_cnt_q0++; - if (++tcp->tcp_conn_req_seqnum == -1) { + eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; + listener->tcp_conn_req_cnt_q0++; + if (++listener->tcp_conn_req_seqnum == -1) { /* * -1 is "special" and defined in TPI as something * that should never be used in T_CONN_IND */ - ++tcp->tcp_conn_req_seqnum; + ++listener->tcp_conn_req_seqnum; } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); - if (tcp->tcp_syn_defense) { + if (listener->tcp_syn_defense) { /* Don't drop the SYN that comes from a good IP source */ - ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); - if (addr_cache != NULL && eager->tcp_remote == - addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { + ipaddr_t *addr_cache; + + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL && econnp->conn_faddr_v4 == + addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { eager->tcp_dontdrop = B_TRUE; } } @@ -5655,14 +4773,14 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * as we do that, we expose the eager to the classifier and * should not touch any field outside 
the eager's perimeter. * So do all the work necessary before inserting the eager - * in its own perimeter. Be optimistic that ipcl_conn_insert() + * in its own perimeter. Be optimistic that conn_connect() * will succeed but undo everything if it fails. */ - seg_seq = ABE32_TO_U32(tcph->th_seq); + seg_seq = ntohl(tcpha->tha_seq); eager->tcp_irs = seg_seq; eager->tcp_rack = seg_seq; eager->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); + eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); eager->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, @@ -5677,24 +4795,10 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) } /* - * Note that in theory this should use the current pid - * so that getpeerucred on the client returns the actual listener - * that does accept. But accept() hasn't been called yet. We could use - * the pid of the process that did bind/listen on the server. - * However, with common usage like inetd() the bind/listen can be done - * by a different process than the accept(). - * Hence we do the simple thing of using the open pid here. - * Note that db_credp is set later in tcp_send_data(). - */ - mblk_setcred(mp1, credp, tcp->tcp_cpid); - eager->tcp_cpid = tcp->tcp_cpid; - eager->tcp_open_time = lbolt64; - - /* * We need to start the rto timer. In normal case, we start * the timer after sending the packet on the wire (or at * least believing that packet was sent by waiting for - * CALL_IP_WPUT() to return). Since this is the first packet + * conn_ip_output() to return). Since this is the first packet * being sent on the wire for the eager, our initial tcp_rto * is at least tcp_rexmit_interval_min which is a fairly * large value to allow the algorithm to adjust slowly to large @@ -5716,7 +4820,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * ensure against an eager close race. 
*/ - CONN_INC_REF(eager->tcp_connp); + CONN_INC_REF(econnp); TCP_TIMER_RESTART(eager, eager->tcp_rto); @@ -5724,22 +4828,16 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * Insert the eager in its own perimeter now. We are ready to deal * with any packets on eager. */ - if (eager->tcp_ipversion == IPV4_VERSION) { - if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { - goto error; - } - } else { - if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { - goto error; - } - } - - /* mark conn as fully-bound */ - econnp->conn_fully_bound = B_TRUE; + if (ipcl_conn_insert(econnp) != 0) + goto error; - /* Send the SYN-ACK */ - tcp_send_data(eager, eager->tcp_wq, mp1); - CONN_DEC_REF(eager->tcp_connp); + /* + * Send the SYN-ACK. Can't use tcp_send_data since we can't update + * pmtu etc; we are not on the eager's squeue + */ + ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); + (void) conn_ip_output(mp1, econnp->conn_ixa); + CONN_DEC_REF(econnp); freemsg(mp); return; @@ -5749,7 +4847,7 @@ error: TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); mp1 = &eager->tcp_closemp; SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, - econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2); + econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); /* * If a connection already exists, send the mp to that connections so @@ -5757,7 +4855,7 @@ error: */ ipst = tcps->tcps_netstack->netstack_ip; - if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) { + if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { if (!IPCL_IS_CONNECTED(econnp)) { /* * Something bad happened. 
ipcl_conn_insert() @@ -5772,8 +4870,8 @@ error: CONN_DEC_REF(econnp); freemsg(mp); } else { - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, - tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1); + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, + econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); } } else { /* Nobody wants this packet */ @@ -5803,18 +4901,21 @@ error2: * very first time and there is no attempt to rebind them. */ void -tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; squeue_t *new_sqp; uint32_t conn_flags; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - } else { - goto done; - } + /* + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. + */ + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; if (connp->conn_fanout == NULL) goto done; @@ -5849,6 +4950,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) if (connp->conn_sqp != new_sqp) { while (connp->conn_sqp != new_sqp) (void) casptr(&connp->conn_sqp, sqp, new_sqp); + /* No special MT issues for outbound ixa_sqp hint */ + connp->conn_ixa->ixa_sqp = new_sqp; } do { @@ -5860,49 +4963,47 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) mutex_exit(&connp->conn_fanout->connf_lock); mutex_exit(&connp->conn_lock); + + /* + * Assume we have picked a good squeue for the listener. Make + * subsequent SYNs not try to change the squeue. 
+ */ + connp->conn_recv = tcp_input_listener; } done: if (connp->conn_sqp != sqp) { CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); + ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); } else { - tcp_conn_request(connp, mp, sqp); + tcp_input_listener(connp, mp, sqp, ira); } } /* * Successful connect request processing begins when our client passes - * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes - * our T_OK_ACK reply message upstream. The control flow looks like this: - * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP - * upstream <- tcp_rput() <- IP + * a T_CONN_REQ message into tcp_wput(), which performs function calls into + * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). + * * After various error checks are completed, tcp_tpi_connect() lays - * the target address and port into the composite header template, - * preallocates the T_OK_ACK reply message, construct a full 12 byte bind - * request followed by an IRE request, and passes the three mblk message - * down to IP looking like this: - * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client - * Processing continues in tcp_rput() when we receive the following message: - * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client - * After consuming the first two mblks, tcp_rput() calls tcp_timer(), - * to fire off the connection request, and then passes the T_OK_ACK mblk - * upstream that we filled in below. There are, of course, numerous - * error conditions along the way which truncate the processing described - * above. + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we prepare to send the SYN packet, and then + * send up the T_OK_ACK reply message. 
*/ static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) { sin_t *sin; - queue_t *q = tcp->tcp_wq; struct T_conn_req *tcr; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; pid_t cpid; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* * All Solaris components should pass a db_credp @@ -5944,7 +5045,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. */ switch (tcr->DEST_length) { default: @@ -6022,7 +5123,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) break; } - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { tcp_err_ack(tcp, mp, TSYSERR, error); return; @@ -6111,7 +5212,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) /* return error ack and blow away saved option results if any */ connect_failed: if (mp != NULL) - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); else { tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); @@ -6121,20 +5222,19 @@ connect_failed: /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. 
*/ static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, - uint_t srcid, cred_t *cr, pid_t pid) + uint_t srcid) { - tcph_t *tcph; - mblk_t *mp; - ipaddr_t dstaddr = *dstaddrp; - int32_t oldstate; - uint16_t lport; - int error = 0; + ipaddr_t dstaddr = *dstaddrp; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + int error; - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); + ASSERT(connp->conn_ipversion == IPV4_VERSION); /* Check for attempt to connect to INADDR_ANY */ if (dstaddr == INADDR_ANY) { @@ -6157,74 +5257,21 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { - ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, - tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, - tcp->tcp_ipha->ipha_src); + if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (dstaddr == tcp->tcp_ipha->ipha_src && - dstport == tcp->tcp_lport) { - error = -TBADADDR; - goto failed; - } + IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); + connp->conn_fport = dstport; /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * tsol_check_dest() may create a new effective cred for this - * connection with a modified label or label flags. 
- */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - &dstaddr, IPV4_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ipha->ipha_dst = dstaddr; - IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); - - /* - * Massage a source route if any putting the first hop - * in iph_dst. Compute a starting value for the checksum which - * takes into account that the original iph_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. - */ - tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack); - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; - - oldstate = tcp->tcp_state; - /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. 
*/ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6233,203 +5280,93 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } - } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp == NULL) { - tcp->tcp_state = oldstate; - error = ENOMEM; - goto failed; + if (lport == 0) + return (-TNOADDR); } - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. - * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ipha->ipha_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. 
*/ - tcp->tcp_connp->conn_recv = tcp_input; + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp, - IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport, - tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } else { - in6_addr_t v6src; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp, - IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); + tcp->tcp_state = TCPS_SYN_SENT; - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v4(connp)); } /* * Handle connect to IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. */ static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, - uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid) + uint32_t flowinfo, uint_t srcid, uint32_t scope_id) { - tcph_t *tcph; - mblk_t *mp; - ip6_rthdr_t *rth; - int32_t oldstate; - uint16_t lport; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - int error = 0; - conn_t *connp = tcp->tcp_connp; + int error; - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); /* * If we're here, it means that the destination address is a native - * IPv6 address. 
Return an error if tcp_ipversion is not IPv6. A + * IPv6 address. Return an error if conn_ipversion is not IPv6. A * reason why it might not be IPv6 is if the socket was bound to an * IPv4-mapped IPv6 address. */ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) return (-TBADADDR); - } /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. */ - if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { + if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) *dstaddrp = ipv6_loopback; - } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, - connp->conn_zoneid, tcps->tcps_netstack); - tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; - } - - /* - * Take care of the scope_id now and add ip6i_t - * if ip6i_t is not already allocated through TCP - * sticky options. At this point tcp_ip6h does not - * have dst info, thus use dstaddrp. - */ - if (scope_id != 0 && - IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - ip6i_t *ip6i; - - ipp->ipp_ifindex = scope_id; - ip6i = (ip6i_t *)tcp->tcp_iphc; - - if ((ipp->ipp_fields & IPPF_HAS_IP6I) && - ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { - /* Already allocated */ - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = ipp->ipp_ifindex; - ipp->ipp_fields |= IPPF_SCOPE_ID; - } else { - int reterr; - - ipp->ipp_fields |= IPPF_SCOPE_ID; - if (ipp->ipp_fields & IPPF_HAS_IP6I) - ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - goto failed; - ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); - } - } - - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. 
In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && - (dstport == tcp->tcp_lport)) { - error = -TBADADDR; - goto failed; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * check_dest may create a new effective cred for this - * connection with a modified label or label flags. + * Take care of the scope_id now. */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - dstaddrp, IPV6_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ip6h->ip6_dst = *dstaddrp; - tcp->tcp_remote_v6 = *dstaddrp; - tcp->tcp_ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - /* - * Massage a routing header (if present) putting the first hop - * in ip6_dst. Compute a starting value for the checksum which - * takes into account that the original ip6_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. 
- */ - rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth, - tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); + if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scope_id; } else { - tcp->tcp_sum = 0; + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; + connp->conn_flowinfo = flowinfo; + connp->conn_faddr_v6 = *dstaddrp; + connp->conn_fport = dstport; - oldstate = tcp->tcp_state; /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. */ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6438,128 +5375,55 @@ tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } + if (lport == 0) + return (-TNOADDR); } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp != NULL) { - in6_addr_t v6src; - - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. 
- * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). - */ - tcp->tcp_connp->conn_recv = tcp_input; + /* + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ip6h->ip6_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. + */ + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP, - &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); - } - /* Error case */ - tcp->tcp_state = oldstate; - error = ENOMEM; + tcp->tcp_state = TCPS_SYN_SENT; -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v6(connp)); } /* - * We need a stream q for detached closing tcp connections - * to use. Our client hereby indicates that this q is the - * one to use. + * Disconnect + * Note that unlike other functions this returns a positive tli error + * when it fails; it never returns an errno. 
*/ -static void -tcp_def_q_set(tcp_t *tcp, mblk_t *mp) -{ - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - queue_t *q = tcp->tcp_wq; - tcp_stack_t *tcps = tcp->tcp_tcps; - -#ifdef NS_DEBUG - (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - mp->b_datap->db_type = M_IOCACK; - iocp->ioc_count = 0; - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = EALREADY; - } else { - int error = 0; - conn_t *connp = tcp->tcp_connp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - tcps->tcps_g_q = tcp->tcp_rq; - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = 0; - iocp->ioc_rval = 0; - /* - * We are passing tcp_sticky_ipp as NULL - * as it is not useful for tcp_default queue - * - * Set conn_recv just in case. - */ - tcp->tcp_connp->conn_recv = tcp_conn_request; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = IPPROTO_TCP; - - if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head != - NULL || (connp->conn_mac_mode != CONN_MAC_DEFAULT)) { - error = -TBADADDR; - } else { - connp->conn_srcv6 = ipv6_all_zeros; - ipcl_proto_insert_v6(connp, IPPROTO_TCP); - } - - (void) tcp_post_ip_bind(tcp, NULL, error, NULL, 0); - } - qreply(q, mp); -} - static int tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) { tcp_t *ltcp = NULL; - conn_t *connp; + conn_t *lconnp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Right now, upper modules pass down a T_DISCON_REQ to TCP, * when the stream is in BOUND state. Do not send a reset, * since the destination IP address is not valid, and it can * be the initialized value of all zeros (broadcast address). - * - * XXX There won't be any pending bind request to IP. 
*/ - if (tcp->tcp_state <= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_disconnect: bad state, %d", tcp->tcp_state); } @@ -6595,19 +5459,23 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) * If it used to be a listener, check to make sure no one else * has taken the port before switching back to LISTEN state. */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - connp = ipcl_lookup_listener_v4(tcp->tcp_lport, - tcp->tcp_ipha->ipha_src, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + if (connp->conn_ipversion == IPV4_VERSION) { + lconnp = ipcl_lookup_listener_v4(connp->conn_lport, + connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } else { - /* Allow tcp_bound_if listeners? */ - connp = ipcl_lookup_listener_v6(tcp->tcp_lport, - &tcp->tcp_ip6h->ip6_src, 0, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + uint_t ifindex = 0; + + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) + ifindex = connp->conn_ixa->ixa_scopeid; + + /* Allow conn_bound_if listeners? 
*/ + lconnp = ipcl_lookup_listener_v6(connp->conn_lport, + &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp), + ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } if (tcp->tcp_conn_req_max && ltcp == NULL) { tcp->tcp_state = TCPS_LISTEN; @@ -6616,7 +5484,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) tcp->tcp_state = TCPS_BOUND; } if (ltcp != NULL) - CONN_DEC_REF(ltcp->tcp_connp); + CONN_DEC_REF(lconnp); if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); } else if (old_state == TCPS_ESTABLISHED || @@ -6648,7 +5516,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) /* * Our client hereby directs us to reject the connection request - * that tcp_conn_request() marked with 'seqnum'. Rejection consists + * that tcp_input_listener() marked with 'seqnum'. Rejection consists * of sending the appropriate RST, not an ICMP error. */ static void @@ -6656,6 +5524,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) { t_scalar_t seqnum; int error; + conn_t *connp = tcp->tcp_connp; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { @@ -6669,11 +5538,11 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) else { if (tcp->tcp_state >= TCPS_ESTABLISHED) { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); } mp = mi_tpi_ok_ack_alloc(mp); - if (mp) - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -6695,6 +5564,7 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) in6_addr_t local, remote; char local_addrbuf[INET6_ADDRSTRLEN]; char remote_addrbuf[INET6_ADDRSTRLEN]; + conn_t *connp; if (sup_buf != NULL) buf = sup_buf; @@ -6703,6 +5573,8 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) if (tcp == NULL) return ("NULL_TCP"); + + connp = tcp->tcp_connp; switch (tcp->tcp_state) { case TCPS_CLOSED: cp = 
"TCP_CLOSED"; @@ -6750,32 +5622,32 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) } switch (format) { case DISP_ADDR_AND_PORT: - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { /* * Note that we use the remote address in the tcp_b * structure. This means that it will print out * the real destination address, not the next hop's * address if source routing is used. */ - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); + IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); + IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); } else { - local = tcp->tcp_ip_src_v6; - remote = tcp->tcp_remote_v6; + local = connp->conn_laddr_v6; + remote = connp->conn_faddr_v6; } (void) inet_ntop(AF_INET6, &local, local_addrbuf, sizeof (local_addrbuf)); (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, sizeof (remote_addrbuf)); (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", - local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, - ntohs(tcp->tcp_fport), cp); + local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, + ntohs(connp->conn_fport), cp); break; case DISP_PORT_ONLY: default: (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); break; } @@ -6788,26 +5660,24 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) * eager to disappear either by means of tcp_eager_blowoff() or * tcp_eager_cleanup() being called. tcp_eager_kill() can also be * called (via squeue) if the eager cannot be inserted in the - * fanout table in tcp_conn_request(). + * fanout table in tcp_input_listener(). 
*/ /* ARGSUSED */ void -tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) +tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *econnp = (conn_t *)arg; tcp_t *eager = econnp->conn_tcp; tcp_t *listener = eager->tcp_listener; - tcp_stack_t *tcps = eager->tcp_tcps; /* * We could be called because listener is closing. Since - * the eager is using listener's queue's, its not safe. - * Better use the default queue just to send the TH_RST - * out. + * the eager was using listener's queue's, we avoid + * using the listeners queues from now on. */ - ASSERT(tcps->tcps_g_q != NULL); - eager->tcp_rq = tcps->tcps_g_q; - eager->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(eager->tcp_detached); + econnp->conn_rq = NULL; + econnp->conn_wq = NULL; /* * An eager's conn_fanout will be NULL if it's a duplicate @@ -6828,7 +5698,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) * The eager has sent a conn_ind up to the * listener but listener decides to close * instead. We need to drop the extra ref - * placed on eager in tcp_rput_data() before + * placed on eager in tcp_input_data() before * sending the conn_ind to listener. 
*/ CONN_DEC_REF(econnp); @@ -6873,7 +5743,7 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); + eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); return (B_TRUE); } @@ -6901,7 +5771,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); } eager = eager->tcp_eager_next_q; @@ -6917,7 +5787,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, SQ_FILL, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP_Q0); } eager = eager->tcp_eager_next_q0; @@ -7008,7 +5878,7 @@ static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) { if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Shorthand to generate and send TPI error acks to our client */ @@ -7024,7 +5894,7 @@ tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, teackp->ERROR_prim = primitive; teackp->TLI_error = t_error; teackp->UNIX_error = sys_error; - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } } @@ -7194,8 +6064,9 @@ static void tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_family == AF_INET6) + if (connp->conn_family == AF_INET6) *tia = tcp_g_t_info_ack_v6; else *tia = tcp_g_t_info_ack; @@ -7203,7 +6074,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) tia->OPT_size = tcp_max_optsize; if (tcp->tcp_mss == 0) 
{ /* Not yet set - tcp_open does not set mss */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tia->TIDU_size = tcps->tcps_mss_def_ipv4; else tia->TIDU_size = tcps->tcps_mss_def_ipv6; @@ -7258,7 +6129,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp) tcap = (struct T_capability_ack *)mp->b_rptr; tcp_do_capability_ack(tcp, tcap, cap_bits1); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* @@ -7276,16 +6147,18 @@ tcp_info_req(tcp_t *tcp, mblk_t *mp) return; } tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Respond to the TPI addr request */ static void tcp_addr_req(tcp_t *tcp, mblk_t *mp) { - sin_t *sin; + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; + conn_t *connp = tcp->tcp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -7295,10 +6168,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) return; } - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp_addr_req_ipv6(tcp, ackmp); - return; - } taa = (struct T_addr_ack *)ackmp->b_rptr; bzero(taa, sizeof (struct T_addr_ack)); @@ -7307,110 +6176,38 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
*/ if (tcp->tcp_state >= TCPS_BOUND) { /* - * Fill in local address + * Fill in local address first */ - taa->LOCADDR_length = sizeof (sin_t); taa->LOCADDR_offset = sizeof (*taa); - - sin = (sin_t *)&taa[1]; - - /* Fill zeroes and then intialize non-zero fields */ - *sin = sin_null; - - sin->sin_family = AF_INET; - - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = tcp->tcp_remote; - sin->sin_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); -} - -/* Assumes that tcp_addr_req gets enough space and alignment */ -static void -tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) -{ - sin6_t *sin6; - struct T_addr_ack *taa; - - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - ASSERT(OK_32PTR(ackmp->b_rptr)); - ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + - 2 * sizeof (sin6_t)); - - taa = (struct T_addr_ack *)ackmp->b_rptr; - - bzero(taa, sizeof (struct T_addr_ack)); - ackmp->b_wptr = (uchar_t *)&taa[1]; - - taa->PRIM_type = T_ADDR_ACK; - ackmp->b_datap->db_type = M_PCPROTO; - - /* - * Note: Following code assumes 32 bit alignment of basic - * data structures like sin6_t and struct T_addr_ack. 
- */ - if (tcp->tcp_state >= TCPS_BOUND) { + if (tcp->tcp_state >= TCPS_SYN_RCVD) { /* - * Fill in local address + * Fill in Remote address */ - taa->LOCADDR_length = sizeof (sin6_t); - taa->LOCADDR_offset = sizeof (*taa); - - sin6 = (sin6_t *)&taa[1]; - *sin6 = sin6_null; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - sin6->sin6_port = tcp->tcp_lport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin6_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_flowinfo = - tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = tcp->tcp_remote_v6; - sin6->sin6_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); + ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); + putnext(tcp->tcp_connp->conn_rq, ackmp); } /* @@ -7420,19 +6217,19 @@ tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) static void tcp_reinit(tcp_t *tcp) { - mblk_t *mp; - int err; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; TCP_STAT(tcps, tcp_reinit_calls); /* tcp_reinit should never be called for detached tcp_t's */ ASSERT(tcp->tcp_listener == NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family 
== AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* Cancel outstanding timers */ tcp_timers_stop(tcp); @@ -7453,7 +6250,7 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -7494,7 +6291,7 @@ tcp_reinit(tcp_t *tcp) */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); - CL_INET_DISCONNECT(tcp->tcp_connp, tcp); + CL_INET_DISCONNECT(connp); /* * The connection can't be on the tcp_time_wait_head list @@ -7522,14 +6319,12 @@ tcp_reinit(tcp_t *tcp) * Reset/preserve other values */ tcp_reinit_values(tcp); - ipcl_hash_remove(tcp->tcp_connp); - conn_delete_ire(tcp->tcp_connp, NULL); + ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); tcp_ipsec_cleanup(tcp); - if (tcp->tcp_connp->conn_effective_cred != NULL) { - crfree(tcp->tcp_connp->conn_effective_cred); - tcp->tcp_connp->conn_effective_cred = NULL; - } + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; if (tcp->tcp_conn_req_max != 0) { /* @@ -7553,44 +6348,31 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; tcp->tcp_eager_next_drop_q0 = tcp; tcp->tcp_eager_prev_drop_q0 = tcp; - tcp->tcp_connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, - &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, tcp->tcp_lport); - } + /* + * Initially set conn_recv to tcp_input_listener_unbound to try + * to pick a good squeue for the listener when the first SYN + * arrives. 
tcp_input_listener_unbound sets it to + * tcp_input_listener on that first SYN. + */ + connp->conn_recv = tcp_input_listener_unbound; + + connp->conn_proto = IPPROTO_TCP; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + + (void) ipcl_bind_insert(connp); } else { tcp->tcp_state = TCPS_BOUND; } /* * Initialize to default values - * Can't fail since enough header template space already allocated - * at open(). - */ - err = tcp_init_values(tcp); - ASSERT(err == 0); - /* Restore state in tcp_tcph */ - bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; - else - tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; - /* - * Copy of the src addr. in tcp_t is needed in tcp_t - * since the lookup funcs can only lookup on tcp_t */ - tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; + tcp_init_values(tcp); ASSERT(tcp->tcp_ptpbhn != NULL); - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ? 
tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; } @@ -7606,6 +6388,7 @@ tcp_reinit_values(tcp) tcp_t *tcp; { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; #ifndef lint #define DONTCARE(x) @@ -7626,8 +6409,8 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_time_wait_prev == NULL); ASSERT(tcp->tcp_time_wait_expire == 0); PRESERVE(tcp->tcp_state); - PRESERVE(tcp->tcp_rq); - PRESERVE(tcp->tcp_wq); + PRESERVE(connp->conn_rq); + PRESERVE(connp->conn_wq); ASSERT(tcp->tcp_xmit_head == NULL); ASSERT(tcp->tcp_xmit_last == NULL); @@ -7638,26 +6421,32 @@ tcp_reinit_values(tcp) tcp->tcp_snxt = 0; /* Displayed in mib */ tcp->tcp_suna = 0; /* Displayed in mib */ tcp->tcp_swnd = 0; - DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ + DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ ASSERT(tcp->tcp_ibsegs == 0); ASSERT(tcp->tcp_obsegs == 0); - if (tcp->tcp_iphc != NULL) { - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; } + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); + DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_ipha); DONTCARE(tcp->tcp_ip6h); - DONTCARE(tcp->tcp_ip_hdr_len); - DONTCARE(tcp->tcp_tcph); - DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ + DONTCARE(tcp->tcp_tcpha); tcp->tcp_valid_bits = 0; - DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ tcp->tcp_last_rcv_lbolt = 0; 
@@ -7666,38 +6455,19 @@ tcp_reinit_values(tcp) tcp->tcp_urp_last_valid = 0; tcp->tcp_hard_binding = 0; - tcp->tcp_hard_bound = 0; - PRESERVE(tcp->tcp_cred); - PRESERVE(tcp->tcp_cpid); - PRESERVE(tcp->tcp_open_time); - PRESERVE(tcp->tcp_exclbind); tcp->tcp_fin_acked = 0; tcp->tcp_fin_rcvd = 0; tcp->tcp_fin_sent = 0; tcp->tcp_ordrel_done = 0; - tcp->tcp_debug = 0; - tcp->tcp_dontroute = 0; - tcp->tcp_broadcast = 0; - - tcp->tcp_useloopback = 0; - tcp->tcp_reuseaddr = 0; - tcp->tcp_oobinline = 0; - tcp->tcp_dgram_errind = 0; - tcp->tcp_detached = 0; - tcp->tcp_bind_pending = 0; - tcp->tcp_unbind_pending = 0; tcp->tcp_snd_ws_ok = B_FALSE; tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_linger = 0; - tcp->tcp_ka_enabled = 0; tcp->tcp_zero_win_probe = 0; tcp->tcp_loopback = 0; - tcp->tcp_refuse = 0; tcp->tcp_localnet = 0; tcp->tcp_syn_defense = 0; tcp->tcp_set_timer = 0; @@ -7707,19 +6477,12 @@ tcp_reinit_values(tcp) tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_snd_sack_ok = B_FALSE; - PRESERVE(tcp->tcp_recvdstaddr); tcp->tcp_hwcksum = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ - - tcp->tcp_mdt = B_FALSE; - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ tcp->tcp_conn_def_q0 = 0; tcp->tcp_ip_forward_progress = B_FALSE; - tcp->tcp_anon_priv_bind = 0; tcp->tcp_ecn_ok = B_FALSE; tcp->tcp_cwr = B_FALSE; @@ -7740,7 +6503,7 @@ tcp_reinit_values(tcp) tcp->tcp_ts_recent = 0; tcp->tcp_rnxt = 0; /* Displayed in mib */ DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ - tcp->tcp_if_mtu = 0; + tcp->tcp_initial_pmtu = 0; ASSERT(tcp->tcp_reass_head == NULL); ASSERT(tcp->tcp_reass_tail == NULL); @@ -7752,7 +6515,7 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_rcv_last_tail == NULL); ASSERT(tcp->tcp_rcv_cnt == 0); - DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ + DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 
DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ tcp->tcp_csuna = 0; @@ -7773,8 +6536,6 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_listener == NULL); - DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ @@ -7785,14 +6546,11 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_conn_req_max); PRESERVE(tcp->tcp_conn_req_seqnum); - DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ - tcp->tcp_lingertime = 0; - DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ ASSERT(tcp->tcp_urp_mp == NULL); ASSERT(tcp->tcp_urp_mark_mp == NULL); @@ -7811,16 +6569,16 @@ tcp_reinit_values(tcp) tcp->tcp_client_errno = 0; - DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ + DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ - tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ + connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ - PRESERVE(tcp->tcp_bound_source_v6); + PRESERVE(connp->conn_bound_addr_v6); tcp->tcp_last_sent_len = 0; tcp->tcp_dupack_cnt = 0; - tcp->tcp_fport = 0; /* Displayed in MIB */ - PRESERVE(tcp->tcp_lport); + connp->conn_fport = 0; /* Displayed in MIB */ + PRESERVE(connp->conn_lport); PRESERVE(tcp->tcp_acceptor_lockp); @@ -7828,16 +6586,18 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_acceptor_id); DONTCARE(tcp->tcp_ipsec_overhead); - PRESERVE(tcp->tcp_family); - if (tcp->tcp_family == AF_INET6) { + PRESERVE(connp->conn_family); + /* Remove any remnants of mapped address binding */ + if (connp->conn_family == AF_INET6) { + connp->conn_ipversion = IPV6_VERSION; 
tcp->tcp_mss = tcps->tcps_mss_def_ipv6; } else { + connp->conn_ipversion = IPV4_VERSION; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; } - PRESERVE(tcp->tcp_ipversion); /* Init in tcp_init_values */ - tcp->tcp_bound_if = 0; - tcp->tcp_ipv6_recvancillary = 0; + connp->conn_bound_if = 0; + connp->conn_recv_ancillary.crb_all = 0; tcp->tcp_recvifindex = 0; tcp->tcp_recvhops = 0; tcp->tcp_closed = 0; @@ -7854,19 +6614,18 @@ tcp_reinit_values(tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr = NULL; tcp->tcp_rthdrlen = 0; } ASSERT(tcp->tcp_rthdrlen == 0); - PRESERVE(tcp->tcp_drop_opt_ack_cnt); /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7902,35 +6661,17 @@ tcp_reinit_values(tcp) #undef PRESERVE } -/* - * Allocate necessary resources and initialize state vector. - * Guaranteed not to fail so that when an error is returned, - * the caller doesn't need to do any additional cleanup. 
- */ -int -tcp_init(tcp_t *tcp, queue_t *q) -{ - int err; - - tcp->tcp_rq = q; - tcp->tcp_wq = WR(q); - tcp->tcp_state = TCPS_IDLE; - if ((err = tcp_init_values(tcp)) != 0) - tcp_timers_stop(tcp); - return (err); -} - -static int +static void tcp_init_values(tcp_t *tcp) { - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO @@ -7953,7 +6694,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; tcp->tcp_snd_burst = TCP_CWND_INFINITE; - tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier; + tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; @@ -7966,10 +6707,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_naglim = tcps->tcps_naglim_def; - /* NOTE: ISS is now set in tcp_adapt_ire(). */ - - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + /* NOTE: ISS is now set in tcp_set_destination(). 
*/ /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7977,280 +6715,84 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_fused_sigurg = B_FALSE; tcp->tcp_loopback_peer = NULL; - /* Initialize the header template */ - if (tcp->tcp_family == AF_INET) { - err = tcp_header_init_ipv4(tcp); - } else { - err = tcp_header_init_ipv6(tcp); - } - if (err) - return (err); + /* We rebuild the header template on the next connect/conn_request */ + + connp->conn_mlp_type = mlptSingle; /* * Init the window scale to the max so tcp_rwnd_set() won't pare - * down tcp_rwnd. tcp_adapt_ire() will set the right value later. + * down tcp_rwnd. tcp_set_destination() will set the right value later. */ tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; - tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat; - tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat; - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; + tcp->tcp_rwnd = connp->conn_rcvbuf; tcp->tcp_cork = B_FALSE; /* - * Init the tcp_debug option. This value determines whether TCP + * Init the tcp_debug option if it wasn't already set. This value + * determines whether TCP * calls strlog() to print out debug messages. Doing this * initialization here means that this value is not inherited thru * tcp_reinit(). */ - tcp->tcp_debug = tcps->tcps_dbg; + if (!connp->conn_debug) + connp->conn_debug = tcps->tcps_dbg; tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; - - return (0); -} - -/* - * Initialize the IPv4 header. Loses any record of any IP options. - */ -static int -tcp_header_init_ipv4(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. 
- */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. - */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - mutex_exit(&connp->conn_lock); - - tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = sizeof (ipha_t); - tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); - tcp->tcp_ipha->ipha_version_and_hdr_length - = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; - tcp->tcp_ipha->ipha_ident = 0; - - tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_tos = 0; - tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; - tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; - - tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single pseudo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route once IP_OPTIONS is set. 
- */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); -} - -/* - * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. - */ -static int -tcp_header_init_ipv6(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - * Ensure that there is enough space to "downgrade" the tcp_t - * to an IPv4 tcp_t. This requires having space for a full load - * of IPv4 options, as well as a full load of TCP options - * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space - * than a v6 header and a TCP header with a full load of TCP options - * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). - * We want to avoid reallocation in the "downgraded" case when - * processing outbound IPv4 options. - */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. 
- */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - tcp->tcp_ipha = NULL; - mutex_exit(&connp->conn_lock); - - /* Initialize the header template */ - - tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); - tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; - tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit; - - tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single psuedo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route when IPV6_RTHDR is set. - */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); } /* At minimum we need 8 bytes in the TCP header for the lookup */ #define ICMP_MIN_TCP_HDR 8 /* - * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages + * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages * passed up by IP. The message is always received on the correct tcp_t. * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ -void -tcp_icmp_error(tcp_t *tcp, mblk_t *mp) +/* ARGSUSED2 */ +static void +tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - tcph_t *tcph; - boolean_t ipsec_mctl = B_FALSE; - boolean_t secure; - mblk_t *first_mp = mp; - int32_t new_mss; - uint32_t ratio; - size_t mp_size = MBLKL(mp); - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Assume IP provides aligned packets - otherwise toss */ - if (!OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } - - /* - * Since ICMP errors are normal data marked with M_CTL when sent - * to TCP or UDP, we have to look for a IPSEC_IN value to identify - * packets starting with an ipsec_info_t, see ipsec_info.h. - */ - if ((mp_size == sizeof (ipsec_info_t)) && - (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { - ASSERT(mp->b_cont != NULL); - mp = mp->b_cont; - /* IP should have done this */ - ASSERT(OK_32PTR(mp->b_rptr)); - mp_size = MBLKL(mp); - ipsec_mctl = B_TRUE; - } + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + tcpha_t *tcpha; + uint32_t seg_seq; + tcp_t *tcp = connp->conn_tcp; - /* - * Verify that we have a complete outer IP header. If not, drop it. - */ - if (mp_size < sizeof (ipha_t)) { -noticmpv4: - freemsg(first_mp); - return; - } + /* Assume IP provides aligned packets */ + ASSERT(OK_32PTR(mp->b_rptr)); + ASSERT((MBLKL(mp) >= sizeof (ipha_t))); - ipha = (ipha_t *)mp->b_rptr; /* * Verify IP version. Anything other than IPv4 or IPv6 packet is sent * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
*/ - switch (IPH_HDR_VERSION(ipha)) { - case IPV6_VERSION: - tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); + if (!(ira->ira_flags & IRAF_IS_IPV4)) { + tcp_icmp_error_ipv6(tcp, mp, ira); return; - case IPV4_VERSION: - break; - default: - goto noticmpv4; } /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); + iph_hdr_length = ira->ira_ip_hdr_length; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; /* - * If we don't have the correct outer IP header length or if the ULP - * is not IPPROTO_ICMP or if we don't have a complete inner IP header - * send it upstream. + * If we don't have the correct outer IP header length + * or if we don't have a complete inner IP header + * drop it. */ if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_ICMP || (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { - goto noticmpv4; +noticmpv4: + freemsg(mp); + return; } ipha = (ipha_t *)&icmph[1]; /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)((char *)ipha + iph_hdr_length); + tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); /* * If we don't have the correct inner IP header length or if the ULP * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR @@ -8258,166 +6800,20 @@ noticmpv4: */ if (iph_hdr_length < sizeof (ipha_t) || ipha->ipha_protocol != IPPROTO_TCP || - (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { - goto noticmpv4; - } - - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. - */ - if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { - if (!tcp_check_policy(tcp, first_mp, - ipha, NULL, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. 
- */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - - seg_seq = ABE32_TO_U32(tcph->th_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * The ICMP message is bogus, just drop it. But if this is - * an ICMP too big message, IP has already changed - * the ire_max_frag to the bogus value. We need to change - * it back. - */ - if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && - icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; - int flag; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - flag = tcp->tcp_ipha-> - ipha_fragment_offset_and_flags; - } else { - flag = 0; - } - mutex_enter(&connp->conn_lock); - if ((ire = connp->conn_ire_cache) != NULL) { - mutex_enter(&ire->ire_lock); - mutex_exit(&connp->conn_lock); - ire->ire_max_frag = tcp->tcp_if_mtu; - ire->ire_frag_flag |= flag; - mutex_exit(&ire->ire_lock); - } else { - mutex_exit(&connp->conn_lock); - } - } + (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { goto noticmpv4; } + seg_seq = ntohl(tcpha->tha_seq); switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). 
Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu || - tcp->tcp_ipha->ipha_fragment_offset_and_flags == 0) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. Note that new_mss may be negative - * if tcp_ipsec_overhead is large and the - * icmph_du_mtu is the minimum value, which is 68. - */ - new_mss = ntohs(icmph->icmph_du_mtu) - - tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; - - DTRACE_PROBE2(tcp__pmtu__change, tcp_t *, tcp, int, - new_mss); - - /* - * Only update the MSS if the new one is - * smaller than the previous one. This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - /* - * Note that we are using the template header's DF - * bit in the fast path sending. So we need to compare - * the new mss with both tcps_mss_min and ip_pmtu_min. - * And stop doing IPv4 PMTUd if new_mss is less than - * MAX(tcps_mss_min, ip_pmtu_min). - */ - if (new_mss < tcps->tcps_mss_min || - new_mss < ipst->ips_ip_pmtu_min) { - tcp->tcp_ipha->ipha_fragment_offset_and_flags = - 0; - } - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. 
- */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -8451,7 +6847,6 @@ noticmpv4: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 7); } @@ -8483,67 +6878,191 @@ noticmpv4: break; } } - freemsg(first_mp); + freemsg(mp); } /* - * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 - * error messages passed up by IP. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. + * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might + * change. But it can refer to fields like tcp_suna and tcp_snxt. + * + * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP + * error messages received by IP. The message is always received on the correct + * tcp_t. + */ +/* ARGSUSED */ +static boolean_t +tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + tcpha_t *tcpha = (tcpha_t *)arg2; + uint32_t seq = ntohl(tcpha->tha_seq); + tcp_t *tcp = connp->conn_tcp; + + /* + * TCP sequence number contained in payload of the ICMP error message + * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, + * the message is either a stale ICMP error, or an attack from the + * network. Fail the verification. 
+ */ + if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) + return (B_FALSE); + + /* For "too big" we also check the ignore flag */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(icmph != NULL); + if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && + icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } else { + ASSERT(icmp6 != NULL); + if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Update the TCP connection according to change of PMTU. + * + * Path MTU might have changed by either increase or decrease, so need to + * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny + * or negative MSS, since tcp_mss_set() will do it. */ static void -tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) +tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { - icmp6_t *icmp6; - ip6_t *ip6h; - uint16_t iph_hdr_length; - tcpha_t *tcpha; - uint8_t *nexthdrp; - uint32_t new_mss; - uint32_t ratio; - boolean_t secure; - mblk_t *first_mp = mp; - size_t mp_size; - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; + uint32_t pmtu; + int32_t mss; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + iaflags_t ixaflags; + + if (tcp->tcp_tcps->tcps_ignore_path_mtu) + return; + + if (tcp->tcp_state < TCPS_ESTABLISHED) + return; /* - * The caller has determined if this is an IPSEC_IN packet and - * set ipsec_mctl appropriately (see tcp_icmp_error). + * Always call ip_get_pmtu() to make sure that IP has updated + * ixa_flags properly. */ - if (ipsec_mctl) - mp = mp->b_cont; + pmtu = ip_get_pmtu(ixa); + ixaflags = ixa->ixa_flags; - mp_size = MBLKL(mp); + /* + * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and + * IPsec overhead if applied. Make sure to use the most recent + * IPsec information. 
+ */ + mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp); /* - * Verify that we have a complete IP header. If not, send it upstream. + * Nothing to change, so just return. */ - if (mp_size < sizeof (ip6_t)) { -noticmpv6: - freemsg(first_mp); + if (mss == tcp->tcp_mss) return; - } /* - * Verify this is an ICMPV6 packet, else send it upstream. + * Currently, for ICMP errors, only PMTU decrease is handled. */ - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { - iph_hdr_length = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, - &nexthdrp) || - *nexthdrp != IPPROTO_ICMPV6) { - goto noticmpv6; + if (mss > tcp->tcp_mss && decrease_only) + return; + + DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); + + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * Adjust MSS and all relevant variables. + */ + tcp_mss_set(tcp, mss); + + /* + * If the PMTU is below the min size maintained by IP, then ip_get_pmtu + * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP + * has a (potentially different) min size we do the same. Make sure to + * clear IXAF_DONTFRAG, which is used by IP to decide whether to + * fragment the packet. + * + * LSO over IPv6 can not be fragmented. So need to disable LSO + * when IPv6 fragmentation is needed. + */ + if (mss < tcp->tcp_tcps->tcps_mss_min) + ixaflags |= IXAF_PMTU_TOO_SMALL; + + if (ixaflags & IXAF_PMTU_TOO_SMALL) + ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + + if ((connp->conn_ipversion == IPV4_VERSION) && + !(ixaflags & IXAF_PMTU_IPV4_DF)) { + tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } + ixa->ixa_flags = ixaflags; +} + +/* + * Do slow start retransmission after ICMP errors of PMTU changes. + */ +static void +tcp_rexmit_after_error(tcp_t *tcp) +{ + /* + * All sent data has been acknowledged or no data left to send, just + * to return. 
+ */ + if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || + (tcp->tcp_xmit_head == NULL)) + return; + + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) + tcp->tcp_rexmit_max = tcp->tcp_fss; + else + tcp->tcp_rexmit_max = tcp->tcp_snxt; + + tcp->tcp_rexmit_nxt = tcp->tcp_suna; + tcp->tcp_rexmit = B_TRUE; + tcp->tcp_dupack_cnt = 0; + tcp->tcp_snd_burst = TCP_CWND_SS; + tcp_ss_rexmit(tcp); +} + +/* + * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 + * error messages passed up by IP. + * Assumes that IP has pulled up all the extension headers as well + * as the ICMPv6 header. + */ +static void +tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) +{ + icmp6_t *icmp6; + ip6_t *ip6h; + uint16_t iph_hdr_length = ira->ira_ip_hdr_length; + tcpha_t *tcpha; + uint8_t *nexthdrp; + uint32_t seg_seq; + + /* + * Verify that we have a complete IP header. + */ + ASSERT((MBLKL(mp) >= sizeof (ip6_t))); + icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; ip6h = (ip6_t *)&icmp6[1]; /* * Verify if we have a complete ICMP and inner IP header. */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto noticmpv6; + if ((uchar_t *)&ip6h[1] > mp->b_wptr) { +noticmpv6: + freemsg(mp); + return; + } if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) goto noticmpv6; @@ -8558,130 +7077,15 @@ noticmpv6: goto noticmpv6; } - /* - * ICMP errors come on the right queue or come on - * listener/global queue for detached connections and - * get switched to the right queue. If it comes on the - * right queue, policy check has already been done by IP - * and thus free the first_mp without verifying the policy. - * If it has come for a non-hard bound connection, we need - * to verify policy as IP may not have done it. - */ - if (!tcp->tcp_hard_bound) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. 
- */ - if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { - if (!tcp_check_policy(tcp, first_mp, - NULL, ip6h, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. - */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - seg_seq = ntohl(tcpha->tha_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * If the ICMP message is bogus, should we kill the - * connection, or should we just drop the bogus ICMP - * message? It would probably make more sense to just - * drop the message so that if this one managed to get - * in, the real connection should not suffer. - */ - goto noticmpv6; - } - switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. - */ - new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - - tcp->tcp_ipsec_overhead; - - /* - * Only update the MSS if the new one is - * smaller than the previous one. 
This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. - */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; - case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOPORT: @@ -8692,7 +7096,6 @@ noticmpv6: ECONNREFUSED, 8); } break; - case ICMP6_DST_UNREACH_ADMIN: case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_BEYONDSCOPE: @@ -8708,7 +7111,6 @@ noticmpv6: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 9); } @@ -8720,7 +7122,6 @@ noticmpv6: break; } break; - case ICMP6_PARAM_PROB: /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && @@ -8739,83 +7140,42 @@ noticmpv6: default: break; } - freemsg(first_mp); + freemsg(mp); } /* * Notify IP that we are having trouble with this connection. IP should - * blow the IRE away and start over. + * make note so it can potentially use a different IRE. 
*/ static void tcp_ip_notify(tcp_t *tcp) { - struct iocblk *iocp; - ipid_t *ipid; - mblk_t *mp; - - /* IPv6 has NUD thus notification to delete the IRE is not needed */ - if (tcp->tcp_ipversion == IPV6_VERSION) - return; - - mp = mkiocb(IP_IOCTL); - if (mp == NULL) - return; - - iocp = (struct iocblk *)mp->b_rptr; - iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); - - mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); - if (!mp->b_cont) { - freeb(mp); - return; - } + conn_t *connp = tcp->tcp_connp; + ire_t *ire; - ipid = (ipid_t *)mp->b_cont->b_rptr; - mp->b_cont->b_wptr += iocp->ioc_count; - bzero(ipid, sizeof (*ipid)); - ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; - ipid->ipid_ire_type = IRE_CACHE; - ipid->ipid_addr_offset = sizeof (ipid_t); - ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); /* * Note: in the case of source routing we want to blow away the * route to the first source route hop. */ - bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], - sizeof (tcp->tcp_ipha->ipha_dst)); - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* Unlink and return any mblk that looks like it contains an ire */ -static mblk_t * -tcp_ire_mp(mblk_t **mpp) -{ - mblk_t *mp = *mpp; - mblk_t *prev_mp = NULL; - - for (;;) { - switch (DB_TYPE(mp)) { - case IRE_DB_TYPE: - case IRE_DB_REQ_TYPE: - if (mp == *mpp) { - *mpp = mp->b_cont; - } else { - prev_mp->b_cont = mp->b_cont; - } - mp->b_cont = NULL; - return (mp); - default: - break; + ire = connp->conn_ixa->ixa_ire; + if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + if (ire->ire_ipversion == IPV4_VERSION) { + /* + * As per RFC 1122, we send an RTM_LOSING to inform + * routing protocols. 
+ */ + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + connp->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); } - prev_mp = mp; - mp = mp->b_cont; - if (mp == NULL) - break; + (void) ire_no_good(ire); } - return (mp); } +#pragma inline(tcp_send_data) + /* * Timer callback routine for keepalive probe. We do a fake resend of * last ACKed byte. Then set a timer using RTO. When the timer expires, @@ -8890,7 +7250,7 @@ tcp_keepalive_killer(void *arg) * timer back. */ if (mp != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveProbe); if (tcp->tcp_ka_last_intrvl != 0) { @@ -8930,17 +7290,17 @@ tcp_keepalive_killer(void *arg) int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) { - queue_t *q = tcp->tcp_rq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_rq; int32_t mss = tcp->tcp_mss; int maxpsz; - conn_t *connp = tcp->tcp_connp; if (TCP_IS_DETACHED(tcp)) return (mss); if (tcp->tcp_fused) { maxpsz = tcp_fuse_maxpsz(tcp); mss = INFPSZ; - } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) { + } else if (tcp->tcp_maxpsz_multiplier == 0) { /* * Set the sd_qn_maxpsz according to the socket send buffer * size, and sd_maxblk to INFPSZ (-1). This will essentially @@ -8948,7 +7308,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * kernel-allocated buffers without breaking it up into smaller * chunks. We round up the buffer size to the nearest SMSS. */ - maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); + maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss); if (tcp->tcp_kssl_ctx == NULL) mss = INFPSZ; else @@ -8960,21 +7320,17 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * head to break down larger than SMSS writes into SMSS- * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 
*/ - /* XXX tune this with ndd tcp_maxpsz_multiplier */ - maxpsz = tcp->tcp_maxpsz * mss; - if (maxpsz > tcp->tcp_xmit_hiwater/2) { - maxpsz = tcp->tcp_xmit_hiwater/2; + maxpsz = tcp->tcp_maxpsz_multiplier * mss; + if (maxpsz > connp->conn_sndbuf / 2) { + maxpsz = connp->conn_sndbuf / 2; /* Round up to nearest mss */ maxpsz = MSS_ROUNDUP(maxpsz, mss); } } (void) proto_set_maxpsz(q, connp, maxpsz); - if (!(IPCL_IS_NONSTR(connp))) { - /* XXX do it in set_maxpsz()? */ - tcp->tcp_wq->q_maxpsz = maxpsz; - } - + if (!(IPCL_IS_NONSTR(connp))) + connp->conn_wq->q_maxpsz = maxpsz; if (set_maxblk) (void) proto_set_tx_maxblk(q, connp, mss); return (mss); @@ -8985,18 +7341,18 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * tcpopt struct and return a bitmask saying which options were found. */ static int -tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) +tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) { uchar_t *endp; int len; uint32_t mss; - uchar_t *up = (uchar_t *)tcph; + uchar_t *up = (uchar_t *)tcpha; int found = 0; int32_t sack_len; tcp_seq sack_begin, sack_end; tcp_t *tcp; - endp = up + TCP_HDR_LENGTH(tcph); + endp = up + TCP_HDR_LENGTH(tcpha); up += TCP_MIN_HEADER_LENGTH; while (up < endp) { len = endp - up; @@ -9135,28 +7491,20 @@ tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) } /* - * Set the mss associated with a particular tcp based on its current value, - * and a new one passed in. Observe minimums and maximums, and reset - * other state variables that we want to view as multiples of mss. - * - * This function is called mainly because values like tcp_mss, tcp_cwnd, - * highwater marks etc. need to be initialized or adjusted. - * 1) From tcp_process_options() when the other side's SYN/SYN-ACK - * packet arrives. - * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or - * ICMP6_PACKET_TOO_BIG arrives. - * 3) From tcp_paws_check() if the other side stops sending the timestamp, - * to increase the MSS to use the extra bytes available. 
+ * Set the MSS associated with a particular tcp based on its current value, + * and a new one passed in. Observe minimums and maximums, and reset other + * state variables that we want to view as multiples of MSS. * - * Callers except tcp_paws_check() ensure that they only reduce mss. + * The value of MSS could be either increased or descreased. */ static void -tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) +tcp_mss_set(tcp_t *tcp, uint32_t mss) { uint32_t mss_max; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -9176,34 +7524,22 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) * TCP should be able to buffer at least 4 MSS data for obvious * performance reason. */ - if ((mss << 2) > tcp->tcp_xmit_hiwater) - tcp->tcp_xmit_hiwater = mss << 2; + if ((mss << 2) > connp->conn_sndbuf) + connp->conn_sndbuf = mss << 2; /* - * Set the xmit_lowater to at least twice of MSS. + * Set the send lowater to at least twice of MSS. */ - if ((mss << 1) > tcp->tcp_xmit_lowater) - tcp->tcp_xmit_lowater = mss << 1; + if ((mss << 1) > connp->conn_sndlowat) + connp->conn_sndlowat = mss << 1; + + /* + * Update tcp_cwnd according to the new value of MSS. Keep the + * previous ratio to preserve the transmit rate. + */ + tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; + tcp->tcp_cwnd_cnt = 0; - if (do_ss) { - /* - * Either the tcp_cwnd is as yet uninitialized, or mss is - * changing due to a reduction in MTU, presumably as a - * result of a new path component, reset cwnd to its - * "initial" value, as a multiple of the new mss. - */ - SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial); - } else { - /* - * Called by tcp_paws_check(), the mss increased - * marginally to allow use of space previously taken - * by the timestamp option. 
It would be inappropriate - * to apply slow start or tcp_init_cwnd values to - * tcp_cwnd, simply adjust to a multiple of the new mss. - */ - tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; - tcp->tcp_cwnd_cnt = 0; - } tcp->tcp_mss = mss; (void) tcp_maxpsz_set(tcp, B_TRUE); } @@ -9223,12 +7559,11 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) } static conn_t * -tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, - boolean_t issocket, int *errorp) +tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, + int *errorp) { tcp_t *tcp = NULL; conn_t *connp; - int err; zoneid_t zoneid; tcp_stack_t *tcps; squeue_t *sqp; @@ -9265,15 +7600,6 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, else zoneid = crgetzoneid(credp); } - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (tcps->tcps_g_q == NULL && - tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - tcp_g_q_setup(tcps); - } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); connp = (conn_t *)tcp_get_conn(sqp, tcps); @@ -9286,41 +7612,50 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, *errorp = ENOSR; return (NULL); } + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); + connp->conn_sqp = sqp; connp->conn_initial_sqp = connp->conn_sqp; + connp->conn_ixa->ixa_sqp = connp->conn_sqp; tcp = connp->conn_tcp; + /* + * Besides asking IP to set the checksum for us, have conn_ip_output + * to do the following checks when necessary: + * + * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid + * IXAF_VERIFY_PMTU: verify PMTU changes + * IXAF_VERIFY_LSO: verify LSO capability changes + */ + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO; + + if (!tcps->tcps_dev_flow_ctl) + connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + if (isv6) { - connp->conn_flags |= IPCL_TCP6; - connp->conn_send = 
ip_output_v6; - connp->conn_af_isv6 = B_TRUE; - connp->conn_pkt_isv6 = B_TRUE; - connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_family = AF_INET6; + connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_family = AF_INET6; tcp->tcp_mss = tcps->tcps_mss_def_ipv6; + connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit; } else { - connp->conn_flags |= IPCL_TCP4; - connp->conn_send = ip_output; - connp->conn_af_isv6 = B_FALSE; - connp->conn_pkt_isv6 = B_FALSE; - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_family = AF_INET; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; + connp->conn_default_ttl = tcps->tcps_ipv4_ttl; } + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + crhold(credp); + connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; - /* - * TCP keeps a copy of cred for cache locality reasons but - * we put a reference only once. If connp->conn_cred - * becomes invalid, tcp_cred should also be set to NULL. 
- */ - tcp->tcp_cred = connp->conn_cred = credp; - crhold(connp->conn_cred); - tcp->tcp_cpid = curproc->p_pid; - tcp->tcp_open_time = lbolt64; connp->conn_zoneid = zoneid; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); ASSERT(connp->conn_netstack == tcps->tcps_netstack); ASSERT(tcp->tcp_tcps == tcps); @@ -9331,38 +7666,22 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_dev = NULL; + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); + if (issocket) { - connp->conn_flags |= IPCL_SOCKET; tcp->tcp_issocket = 1; } - /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - - if (q == NULL) { - /* - * Create a helper stream for non-STREAMS socket. - */ - err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); - if (err != 0) { - ip1dbg(("tcp_create_common: create of IP helper stream " - "failed\n")); - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - q = connp->conn_rq; - } + connp->conn_rcvbuf = tcps->tcps_recv_hiwat; + connp->conn_sndbuf = tcps->tcps_xmit_hiwat; + connp->conn_sndlowat = tcps->tcps_xmit_lowat; + connp->conn_so_type = SOCK_STREAM; + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; SOCK_CONNID_INIT(tcp->tcp_connid); - err = tcp_init(tcp, q); - if (err != 0) { - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - + tcp->tcp_state = TCPS_IDLE; + tcp_init_values(tcp); return (connp); } @@ -9415,7 +7734,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, q->q_qinfo = &tcp_acceptor_rinit; /* * the conn_dev and minor_arena will be subsequently used by - * tcp_wput_accept() and tcp_tpi_close_accept() to figure out + * tcp_tli_accept() and tcp_tpi_close_accept() to figure out * the 
minor device number for this connection from the q_ptr. */ RD(q)->q_ptr = (void *)conn_dev; @@ -9426,7 +7745,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } issocket = flag & SO_SOCKSTR; - connp = tcp_create_common(q, credp, isv6, issocket, &err); + connp = tcp_create_common(credp, isv6, issocket, &err); if (connp == NULL) { inet_minor_free(minor_arena, conn_dev); @@ -9434,6 +7753,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (err); } + connp->conn_rq = q; + connp->conn_wq = WR(q); q->q_ptr = WR(q)->q_ptr = connp; connp->conn_dev = conn_dev; @@ -9500,7 +7821,7 @@ tcp_allow_connopt_set(int level, int name) } /* - * this routine gets default values of certain options whose default + * This routine gets default values of certain options whose default * values are maintained by protocol specific code */ /* ARGSUSED */ @@ -9553,321 +7874,102 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) return (sizeof (int)); } +/* + * TCP routine to get the values of options. + */ static int tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { int *i1 = (int *)ptr; tcp_t *tcp = connp->conn_tcp; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; + conn_opt_arg_t coas; + int retval; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)ptr; - - lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; - lgr->l_linger = tcp->tcp_lingertime; - } - return (sizeof (struct linger)); - case SO_DEBUG: - *i1 = tcp->tcp_debug ? SO_DEBUG : 0; - break; - case SO_KEEPALIVE: - *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; - break; - case SO_DONTROUTE: - *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; - break; - case SO_USELOOPBACK: - *i1 = tcp->tcp_useloopback ? 
SO_USELOOPBACK : 0; - break; - case SO_BROADCAST: - *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; - break; - case SO_REUSEADDR: - *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; - break; - case SO_OOBINLINE: - *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; - break; - case SO_DGRAM_ERRIND: - *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; - break; - case SO_TYPE: - *i1 = SOCK_STREAM; - break; - case SO_SNDBUF: - *i1 = tcp->tcp_xmit_hiwater; - break; - case SO_RCVBUF: - *i1 = tcp->tcp_recv_hiwater; - break; case SO_SND_COPYAVOID: *i1 = tcp->tcp_snd_zcopy_on ? SO_SND_COPYAVOID : 0; - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones ? 1 : 0; - break; - case SO_ANON_MLP: - *i1 = connp->conn_anon_mlp; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_EXCLBIND: - *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_TCP; - break; - case SO_DOMAIN: - *i1 = tcp->tcp_family; - break; + return (sizeof (int)); case SO_ACCEPTCONN: *i1 = (tcp->tcp_state == TCPS_LISTEN); - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_TCP: switch (name) { case TCP_NODELAY: *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; - break; + return (sizeof (int)); case TCP_MAXSEG: *i1 = tcp->tcp_mss; - break; + return (sizeof (int)); case TCP_NOTIFY_THRESHOLD: *i1 = (int)tcp->tcp_first_timer_threshold; - break; + return (sizeof (int)); case TCP_ABORT_THRESHOLD: *i1 = tcp->tcp_second_timer_threshold; - break; + return (sizeof (int)); case TCP_CONN_NOTIFY_THRESHOLD: *i1 = tcp->tcp_first_ctimer_threshold; - break; + return (sizeof (int)); case TCP_CONN_ABORT_THRESHOLD: *i1 = tcp->tcp_second_ctimer_threshold; - break; - case TCP_RECVDSTADDR: - *i1 = tcp->tcp_recvdstaddr; - break; - case TCP_ANONPRIVBIND: - *i1 = tcp->tcp_anon_priv_bind; - break; - case TCP_EXCLBIND: - *i1 = tcp->tcp_exclbind ? 
TCP_EXCLBIND : 0; - break; + return (sizeof (int)); case TCP_INIT_CWND: *i1 = tcp->tcp_init_cwnd; - break; + return (sizeof (int)); case TCP_KEEPALIVE_THRESHOLD: *i1 = tcp->tcp_ka_interval; - break; + return (sizeof (int)); case TCP_KEEPALIVE_ABORT_THRESHOLD: *i1 = tcp->tcp_ka_abort_thres; - break; + return (sizeof (int)); case TCP_CORK: *i1 = tcp->tcp_cork; - break; - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) + if (connp->conn_family != AF_INET) return (-1); switch (name) { case IP_OPTIONS: - case T_IP_OPTIONS: { - /* - * This is compatible with BSD in that in only return - * the reverse source route with the final destination - * as the last entry. The first 4 bytes of the option - * will contain the final destination. - */ - int opt_len; - - opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; - opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; - ASSERT(opt_len >= 0); + case T_IP_OPTIONS: /* Caller ensures enough space */ - if (opt_len > 0) { - /* - * TODO: Do we have to handle getsockopt on an - * initiator as well? - */ - return (ip_opt_get_user(tcp->tcp_ipha, ptr)); - } - return (0); - } - case IP_TOS: - case T_IP_TOS: - *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; - break; - case IP_TTL: - *i1 = (int)tcp->tcp_ipha->ipha_ttl; - break; - case IP_NEXTHOP: - /* Handled at IP level */ - return (-EINVAL); + return (ip_opt_get_user(connp, ptr)); default: - return (-1); + break; } break; + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { return (-1); } switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; - break; /* goto sizeof (int) option return */ - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = tcp->tcp_bound_if; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVHOPLIMIT) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case _OLD_IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_OLD_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVRTDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); - } - case IPV6_TCLASS: - if (ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; /* goto sizeof (int) option return */ - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) - return (0); - bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, - ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); - if (tcp->tcp_label_len > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - tcp->tcp_label_len + 7) / 8 - 1; - } - return (ipp->ipp_hopoptslen - tcp->tcp_label_len); - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); - case IPV6_SRC_PREFERENCES: - return (ip6_get_src_preferences(connp, - (uint32_t *)ptr)); - case IPV6_PATHMTU: { - struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; - + case IPV6_PATHMTU: if (tcp->tcp_state < TCPS_ESTABLISHED) return (-1); - - return (ip_fill_mtuinfo(&connp->conn_remv6, - 
connp->conn_fport, mtuinfo, - connp->conn_netstack)); - } - default: - return (-1); + break; } break; - default: - return (-1); } - return (sizeof (int)); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* @@ -9896,7 +7998,6 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) { @@ -9909,30 +8010,28 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = squeue_synch_enter(sqp, connp, NULL); if (error == ENOMEM) { + kmem_free(optvalp_buf, max_optbuf_len); return (ENOMEM); } len = tcp_opt_get(connp, level, option_name, optvalp_buf); squeue_synch_exit(sqp, connp); - if (len < 0) { - /* - * Pass on to IP - */ + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, - optvalp, optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } /* @@ -9943,7 +8042,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, int tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { tcp_t *tcp = connp->conn_tcp; int *i1 = (int *)invalp; @@ -9951,6 
+8050,13 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, boolean_t checkonly; int reterr; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -10016,37 +8122,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)invalp; - - if (!checkonly) { - if (lgr->l_onoff) { - tcp->tcp_linger = 1; - tcp->tcp_lingertime = lgr->l_linger; - } else { - tcp->tcp_linger = 0; - tcp->tcp_lingertime = 0; - } - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } else { - if (!lgr->l_onoff) { - ((struct linger *) - outvalp)->l_onoff = 0; - ((struct linger *) - outvalp)->l_linger = 0; - } else { - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } - } - *outlenp = sizeof (struct linger); - return (0); - } - case SO_DEBUG: - if (!checkonly) - tcp->tcp_debug = onoff; - break; case SO_KEEPALIVE: if (checkonly) { /* check only case */ @@ -10054,65 +8129,25 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } if (!onoff) { - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { if (tcp->tcp_ka_tid != 0) { (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_tid = 0; } - tcp->tcp_ka_enabled = 0; + connp->conn_keepalive = 0; } break; } - if (!tcp->tcp_ka_enabled) { + if (!connp->conn_keepalive) { /* Crank up the keepalive timer */ tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); - tcp->tcp_ka_enabled = 1; - } - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are - * only of interest to IP. We track them here only so - * that we can report their current value. 
- */ - if (!checkonly) { - tcp->tcp_dontroute = onoff; - tcp->tcp_connp->conn_dontroute = onoff; + connp->conn_keepalive = 1; } break; - case SO_USELOOPBACK: - if (!checkonly) { - tcp->tcp_useloopback = onoff; - tcp->tcp_connp->conn_loopback = onoff; - } - break; - case SO_BROADCAST: - if (!checkonly) { - tcp->tcp_broadcast = onoff; - tcp->tcp_connp->conn_broadcast = onoff; - } - break; - case SO_REUSEADDR: - if (!checkonly) { - tcp->tcp_reuseaddr = onoff; - tcp->tcp_connp->conn_reuseaddr = onoff; - } - break; - case SO_OOBINLINE: - if (!checkonly) { - tcp->tcp_oobinline = onoff; - if (IPCL_IS_NONSTR(tcp->tcp_connp)) - proto_set_rx_oob_opt(connp, onoff); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - tcp->tcp_dgram_errind = onoff; - break; case SO_SNDBUF: { if (*i1 > tcps->tcps_max_buf) { *outlenp = 0; @@ -10121,11 +8156,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (checkonly) break; - tcp->tcp_xmit_hiwater = *i1; - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = - tcp->tcp_xmit_hiwater / + connp->conn_sndbuf = *i1; + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / tcps->tcps_snd_lowat_fraction; + } (void) tcp_maxpsz_set(tcp, B_TRUE); /* * If we are flow-controlled, recheck the condition. @@ -10135,11 +8170,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { + TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); - break; + *outlenp = inlen; + return (0); } case SO_RCVBUF: if (*i1 > tcps->tcps_max_buf) { @@ -10155,43 +8191,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * XXX should we return the rwnd here * and tcp_opt_get ? 
*/ - break; + *outlenp = inlen; + return (0); case SO_SND_COPYAVOID: if (!checkonly) { - /* we only allow enable at most once for now */ if (tcp->tcp_loopback || (tcp->tcp_kssl_ctx != NULL) || - (!tcp->tcp_snd_zcopy_aware && - (onoff != 1 || !tcp_zcopy_check(tcp)))) { + (onoff != 1) || !tcp_zcopy_check(tcp)) { *outlenp = 0; return (EOPNOTSUPP); } tcp->tcp_snd_zcopy_aware = 1; } - break; - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. - */ + *outlenp = inlen; return (0); - case SO_ALLZONES: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_ANON_MLP: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_MAC_EXEMPT: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; - default: - *outlenp = 0; - return (EINVAL); } break; case IPPROTO_TCP: @@ -10217,25 +8230,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_second_ctimer_threshold = *i1; break; case TCP_RECVDSTADDR: - if (tcp->tcp_state > TCPS_LISTEN) - return (EOPNOTSUPP); - if (!checkonly) - tcp->tcp_recvdstaddr = onoff; - break; - case TCP_ANONPRIVBIND: - if ((reterr = secpolicy_net_privaddr(cr, 0, - IPPROTO_TCP)) != 0) { + if (tcp->tcp_state > TCPS_LISTEN) { *outlenp = 0; - return (reterr); - } - if (!checkonly) { - tcp->tcp_anon_priv_bind = onoff; + return (EOPNOTSUPP); } + /* Setting done in conn_opt_set */ break; - case TCP_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; /* goto sizeof (int) option return */ case TCP_INIT_CWND: { uint32_t init_cwnd = *((uint32_t *)invalp); @@ -10278,7 +8278,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * keepalive timer. 
*/ if (tcp->tcp_ka_tid != 0) { - ASSERT(tcp->tcp_ka_enabled); + ASSERT(connp->conn_keepalive); (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_last_intrvl = 0; @@ -10318,49 +8318,15 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; default: - *outlenp = 0; - return (EINVAL); + break; } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) { + if (connp->conn_family != AF_INET) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - reterr = tcp_opt_set_header(tcp, checkonly, - invalp, inlen); - if (reterr) { - *outlenp = 0; - return (reterr); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - tcp->tcp_ipha->ipha_type_of_service = - (uchar_t)*i1; - tcp->tcp_tos = (uchar_t)*i1; - } - break; - case IP_TTL: - if (!checkonly) { - tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; - tcp->tcp_ttl = (uchar_t)*i1; - } - break; - case IP_BOUND_IF: - case IP_NEXTHOP: - /* Handled at the IP level */ - return (-EINVAL); case IP_SEC_OPT: /* * We should not allow policy setting after @@ -10368,166 +8334,42 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); } - default: - *outlenp = 0; - return (EINVAL); + break; } break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } - /* - * Only sticky options; no ancillary data - */ - ipp = &tcp->tcp_sticky_ipp; switch (name) { - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)tcps->tcps_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = tcp->tcp_ip6h->ip6_hops; - } else { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - } - break; - case IPV6_BOUND_IF: - if (!checkonly) { - tcp->tcp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ case IPV6_RECVPKTINFO: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVPKTINFO; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVPKTINFO; /* Force it to be sent up with the next msg */ tcp->tcp_recvifindex = 0; - PASS_OPT_TO_IP(connp); } break; case IPV6_RECVTCLASS: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVTCLASS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVTCLASS; - PASS_OPT_TO_IP(connp); + /* Force it to be sent up with the next msg */ + tcp->tcp_recvtclass = 0xffffffffU; } break; case IPV6_RECVHOPLIMIT: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPLIMIT; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPLIMIT; /* Force it to be sent up with the next msg */ tcp->tcp_recvhops = 0xffffffffU; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPOPTS; - else 
- tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVDSTOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_OLD_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_OLD_IPV6_RECVDSTOPTS; - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTHDR; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTHDR; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTDSTOPTS; - PASS_OPT_TO_IP(connp); } break; case IPV6_PKTINFO: - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - } else { + /* This is an extra check for TCP */ + if (inlen == sizeof (struct in6_pktinfo)) { struct in6_pktinfo *pkti; pkti = (struct in6_pktinfo *)invalp; @@ -10539,219 +8381,8 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) return (EINVAL); - /* - * IP will validate the source address and - * interface index. 
- */ - if (IPCL_IS_NONSTR(tcp->tcp_connp)) { - reterr = ip_set_options(tcp->tcp_connp, - level, name, invalp, inlen, cr); - } else { - reterr = ip6_set_pktinfo(cr, - tcp->tcp_connp, pkti); - } - if (reterr != 0) - return (reterr); - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_TCLASS: - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) { - ipp->ipp_tclass = 0; - *i1 = 0; - } else { - ipp->ipp_tclass = *i1; - } - ipp->ipp_fields |= IPPF_TCLASS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. - */ - if (inlen != 0 && inlen != sizeof (sin6_t)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) - return (EAFNOSUPPORT); - if (IN6_IS_ADDR_V4MAPPED( - &sin6->sin6_addr)) - return (EADDRNOTAVAIL); - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - PASS_OPT_TO_IP(connp); - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, tcp->tcp_label_len); - if (reterr != 0) - return (reterr); - if (ipp->ipp_hopoptslen == 0) - ipp->ipp_fields &= ~IPPF_HOPOPTS; - else - ipp->ipp_fields |= IPPF_HOPOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rtdstoptslen == 0) - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - else - ipp->ipp_fields |= IPPF_RTDSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_dstoptslen == 0) - ipp->ipp_fields &= ~IPPF_DSTOPTS; - else - ipp->ipp_fields |= IPPF_DSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rthdrlen == 0) - ipp->ipp_fields &= ~IPPF_RTHDR; - else - ipp->ipp_fields |= IPPF_RTHDR; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_V6ONLY: - if (!checkonly) { - tcp->tcp_connp->conn_ipv6_v6only = onoff; } break; - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -10759,30 +8390,18 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); - } - case IPV6_SRC_PREFERENCES: - if (inlen != sizeof (uint32_t)) - return (EINVAL); - reterr = ip6_set_src_preferences(tcp->tcp_connp, - *(uint32_t *)invalp); - if (reterr != 0) { - *outlenp = 0; - return (reterr); } break; - default: - *outlenp = 0; - return (EINVAL); } break; - } /* end IPPROTO_IPV6 */ - default: + } + reterr = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (reterr != 0) { *outlenp = 0; - return (EINVAL); + return (reterr); } + /* * Common case of OK return with outval same as inval */ @@ -10791,6 +8410,45 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, (void) bcopy(invalp, outvalp, inlen); } *outlenp = inlen; + + if (coas.coa_changed & COA_HEADER_CHANGED) { + reterr = tcp_build_hdrs(tcp); + if (reterr != 0) + return (reterr); + } + if (coas.coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t nexthop; + + /* + * If we are connected we re-cache the information. 
+ * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + (void) ip_attr_connect(connp, connp->conn_ixa, + &connp->conn_laddr_v6, &connp->conn_faddr_v6, + &nexthop, connp->conn_fport, NULL, NULL, + IPDF_VERIFY_DST); + } + } + if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); + } + if (coas.coa_changed & COA_OOBINLINE_CHANGED) { + if (IPCL_IS_NONSTR(connp)) + proto_set_rx_oob_opt(connp, onoff); + } return (0); } @@ -10798,12 +8456,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { conn_t *connp = Q_TO_CONN(q); return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr, mblk)); + outlenp, outvalp, thisdg_attrs, cr)); } int @@ -10843,7 +8501,6 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -10856,292 +8513,75 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, 
(uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, - NULL, cr, NULL); + NULL, cr); squeue_synch_exit(sqp, connp); - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } + ASSERT(error >= 0); + return (error); } /* - * Update tcp_sticky_hdrs based on tcp_sticky_ipp. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension + * Build/update the tcp header template (in conn_ht_iphc) based on + * conn_xmit_ipp. The headers include ip6_t, any extension * headers, and the maximum size tcp header (to avoid reallocation * on the fly for additional tcp options). + * + * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. * Returns failure if can't allocate memory. */ static int tcp_build_hdrs(tcp_t *tcp) { - char *hdrs; - uint_t hdrs_len; - ip6i_t *ip6i; - char buf[TCP_MAX_HDR_LENGTH]; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - in6_addr_t src, dst; tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; + tcpha_t *tcpha; + uint32_t cksum; + int error; - /* - * save the existing tcp header and source/dest IP addresses - */ - bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); - src = tcp->tcp_ip6h->ip6_src; - dst = tcp->tcp_ip6h->ip6_dst; - hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; - ASSERT(hdrs_len != 0); - if (hdrs_len > tcp->tcp_iphc_len) { - /* Need to reallocate */ - hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); - if (tcp->tcp_iphc != NULL) { - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc_len = 0; - } - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc = hdrs; - tcp->tcp_iphc_len = hdrs_len; - tcp->tcp_hdr_grown = B_TRUE; - } - ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, - hdrs_len - TCP_MAX_HDR_LENGTH, ipp, 
IPPROTO_TCP); + /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */ + mutex_enter(&connp->conn_lock); + error = conn_build_hdr_template(connp, TCP_MIN_HEADER_LENGTH, + TCP_MAX_TCP_OPTIONS_LENGTH, &connp->conn_laddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; - } else { - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - } /* - * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. - * - * tcp->tcp_tcp_hdr_len doesn't change here. + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum for later use. */ - tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); - tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; + tcpha = (tcpha_t *)connp->conn_ht_ulp; + tcp->tcp_tcpha = tcpha; - bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); - - tcp->tcp_ip6h->ip6_src = src; - tcp->tcp_ip6h->ip6_dst = dst; + tcpha->tha_lport = connp->conn_lport; + tcpha->tha_fport = connp->conn_fport; + tcpha->tha_sum = 0; + tcpha->tha_offset_and_reserved = (5 << 4); /* - * If the hop limit was not set by ip_build_hdrs_v6(), set it to - * the default value for TCP. - */ - if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit; - - /* - * If we're setting extension headers after a connection - * has been established, and if we have a routing header - * among the extension headers, call ip_massage_options_v6 to - * manipulate the routing header/ip6_dst set the checksum - * difference in the tcp header template. - * (This happens in tcp_connect_ipv6 if the routing header - * is set prior to the connect.) - * Set the tcp_sum to zero first in case we've cleared a - * routing header or don't have one at all. 
+ * IP wants our header length in the checksum field to + * allow it to perform a single pseudo-header+checksum + * calculation on behalf of TCP. + * Include the adjustment for a source route once IP_OPTIONS is set. */ - tcp->tcp_sum = 0; - if ((tcp->tcp_state >= TCPS_SYN_SENT) && - (tcp->tcp_ipp_fields & IPPF_RTHDR)) { - ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, - (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, - rth, tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - } - } - - /* Try to get everything in a single mblk */ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - hdrs_len + tcps->tcps_wroff_xtra); - return (0); -} - -/* - * Transfer any source route option from ipha to buf/dst in reversed form. - */ -static int -tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) -{ - ipoptp_t opts; - uchar_t *opt; - uint8_t optval; - uint8_t optlen; - uint32_t len = 0; - - for (optval = ipoptp_first(&opts, ipha); - optval != IPOPT_EOL; - optval = ipoptp_next(&opts)) { - opt = opts.ipoptp_cur; - optlen = opts.ipoptp_len; - switch (optval) { - int off1, off2; - case IPOPT_SSRR: - case IPOPT_LSRR: - - /* Reverse source route */ - /* - * First entry should be the next to last one in the - * current source route (the last entry is our - * address.) - * The last entry should be the final destination. - */ - buf[IPOPT_OPTVAL] = (uint8_t)optval; - buf[IPOPT_OLEN] = (uint8_t)optlen; - off1 = IPOPT_MINOFF_SR - 1; - off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; - if (off2 < 0) { - /* No entries in source route */ - break; - } - bcopy(opt + off2, dst, IP_ADDR_LEN); - /* - * Note: use src since ipha has not had its src - * and dst reversed (it is in the state it was - * received. 
- */ - bcopy(&ipha->ipha_src, buf + off2, - IP_ADDR_LEN); - off2 -= IP_ADDR_LEN; - - while (off2 > 0) { - bcopy(opt + off2, buf + off1, - IP_ADDR_LEN); - off1 += IP_ADDR_LEN; - off2 -= IP_ADDR_LEN; - } - buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; - buf += optlen; - len += optlen; - break; - } - } -done: - /* Pad the resulting options */ - while (len & 0x3) { - *buf++ = IPOPT_EOL; - len++; - } - return (len); -} - - -/* - * Extract and revert a source route from ipha (if any) - * and then update the relevant fields in both tcp_t and the standard header. - */ -static void -tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) -{ - char buf[TCP_MAX_HDR_LENGTH]; - uint_t tcph_len; - int len; - - ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); - len = IPH_HDR_LENGTH(ipha); - if (len == IP_SIMPLE_HDR_LENGTH) - /* Nothing to do */ - return; - if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || - (len & 0x3)) - return; - - tcph_len = tcp->tcp_tcp_hdr_len; - bcopy(tcp->tcp_tcph, buf, tcph_len); - tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff); - len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + - IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); - len += IP_SIMPLE_HDR_LENGTH; - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); - tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); - bcopy(buf, tcp->tcp_tcph, tcph_len); - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - len += tcph_len; - tcp->tcp_hdr_len = len; -} - -/* - * Copy the standard header into its new location, - * lay in the new options and then update the relevant - * fields in both tcp_t and the standard header. 
- */ -static int -tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) -{ - uint_t tcph_len; - uint8_t *ip_optp; - tcph_t *new_tcph; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) - return (EINVAL); - - if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) - return (EINVAL); - - if (checkonly) { - /* - * do not really set, just pretend to - T_CHECK - */ - return (0); - } + cksum = sizeof (tcpha_t) + connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + tcpha->tha_sum = htons(cksum); - ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; - if (tcp->tcp_label_len > 0) { - int padlen; - uint8_t opt; + if (connp->conn_ipversion == IPV4_VERSION) + tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc; + else + tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc; - /* convert list termination to no-ops */ - padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; - ip_optp += ip_optp[IPOPT_OLEN]; - opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; - while (--padlen >= 0) - *ip_optp++ = opt; - } - tcph_len = tcp->tcp_tcp_hdr_len; - new_tcph = (tcph_t *)(ip_optp + len); - ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); - tcp->tcp_tcph = new_tcph; - bcopy(ptr, ip_optp, len); - - len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; - - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - tcp->tcp_hdr_len = len + tcph_len; - if (!TCP_IS_DETACHED(tcp)) { - /* Always allocate room for all options. 
*/ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); + if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra > + connp->conn_wroff) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } return (0); } @@ -11184,36 +8624,6 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) nd_free(ndp); return (B_FALSE); } - tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_head_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_tail_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_max_pbufs_param)) { - nd_free(ndp); - return (B_FALSE); - } if (!nd_load(ndp, "tcp_extra_priv_ports", tcp_extra_priv_ports_get, NULL, NULL)) { nd_free(ndp); @@ -11248,7 +8658,7 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) return (B_TRUE); } -/* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ +/* ndd set routine for tcp_wroff_xtra. 
*/ /* ARGSUSED */ static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -11307,6 +8717,7 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) uint32_t u1; tcp_stack_t *tcps = tcp->tcp_tcps; + /* Walk through all the new pieces. */ do { ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= @@ -11433,9 +8844,10 @@ tcp_rwnd_reopen(tcp_t *tcp) { uint_t ret = 0; uint_t thwin; + conn_t *connp = tcp->tcp_connp; /* Learn the latest rwnd information that we sent to the other side. */ - thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) << tcp->tcp_rcv_ws; /* This is peer's calculated send window (our receive window). */ thwin -= tcp->tcp_rnxt - tcp->tcp_rack; @@ -11444,7 +8856,7 @@ tcp_rwnd_reopen(tcp_t *tcp) * SWS avoidance. This means that we need to check the increase of * of receive window is at least 1 MSS. */ - if (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss) { + if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { /* * If the window that the other side knows is less than max * deferred acks segments, send an update immediately. @@ -11453,7 +8865,7 @@ tcp_rwnd_reopen(tcp_t *tcp) BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); ret = TH_ACK_NEEDED; } - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; } return (ret); } @@ -11469,7 +8881,7 @@ tcp_rcv_drain(tcp_t *tcp) #ifdef DEBUG uint_t cnt = 0; #endif - queue_t *q = tcp->tcp_rq; + queue_t *q = tcp->tcp_connp->conn_rq; /* Can't drain on an eager connection */ if (tcp->tcp_listener != NULL) @@ -11511,7 +8923,7 @@ tcp_rcv_drain(tcp_t *tcp) if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, NULL); continue; } putnext(q, mp); @@ -11538,11 +8950,22 @@ tcp_rcv_drain(tcp_t *tcp) * Other messages are added as new (b_next) elements. 
*/ void -tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) +tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) { ASSERT(seg_len == msgdsize(mp)); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); + if (is_system_labeled()) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. + */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + if (tcp->tcp_rcv_list == NULL) { ASSERT(tcp->tcp_rcv_last_head == NULL); tcp->tcp_rcv_list = mp; @@ -11562,176 +8985,6 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) tcp->tcp_rwnd -= seg_len; } -/* - * DEFAULT TCP ENTRY POINT via squeue on READ side. - * - * This is the default entry function into TCP on the read side. TCP is - * always entered via squeue i.e. using squeue's for mutual exclusion. - * When classifier does a lookup to find the tcp, it also puts a reference - * on the conn structure associated so the tcp is guaranteed to exist - * when we come here. We still need to check the state because it might - * as well has been closed. The squeue processing function i.e. squeue_enter, - * is responsible for doing the CONN_DEC_REF. - * - * Apart from the default entry point, IP also sends packets directly to - * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming - * connections. - */ -boolean_t tcp_outbound_squeue_switch = B_FALSE; -void -tcp_input(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = (tcp_t *)connp->conn_tcp; - - /* arg2 is the sqp */ - ASSERT(arg2 != NULL); - ASSERT(mp != NULL); - - /* - * Don't accept any input on a closed tcp as this TCP logically does - * not exist on the system. Don't proceed further with this TCP. - * For eg. this packet could trigger another close of this tcp - * which would be disastrous for tcp_refcnt. 
tcp_close_detached / - * tcp_clean_death / tcp_closei_local must be called at most once - * on a TCP. In this case we need to refeed the packet into the - * classifier and figure out where the packet should go. Need to - * preserve the recv_ill somehow. Until we figure that out, for - * now just drop the packet if we can't classify the packet. - */ - if (tcp->tcp_state == TCPS_CLOSED || - tcp->tcp_state == TCPS_BOUND) { - conn_t *new_connp; - ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip; - - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); - if (new_connp != NULL) { - tcp_reinput(new_connp, mp, arg2); - return; - } - /* We failed to classify. For now just drop the packet */ - freemsg(mp); - return; - } - - if (DB_TYPE(mp) != M_DATA) { - tcp_rput_common(tcp, mp); - return; - } - - if (mp->b_datap->db_struioflag & STRUIO_CONNECT) { - squeue_t *final_sqp; - - mp->b_datap->db_struioflag &= ~STRUIO_CONNECT; - final_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - if (tcp->tcp_state == TCPS_SYN_SENT && - connp->conn_final_sqp == NULL && - tcp_outbound_squeue_switch) { - ASSERT(connp->conn_initial_sqp == connp->conn_sqp); - connp->conn_final_sqp = final_sqp; - if (connp->conn_final_sqp != connp->conn_sqp) { - CONN_INC_REF(connp); - SQUEUE_SWITCH(connp, connp->conn_final_sqp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_rput_data, connp, ip_squeue_flag, - SQTAG_CONNECT_FINISH); - return; - } - } - } - tcp_rput_data(connp, mp, arg2); -} - -/* - * The read side put procedure. - * The packets passed up by ip are assume to be aligned according to - * OK_32PTR and the IP+TCP headers fitting in the first mblk. - */ -static void -tcp_rput_common(tcp_t *tcp, mblk_t *mp) -{ - /* - * tcp_rput_data() does not expect M_CTL except for the case - * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO - * type. 
Need to make sure that any other M_CTLs don't make - * it to tcp_rput_data since it is not expecting any and doesn't - * check for it. - */ - if (DB_TYPE(mp) == M_CTL) { - switch (*(uint32_t *)(mp->b_rptr)) { - case TCP_IOC_ABORT_CONN: - /* - * Handle connection abort request. - */ - tcp_ioctl_abort_handler(tcp, mp); - return; - case IPSEC_IN: - /* - * Only secure icmp arrive in TCP and they - * don't go through data path. - */ - tcp_icmp_error(tcp, mp); - return; - case IN_PKTINFO: - /* - * Handle IPV6_RECVPKTINFO socket option on AF_INET6 - * sockets that are receiving IPv4 traffic. tcp - */ - ASSERT(tcp->tcp_family == AF_INET6); - ASSERT(tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVPKTINFO); - tcp_rput_data(tcp->tcp_connp, mp, - tcp->tcp_connp->conn_sqp); - return; - case MDT_IOC_INFO_UPDATE: - /* - * Handle Multidata information update; the - * following routine will free the message. - */ - if (tcp->tcp_connp->conn_mdt_ok) { - tcp_mdt_update(tcp, - &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, - B_FALSE); - } - freemsg(mp); - return; - case LSO_IOC_INFO_UPDATE: - /* - * Handle LSO information update; the following - * routine will free the message. - */ - if (tcp->tcp_connp->conn_lso_ok) { - tcp_lso_update(tcp, - &((ip_lso_info_t *)mp->b_rptr)->lso_capab); - } - freemsg(mp); - return; - default: - /* - * tcp_icmp_err() will process the M_CTL packets. - * Non-ICMP packets, if any, will be discarded in - * tcp_icmp_err(). We will process the ICMP packet - * even if we are TCP_IS_DETACHED_NONEAGER as the - * incoming ICMP packet may result in changing - * the tcp_mss, which we would need if we have - * packets to retransmit. - */ - tcp_icmp_error(tcp, mp); - return; - } - } - - /* No point processing the message if tcp is already closed */ - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - freemsg(mp); - return; - } - - tcp_rput_other(tcp, mp); -} - - /* The minimum of smoothed mean deviation in RTO calculation. 
*/ #define TCP_SD_MIN 400 @@ -11885,12 +9138,12 @@ tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) * segments. A segment is eligible if sack_cnt for that segment is greater * than or equal tcp_dupack_fast_retransmit. After it has retransmitted * all eligible segments, it checks to see if TCP can send some new segments - * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). + * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). * * Parameters: * tcp_t *tcp: the tcp structure of the connection. * uint_t *flags: in return, appropriate value will be set for - * tcp_rput_data(). + * tcp_input_data(). */ static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) @@ -11988,7 +9241,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) tcp->tcp_pipe += seg_len; tcp->tcp_sack_snxt = begin + seg_len; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); /* * Update the send timestamp to avoid false retransmission. @@ -12012,96 +9265,8 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) } /* - * This function handles policy checking at TCP level for non-hard_bound/ - * detached connections. - */ -static boolean_t -tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, - boolean_t secure, boolean_t mctl_present) -{ - ipsec_latch_t *ipl = NULL; - ipsec_action_t *act = NULL; - mblk_t *data_mp; - ipsec_in_t *ii; - const char *reason; - kstat_named_t *counter; - tcp_stack_t *tcps = tcp->tcp_tcps; - ipsec_stack_t *ipss; - ip_stack_t *ipst; - - ASSERT(mctl_present || !secure); - - ASSERT((ipha == NULL && ip6h != NULL) || - (ip6h == NULL && ipha != NULL)); - - /* - * We don't necessarily have an ipsec_in_act action to verify - * policy because of assymetrical policy where we have only - * outbound policy and no inbound policy (possible with global - * policy). 
- */ - if (!secure) { - if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || - act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) - return (B_TRUE); - ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_clear), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * We have a secure packet. - */ - if (act == NULL) { - ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_secure), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * XXX This whole routine is currently incorrect. ipl should - * be set to the latch pointer, but is currently not set, so - * we initialize it to NULL to avoid picking up random garbage. - */ - if (ipl == NULL) - return (B_TRUE); - - data_mp = first_mp->b_cont; - - ii = (ipsec_in_t *)first_mp->b_rptr; - - ipst = tcps->tcps_netstack->netstack_ip; - - if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, - &counter, tcp->tcp_connp)) { - BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - return (B_TRUE); - } - (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, - "tcp inbound policy mismatch: %s, packet dropped\n", - reason); - BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, - &tcps->tcps_dropper); - return (B_FALSE); -} - -/* - * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start - * retransmission after a timeout. + * tcp_ss_rexmit() is called to do slow start retransmission after a timeout + * or ICMP errors. * * To limit the number of duplicate segments, we limit the number of segment * to be sent in one time to tcp_snd_burst, the burst variable. 
@@ -12150,7 +9315,7 @@ tcp_ss_rexmit(tcp_t *tcp) if (xmit_mp == NULL) return; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); snxt += cnt; win -= cnt; @@ -12184,7 +9349,7 @@ tcp_ss_rexmit(tcp_t *tcp) /* * Process all TCP option in SYN segment. Note that this function should - * be called after tcp_adapt_ire() is called so that the necessary info + * be called after tcp_set_destination() is called so that the necessary info * from IRE is already set in the tcp structure. * * This function sets up the correct tcp_mss value according to the @@ -12194,16 +9359,17 @@ tcp_ss_rexmit(tcp_t *tcp) * should do the appropriate change. */ void -tcp_process_options(tcp_t *tcp, tcph_t *tcph) +tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) { int options; tcp_opt_t tcpopt; uint32_t mss_max; char *tmp_tcph; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcpopt.tcp = NULL; - options = tcp_parse_options(tcph, &tcpopt); + options = tcp_parse_options(tcpha, &tcpopt); /* * Process MSS option. Note that MSS option value does not account @@ -12212,12 +9378,12 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * IPv6. */ if (!(options & TCP_OPT_MSS_PRESENT)) { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; else tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; } else { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -12240,23 +9406,23 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) /* Process Timestamp option. 
*/ if ((options & TCP_OPT_TSTAMP_PRESENT) && (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { - tmp_tcph = (char *)tcp->tcp_tcph; + tmp_tcph = (char *)tcp->tcp_tcpha; tcp->tcp_snd_ts_ok = B_TRUE; tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; tcp->tcp_last_rcv_lbolt = lbolt64; ASSERT(OK_32PTR(tmp_tcph)); - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); /* Fill in our template header with basic timestamp option. */ - tmp_tcph += tcp->tcp_tcp_hdr_len; + tmp_tcph += connp->conn_ht_ulp_len; tmp_tcph[0] = TCPOPT_NOP; tmp_tcph[1] = TCPOPT_NOP; tmp_tcph[2] = TCPOPT_TSTAMP; tmp_tcph[3] = TCPOPT_TSTAMP_LEN; - tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); + connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); } else { tcp->tcp_snd_ts_ok = B_FALSE; } @@ -12266,12 +9432,11 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * then allocate the SACK info structure. Note the following ways * when tcp_snd_sack_ok is set to true. * - * For active connection: in tcp_adapt_ire() called in - * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted - * is checked. + * For active connection: in tcp_set_destination() called in + * tcp_connect(). * - * For passive connection: in tcp_adapt_ire() called in - * tcp_accept_comm(). + * For passive connection: in tcp_set_destination() called in + * tcp_input_listener(). * * That's the reason why the extra TCP_IS_DETACHED() check is there. * That check makes sure that if we did not send a SACK OK option, @@ -12320,7 +9485,8 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * Now we know the exact TCP/IP header length, subtract * that from tcp_mss to get our side's MSS. 
*/ - tcp->tcp_mss -= tcp->tcp_hdr_len; + tcp->tcp_mss -= connp->conn_ht_iphc_len; + /* * Here we assume that the other side's header size will be equal to * our header size. We calculate the real MSS accordingly. Need to @@ -12328,22 +9494,29 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) */ - tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - - ((tcp->tcp_ipversion == IPV4_VERSION ? + tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + + tcp->tcp_ipsec_overhead - + ((connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); /* * Set MSS to the smaller one of both ends of the connection. * We should not have called tcp_mss_set() before, but our * side of the MSS should have been set to a proper value - * by tcp_adapt_ire(). tcp_mss_set() will also set up the + * by tcp_set_destination(). tcp_mss_set() will also set up the * STREAM head parameters properly. * * If we have a larger-than-16-bit window but the other side * didn't want to do window scale, tcp_rwnd_set() will take * care of that. */ - tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss), B_TRUE); + tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); + + /* + * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been + * updated properly. + */ + SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); } /* @@ -12410,7 +9583,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) tcp_t *tail; /* - * The eager already has an extra ref put in tcp_rput_data + * The eager already has an extra ref put in tcp_input_data * so that it stays till accept comes back even though it * might get into TCPS_CLOSED as a result of a TH_RST etc. */ @@ -12496,8 +9669,8 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) * remote host. This proves the IP addr is good. * Cache it! 
*/ - addr_cache[IP_ADDR_CACHE_HASH( - tcp->tcp_remote)] = tcp->tcp_remote; + addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = + tcp->tcp_connp->conn_faddr_v4; } mutex_exit(&listener->tcp_eager_lock); if (need_send_conn_ind) @@ -12513,17 +9686,16 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) { if (IPCL_IS_NONSTR(lconnp)) { cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); + pid_t cpid = NOPID; ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); ASSERT(econnp->conn_tcp->tcp_saved_listener == lconnp->conn_tcp); + cr = msg_getcred(mp, &cpid); + /* Keep the message around in case of a fallback to TPI */ econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; - /* * Notify the ULP about the newconn. It is guaranteed that no * tcp_accept() call will be made for the eager if the @@ -12545,177 +9717,83 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) econnp->conn_tcp->tcp_conn_req_seqnum); } } else { - putnext(lconnp->conn_tcp->tcp_rq, mp); + putnext(lconnp->conn_rq, mp); } } -mblk_t * -tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, - uint_t *ifindexp, ip6_pkt_t *ippp) +/* + * Handle a packet that has been reclassified by TCP. + * This function drops the ref on connp that the caller had. 
+ */ +static void +tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - ip_pktinfo_t *pinfo; - ip6_t *ip6h; - uchar_t *rptr; - mblk_t *first_mp = mp; - boolean_t mctl_present = B_FALSE; - uint_t ifindex = 0; - ip6_pkt_t ipp; - uint_t ipvers; - uint_t ip_hdr_len; - tcp_stack_t *tcps = tcp->tcp_tcps; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - rptr = mp->b_rptr; - ASSERT(OK_32PTR(rptr)); - ASSERT(tcp != NULL); - ipp.ipp_fields = 0; + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + freemsg(mp); + CONN_DEC_REF(connp); + return; + } - switch DB_TYPE(mp) { - case M_CTL: - mp = mp->b_cont; - if (mp == NULL) { - freemsg(first_mp); - return (NULL); + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip6_t *ip6h; + ipha_t *ipha; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + ip6h = NULL; + } else { + ipha = NULL; + ip6h = (ip6_t *)mp->b_rptr; } - if (DB_TYPE(mp) != M_DATA) { - freemsg(first_mp); - return (NULL); + mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + CONN_DEC_REF(connp); + return; } - mctl_present = B_TRUE; - break; - case M_DATA: - break; - default: - cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); - freemsg(mp); - return (NULL); } - ipvers = IPH_HDR_VERSION(rptr); - if (ipvers == IPV4_VERSION) { - if (tcp == NULL) { - ip_hdr_len = IPH_HDR_LENGTH(rptr); - goto done; - } - - ipp.ipp_fields |= IPPF_HOPLIMIT; - ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; + if (IPCL_IS_TCP(connp)) { /* - * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary - * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
+ * do not drain, certain use cases can blow + * the stack */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && - mctl_present) { - pinfo = (ip_pktinfo_t *)first_mp->b_rptr; - if ((MBLKL(first_mp) == sizeof (ip_pktinfo_t)) && - (pinfo->ip_pkt_ulp_type == IN_PKTINFO) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; - ifindex = pinfo->ip_pkt_ifindex; - } - freeb(first_mp); - mctl_present = B_FALSE; - } - ip_hdr_len = IPH_HDR_LENGTH(rptr); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, + SQ_NODRAIN, SQTAG_IP_TCP_INPUT); } else { - ip6h = (ip6_t *)rptr; - - ASSERT(ipvers == IPV6_VERSION); - ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; - ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; - ipp.ipp_hoplimit = ip6h->ip6_hops; - - if (ip6h->ip6_nxt != IPPROTO_TCP) { - uint8_t nexthdrp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i = (ip6i_t *)ip6h; - if ((uchar_t *)&ip6i[1] > mp->b_wptr) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mblk_t *mp1; - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - if (MBLKL(mp) < IPV6_HDR_LEN + - sizeof (tcph_t)) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - ip6h = (ip6_t *)rptr; - } - - /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. 
- */ - ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); - /* Verify if this is a TCP packet */ - if (nexthdrp != IPPROTO_TCP) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - } else { - ip_hdr_len = IPV6_HDR_LEN; - } + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, + ira); + CONN_DEC_REF(connp); } -done: - if (ipversp != NULL) - *ipversp = ipvers; - if (ip_hdr_lenp != NULL) - *ip_hdr_lenp = ip_hdr_len; - if (ippp != NULL) - *ippp = ipp; - if (ifindexp != NULL) - *ifindexp = ifindex; - if (mctl_present) { - freeb(first_mp); - } - return (mp); } +boolean_t tcp_outbound_squeue_switch = B_FALSE; + /* * Handle M_DATA messages from IP. Its called directly from IP via - * squeue for AF_INET type sockets fast path. No M_CTL are expected - * in this path. - * - * For everything else (including AF_INET6 sockets with 'tcp_ipversion' - * v4 and v6), we are called through tcp_input() and a M_CTL can - * be present for options but tcp_find_pktinfo() deals with it. We - * only expect M_DATA packets after tcp_find_pktinfo() is done. + * squeue for received IP packets. * * The first argument is always the connp/tcp to which the mp belongs. * There are no exceptions to this rule. The caller has already put - * a reference on this connp/tcp and once tcp_rput_data() returns, + * a reference on this connp/tcp and once tcp_input_data() returns, * the squeue will do the refrele. * - * The TH_SYN for the listener directly go to tcp_conn_request via - * squeue. + * The TH_SYN for the listener directly go to tcp_input_listener via + * squeue. ICMP errors go directly to tcp_icmp_input(). 
* * sqp: NULL = recursive, sqp != NULL means called from squeue */ void -tcp_rput_data(void *arg, mblk_t *mp, void *arg2) +tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -12729,11 +9807,10 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) int seg_len; uint_t ip_hdr_len; uint32_t seg_seq; - tcph_t *tcph; + tcpha_t *tcpha; int urp; tcp_opt_t tcpopt; - uint_t ipvers; - ip6_pkt_t ipp; + ip_pkt_t ipp; boolean_t ofo_seg = B_FALSE; /* Out of order segment */ uint32_t cwnd; uint32_t add; @@ -12756,33 +9833,43 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) rptr = mp->b_rptr; ASSERT(OK_32PTR(rptr)); - /* - * An AF_INET socket is not capable of receiving any pktinfo. Do inline - * processing here. For rest call tcp_find_pktinfo to fill up the - * necessary information. - */ - if (IPCL_IS_TCP4(connp)) { - ipvers = IPV4_VERSION; - ip_hdr_len = IPH_HDR_LENGTH(rptr); - } else { - mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, - NULL, &ipp); - if (mp == NULL) { - TCP_STAT(tcps, tcp_rput_v6_error); - return; + ip_hdr_len = ira->ira_ip_hdr_length; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + ipp.ipp_fields = 0; + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_TCP); + + /* Could have caused a pullup? 
*/ + iphdr = mp->b_rptr; + rptr = mp->b_rptr; } - iphdr = mp->b_rptr; - rptr = mp->b_rptr; } ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(mp->b_next == NULL); - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = ABE32_TO_U32(tcph->th_seq); - seg_ack = ABE32_TO_U32(tcph->th_ack); + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); seg_len = (int)(mp->b_wptr - rptr) - - (ip_hdr_len + TCP_HDR_LENGTH(tcph)); + (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { do { ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= @@ -12794,7 +9881,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, - seg_len, tcph); + seg_len, tcpha, ira); return; } @@ -12809,7 +9896,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_last_recv_time = lbolt; } - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); @@ -12840,7 +9927,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Update pointers into message */ iphdr = rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { /* * Since we can't handle any data with this urgent @@ -12849,13 +9936,29 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * the urgent mark and generate the M_PCSIG, * which we can do. 
*/ - mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); + mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); seg_len = 0; } } switch (tcp->tcp_state) { case TCPS_SYN_SENT: + if (connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch && sqp != NULL) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + DTRACE_PROBE1(conn__final__sqp__switch, + conn_t *, connp); + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_input_data, connp, ira, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); + } if (flags & TH_ACK) { /* * Note that our stack cannot send data before a @@ -12887,13 +9990,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Process all TCP options. */ - tcp_process_options(tcp, tcph); + tcp_process_options(tcp, tcpha); /* * The following changes our rwnd to be a multiple of the * MIN(peer MSS, our MSS) for performance reason. */ - (void) tcp_rwnd_set(tcp, - MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss)); + (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, + tcp->tcp_mss)); /* Is the other end ECN capable? */ if (tcp->tcp_ecn_ok) { @@ -12910,21 +10013,17 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_irs = seg_seq; tcp->tcp_rack = seg_seq; tcp->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (!TCP_IS_DETACHED(tcp)) { /* Allocate room for SACK options if needed. */ - if (tcp->tcp_snd_sack_ok) { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - TCPOPT_MAX_SACK_LEN + - (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra)); - } else { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - (tcp->tcp_loopback ? 
0 : - tcps->tcps_wroff_xtra)); - } + connp->conn_wroff = connp->conn_ht_iphc_len; + if (tcp->tcp_snd_sack_ok) + connp->conn_wroff += TCPOPT_MAX_SACK_LEN; + if (!tcp->tcp_loopback) + connp->conn_wroff += tcps->tcps_wroff_xtra; + + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } if (flags & TH_ACK) { /* @@ -12944,15 +10043,14 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * sending up connection confirmation */ tcp->tcp_state = TCPS_ESTABLISHED; - if (!tcp_conn_con(tcp, iphdr, tcph, mp, - tcp->tcp_loopback ? &mp1 : NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, + tcp->tcp_loopback ? &mp1 : NULL, ira)) { tcp->tcp_state = TCPS_SYN_SENT; freemsg(mp); return; } /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* One for the SYN */ tcp->tcp_suna = tcp->tcp_iss + 1; @@ -12983,7 +10081,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_swl1 = seg_seq; tcp->tcp_swl2 = seg_ack; - new_swnd = BE16_TO_U16(tcph->th_win); + new_swnd = ntohs(tcpha->tha_win); tcp->tcp_swnd = new_swnd; if (new_swnd > tcp->tcp_max_swnd) tcp->tcp_max_swnd = new_swnd; @@ -13022,22 +10120,25 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_ack_tid); tcp->tcp_ack_tid = 0; } - tcp_send_data(tcp, tcp->tcp_wq, ack_mp); + tcp_send_data(tcp, ack_mp); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); if (!IPCL_IS_NONSTR(connp)) { /* Send up T_CONN_CON */ - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, + ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls-> su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, + ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } @@ -13054,15 +10155,16 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) TCP_STAT(tcps, 
tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; if (!IPCL_IS_NONSTR(connp)) { - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } } @@ -13089,13 +10191,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (mp1) { - /* - * See comment in tcp_conn_request() for why we use - * the open() time pid here. - */ - DB_CPID(mp1) = tcp->tcp_cpid; - tcp_send_data(tcp, tcp->tcp_wq, mp1); + if (mp1 != NULL) { + tcp_send_data(tcp, mp1); TCP_TIMER_RESTART(tcp, tcp->tcp_rto); } freemsg(mp); @@ -13146,9 +10243,20 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) conn_t *new_connp; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); + /* + * Don't accept any input on a closed tcp as this TCP logically + * does not exist on the system. Don't proceed further with + * this TCP. For instance, this packet could trigger another + * close of this tcp which would be disastrous for tcp_refcnt. + * tcp_close_detached / tcp_clean_death / tcp_closei_local must + * be called at most once on a TCP. In this case we need to + * refeed the packet into the classifier and figure out where + * the packet should go. + */ + new_connp = ipcl_classify(mp, ira, ipst); if (new_connp != NULL) { - tcp_reinput(new_connp, mp, connp->conn_sqp); + /* Drops ref on new_connp */ + tcp_reinput(new_connp, mp, ira, ipst); return; } /* We failed to classify. 
For now just drop the packet */ @@ -13194,7 +10302,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_kssl_ctx = NULL; tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); flags |= TH_ACK_NEEDED; goto ack_check; } @@ -13205,13 +10313,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) return; } - mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); - urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); + urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { /* * This segment is not acceptable. * Drop it and send back an ACK. @@ -13227,7 +10335,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * SACK info in already updated in tcp_parse_options. Ignore * all other TCP options... */ - (void) tcp_parse_options(tcph, &tcpopt); + (void) tcp_parse_options(tcpha, &tcpopt); } try_again:; mss = tcp->tcp_mss; @@ -13289,7 +10397,7 @@ try_again:; * Adjust seg_len to the original value for tracing. */ seg_len -= gap; - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: unacceptable, gap %d, rgap %d, " "flags 0x%x, seg_seq %u, seg_ack %u, " @@ -13436,7 +10544,7 @@ try_again:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13505,7 +10613,7 @@ ok:; * same segment. In this case, we once again turn * on ECN_ECHO. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { @@ -13705,7 +10813,7 @@ ok:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, M_PCSIG, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13739,7 +10847,7 @@ ok:; } else if (tcp->tcp_urp_mark_mp != NULL) { /* * An allocation failure prevented the previous - * tcp_rput_data from sending up the allocated + * tcp_input_data from sending up the allocated * MSG*MARKNEXT message - send it up this time * around. */ @@ -13775,14 +10883,14 @@ ok:; */ (void) adjmsg(mp, urp - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp - seg_len); /* Feed this piece back in. */ tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13811,13 +10919,13 @@ ok:; */ (void) adjmsg(mp, urp + 1 - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp + 1 - seg_len); tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13831,7 +10939,7 @@ ok:; return; } } - tcp_rput_data(connp, mp, NULL); + tcp_input_data(connp, mp, NULL, ira); return; } /* @@ -13960,7 +11068,7 @@ process_ack: } bytes_acked = (int)(seg_ack - tcp->tcp_suna); - if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) + if (bytes_acked > 0) tcp->tcp_ip_forward_progress = B_TRUE; if (tcp->tcp_state == TCPS_SYN_RCVD) { if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && @@ -13983,7 +11091,7 @@ process_ack: /* * The listener also exists because of the refhold - * done in tcp_conn_request. 
Its possible that it + * done in tcp_input_listener. Its possible that it * might have closed. We will check that once we * get inside listeners context. */ @@ -14005,12 +11113,12 @@ process_ack: } else if (!tcp->tcp_loopback) { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_FILL, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } else { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_PROCESS, + listener->tcp_connp, NULL, SQ_PROCESS, SQTAG_TCP_CONN_IND); } } @@ -14026,7 +11134,7 @@ process_ack: */ tcp->tcp_state = TCPS_ESTABLISHED; if (tcp->tcp_active_open) { - if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { freemsg(mp); tcp->tcp_state = TCPS_SYN_RCVD; return; @@ -14044,8 +11152,7 @@ process_ack: tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ bytes_acked--; /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* * If SYN was retransmitted, need to reset all @@ -14083,7 +11190,7 @@ process_ack: /* Fuse when both sides are in ESTABLISHED state */ if (tcp->tcp_loopback && do_tcp_fusion) - tcp_fuse(tcp, iphdr, tcph); + tcp_fuse(tcp, iphdr, tcpha); } /* This code follows 4.4BSD-Lite2 mostly. */ @@ -14388,7 +11495,7 @@ process_ack: if (mp != NULL) { BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } return; } @@ -14487,7 +11594,6 @@ process_ack: } } else { tcp->tcp_rexmit = B_FALSE; - tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_rexmit_nxt = tcp->tcp_snxt; tcp->tcp_snd_burst = tcp->tcp_localnet ? 
TCP_CWND_INFINITE : TCP_CWND_NORMAL; @@ -14662,8 +11768,7 @@ fin_acked: tcp->tcp_xmit_tail = NULL; if (tcp->tcp_fin_sent) { /* FIN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION && - !tcp->tcp_fin_acked) + if (!tcp->tcp_fin_acked) tcp->tcp_ip_forward_progress = B_TRUE; tcp->tcp_fin_acked = B_TRUE; if (tcp->tcp_linger_tid != 0 && @@ -14781,7 +11886,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14805,8 +11910,8 @@ est: if (!tcp->tcp_fin_rcvd) { tcp->tcp_fin_rcvd = B_TRUE; tcp->tcp_rnxt++; - tcph = tcp->tcp_tcph; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha = tcp->tcp_tcpha; + tcpha->tha_ack = htonl(tcp->tcp_rnxt); /* * Generate the ordrel_ind at the end unless we @@ -14815,7 +11920,7 @@ est: * after tcp_accept is done. */ if (tcp->tcp_listener == NULL && - !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) + !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) flags |= TH_ORDREL_NEEDED; switch (tcp->tcp_state) { case TCPS_SYN_RCVD: @@ -14836,7 +11941,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14872,7 +11977,7 @@ est: freeb(mp1); } update_ack: - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; tcp->tcp_rack_cnt++; { uint32_t cur_max; @@ -14915,7 +12020,7 @@ update_ack: } } tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (mp == NULL) goto xmit_check; @@ -14942,12 +12047,13 @@ update_ack: /* * Check for ancillary data changes compared to last segment. 
*/ - if (tcp->tcp_ipv6_recvancillary != 0) { - mp = tcp_rput_add_ancillary(tcp, mp, &ipp); - ASSERT(mp != NULL); + if (connp->conn_recv_ancillary.crb_all != 0) { + mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); + if (mp == NULL) + return; } - if (tcp->tcp_listener || tcp->tcp_hard_binding) { + if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { /* * Side queue inbound data until the accept happens. * tcp_accept/tcp_rput drains this when the accept happens. @@ -14961,9 +12067,9 @@ update_ack: if (tcp->tcp_kssl_pending) { DTRACE_PROBE1(kssl_mblk__ksslinput_pending, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } } else if (IPCL_IS_NONSTR(connp)) { /* @@ -15015,19 +12121,22 @@ update_ack: (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_data1, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { /* Does this need SSL processing first? */ DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else if ((flags & (TH_PUSH|TH_FIN)) || - tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) { + tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { if (tcp->tcp_rcv_list != NULL) { /* * Enqueue the new segment first and then @@ -15042,11 +12151,15 @@ update_ack: * canputnext() as tcp_rcv_drain() needs to * call canputnext(). 
*/ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, + ira->ira_cred); flags |= tcp_rcv_drain(tcp); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else { @@ -15054,7 +12167,7 @@ update_ack: * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. */ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } /* * Make sure the timer is running if we have data waiting @@ -15103,7 +12216,7 @@ xmit_check: BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, snd_size); - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); } } if (flags & TH_NEED_SACK_REXMIT) { @@ -15155,7 +12268,10 @@ ack_check: ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; - putnext(tcp->tcp_rq, mp1); + if (is_system_labeled()) + tcp_setcred_data(mp1, ira); + + putnext(connp->conn_rq, mp1); #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", @@ -15172,7 +12288,7 @@ ack_check: mp1 = tcp_ack_mp(tcp); if (mp1 != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } @@ -15200,6 +12316,7 @@ ack_check: * after tcp_accept is done. 
*/ ASSERT(tcp->tcp_listener == NULL); + ASSERT(!tcp->tcp_detached); if (IPCL_IS_NONSTR(connp)) { ASSERT(tcp->tcp_ordrel_mp == NULL); @@ -15220,7 +12337,7 @@ ack_check: mp1 = tcp->tcp_ordrel_mp; tcp->tcp_ordrel_mp = NULL; tcp->tcp_ordrel_done = B_TRUE; - putnext(tcp->tcp_rq, mp1); + putnext(connp->conn_rq, mp1); } done: ASSERT(!(flags & TH_MARKNEXT_NEEDED)); @@ -15251,21 +12368,22 @@ tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) * segment passes the PAWS test, else returns B_FALSE. */ boolean_t -tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) +tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) { uint8_t flags; int options; uint8_t *up; + conn_t *connp = tcp->tcp_connp; - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; /* * If timestamp option is aligned nicely, get values inline, * otherwise call general routine to parse. Only do that * if timestamp is the only option. */ - if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + + if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + TCPOPT_REAL_TS_LEN && - OK_32PTR((up = ((uint8_t *)tcph) + + OK_32PTR((up = ((uint8_t *)tcpha) + TCP_MIN_HEADER_LENGTH)) && *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); @@ -15278,7 +12396,7 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) } else { tcpoptp->tcp = NULL; } - options = tcp_parse_options(tcph, tcpoptp); + options = tcp_parse_options(tcpha, tcpoptp); } if (options & TCP_OPT_TSTAMP_PRESENT) { @@ -15311,16 +12429,15 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) */ tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); + connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); /* - * Adjust the tcp_mss 
accordingly. We also need to - * adjust tcp_cwnd here in accordance with the new mss. - * But we avoid doing a slow start here so as to not - * to lose on the transfer rate built up so far. + * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid + * doing a slow start here so as to not to lose on the + * transfer rate built up so far. */ - tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN, B_FALSE); + tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); if (tcp->tcp_snd_sack_ok) { ASSERT(tcp->tcp_sack_info != NULL); tcp->tcp_max_sack_blk = 4; @@ -15338,38 +12455,37 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) * when memory allocation fails we can just wait for the next data segment. */ static mblk_t * -tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) +tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + ip_recv_attr_t *ira) { struct T_optdata_ind *todi; int optlen; uchar_t *optptr; struct T_opthdr *toh; - uint_t addflag; /* Which pieces to add */ + crb_t addflag; /* Which pieces to add */ mblk_t *mp1; + conn_t *connp = tcp->tcp_connp; optlen = 0; - addflag = 0; + addflag.crb_all = 0; /* If app asked for pktinfo and the index has changed ... */ - if ((ipp->ipp_fields & IPPF_IFINDEX) && - ipp->ipp_ifindex != tcp->tcp_recvifindex && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != tcp->tcp_recvifindex) { optlen += sizeof (struct T_opthdr) + sizeof (struct in6_pktinfo); - addflag |= TCP_IPV6_RECVPKTINFO; + addflag.crb_ip_recvpktinfo = 1; } /* If app asked for hoplimit and it has changed ... 
*/ - if ((ipp->ipp_fields & IPPF_HOPLIMIT) && - ipp->ipp_hoplimit != tcp->tcp_recvhops && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != tcp->tcp_recvhops) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVHOPLIMIT; + addflag.crb_ipv6_recvhoplimit = 1; } /* If app asked for tclass and it has changed ... */ - if ((ipp->ipp_fields & IPPF_TCLASS) && - ipp->ipp_tclass != tcp->tcp_recvtclass && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != tcp->tcp_recvtclass) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVTCLASS; + addflag.crb_ipv6_recvtclass = 1; } /* * If app asked for hopbyhop headers and it has changed ... @@ -15377,51 +12493,51 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * a connected socket at all, (2) we're connected to at most one peer, * (3) if anything changes, then it must be some other extra option. */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; - addflag |= TCP_IPV6_RECVHOPOPTS; + optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + addflag.crb_ipv6_recvhopopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) return (mp); } /* If app asked for dst headers before routing headers ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && - ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { optlen += sizeof (struct T_opthdr) + - ipp->ipp_rtdstoptslen; - addflag |= TCP_IPV6_RECVRTDSTOPTS; - if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) + ipp->ipp_rthdrdstoptslen; + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) return (mp); } /* If app asked for routing headers and it has changed ... */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; - addflag |= TCP_IPV6_RECVRTHDR; + addflag.crb_ipv6_recvrthdr = 1; if (!ip_allocbuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) return (mp); } /* If app asked for dest headers and it has changed ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & - (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && + if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; - addflag |= TCP_IPV6_RECVDSTOPTS; + addflag.crb_ipv6_recvdstopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) @@ -15454,9 +12570,11 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. */ - if (addflag & TCP_IPV6_RECVPKTINFO) { + if (addflag.crb_ip_recvpktinfo) { struct in6_pktinfo *pkti; + uint_t ifindex; + ifindex = ira->ira_ruifindex; toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_PKTINFO; @@ -15464,19 +12582,15 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) toh->status = 0; optptr += sizeof (*toh); pkti = (struct in6_pktinfo *)optptr; - if (tcp->tcp_ipversion == IPV6_VERSION) - pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; - else - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp->ipp_ifindex; + pkti->ipi6_addr = connp->conn_laddr_v6; + pkti->ipi6_ifindex = ifindex; optptr += sizeof (*pkti); ASSERT(OK_32PTR(optptr)); /* Save as "last" value */ - tcp->tcp_recvifindex = ipp->ipp_ifindex; + tcp->tcp_recvifindex = ifindex; } /* If app asked for hoplimit and it has changed ... 
*/ - if (addflag & TCP_IPV6_RECVHOPLIMIT) { + if (addflag.crb_ipv6_recvhoplimit) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPLIMIT; @@ -15490,7 +12604,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) tcp->tcp_recvhops = ipp->ipp_hoplimit; } /* If app asked for tclass and it has changed ... */ - if (addflag & TCP_IPV6_RECVTCLASS) { + if (addflag.crb_ipv6_recvtclass) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_TCLASS; @@ -15503,40 +12617,38 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) /* Save as "last" value */ tcp->tcp_recvtclass = ipp->ipp_tclass; } - if (addflag & TCP_IPV6_RECVHOPOPTS) { + if (addflag.crb_ipv6_recvhopopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPOPTS; - toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; + toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, - ipp->ipp_hopoptslen - tcp->tcp_label_len); - optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; + bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); + optptr += ipp->ipp_hopoptslen; ASSERT(OK_32PTR(optptr)); /* Save as last value */ ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen); } - if (addflag & TCP_IPV6_RECVRTDSTOPTS) { + if (addflag.crb_ipv6_recvrthdrdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDRDSTOPTS; - toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; + toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); - optptr += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; 
ASSERT(OK_32PTR(optptr)); /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); + ip_savebuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); } - if (addflag & TCP_IPV6_RECVRTHDR) { + if (addflag.crb_ipv6_recvrthdr) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDR; @@ -15551,7 +12663,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen); } - if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { + if (addflag.crb_ipv6_recvdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_DSTOPTS; @@ -15570,99 +12682,13 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) return (mp); } -/* - * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA - * messages. 
- */ -void -tcp_rput_other(tcp_t *tcp, mblk_t *mp) -{ - uchar_t *rptr = mp->b_rptr; - queue_t *q = tcp->tcp_rq; - struct T_error_ack *tea; - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) - break; - tea = (struct T_error_ack *)rptr; - ASSERT(tea->PRIM_type != T_BIND_ACK); - ASSERT(tea->ERROR_prim != O_T_BIND_REQ && - tea->ERROR_prim != T_BIND_REQ); - switch (tea->PRIM_type) { - case T_ERROR_ACK: - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_TRACE|SL_ERROR, - "tcp_rput_other: case T_ERROR_ACK, " - "ERROR_prim == %d", - tea->ERROR_prim); - } - switch (tea->ERROR_prim) { - case T_SVR4_OPTMGMT_REQ: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - printf("T_SVR4_OPTMGMT_REQ failed " - "%d/%d - dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - } - if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && - tcp->tcp_drop_opt_ack_cnt > 0) { - printf("T_SVR4_OPTMGMT_REQ failed %d/%d " - "- dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - case T_OPTMGMT_ACK: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - default: - ASSERT(tea->ERROR_prim != T_UNBIND_REQ); - break; - } - break; - case M_FLUSH: - if (*rptr & FLUSHR) - flushq(q, FLUSHDATA); - break; - default: - /* M_CTL will be directly sent to tcp_icmp_error() */ - ASSERT(DB_TYPE(mp) != M_CTL); - break; - } - /* - * Make sure we set this bit before sending the ACK for - * bind. Otherwise accept could possibly run and free - * this tcp struct. 
- */ - ASSERT(q != NULL); - putnext(q, mp); -} - /* ARGSUSED */ static void -tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) +tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(!IPCL_IS_NONSTR(connp)); @@ -15683,7 +12709,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) if (canputnext(q)) { /* Not flow-controlled, open rwnd */ - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above @@ -15712,16 +12738,10 @@ tcp_rsrv(queue_t *q) conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; /* No code does a putq on the read side */ ASSERT(q->q_first == NULL); - /* Nothing to do for the default queue */ - if (q == tcps->tcps_g_q) { - return; - } - /* * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already * been run. So just return. @@ -15736,7 +12756,7 @@ tcp_rsrv(queue_t *q) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, - SQ_PROCESS, SQTAG_TCP_RSRV); + NULL, SQ_PROCESS, SQTAG_TCP_RSRV); } /* @@ -15746,8 +12766,8 @@ tcp_rsrv(queue_t *q) * * This function is called in 2 cases: * - * 1) Before data transfer begins, in tcp_accept_comm() for accepting a - * connection (passive open) and in tcp_rput_data() for active connect. + * 1) Before data transfer begins, in tcp_input_listener() for accepting a + * connection (passive open) and in tcp_input_data() for active connect. * This is called after tcp_mss_set() when the desired MSS value is known. * This makes sure that our window size is a mutiple of the other side's * MSS. 
@@ -15766,6 +12786,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) uint32_t max_transmittable_rwnd; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Insist on a receive window that is at least @@ -15782,7 +12803,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) ASSERT(peer_tcp != NULL); sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); if (!tcp_detached) { - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, + (void) proto_set_rx_hiwat(connp->conn_rq, connp, sth_hiwat); tcp_set_recv_threshold(tcp, sth_hiwat >> 3); } @@ -15797,11 +12818,10 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) return (sth_hiwat); } - if (tcp_detached) { + if (tcp_detached) old_max_rwnd = tcp->tcp_rwnd; - } else { - old_max_rwnd = tcp->tcp_recv_hiwater; - } + else + old_max_rwnd = connp->conn_rcvbuf; /* @@ -15854,9 +12874,14 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * connection.) */ tcp->tcp_rwnd += rwnd - old_max_rwnd; - tcp->tcp_recv_hiwater = rwnd; + connp->conn_rcvbuf = rwnd; + + /* Are we already connected? 
*/ + if (tcp->tcp_tcpha != NULL) { + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + } - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) tcp->tcp_cwnd_max = rwnd; @@ -15865,7 +12890,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) tcp_set_recv_threshold(tcp, rwnd >> 3); - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, rwnd); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd); return (rwnd); } @@ -15944,7 +12969,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp_t *tcp; boolean_t needattr; @@ -15992,11 +13017,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) needattr = B_TRUE; break; } - if (connp->conn_fully_bound && - connp->conn_effective_cred != NULL) { + if (connp->conn_ixa->ixa_tsl != NULL) { ts_label_t *tsl; - tsl = crgetlabel(connp->conn_effective_cred); + tsl = connp->conn_ixa->ixa_tsl; mlp.tme_flags |= MIB2_TMEF_IS_LABELED; mlp.tme_doi = label2doi(tsl); mlp.tme_label = *label2bslabel(tsl); @@ -16004,12 +13028,17 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) } /* Create a message to report on IPv6 entries */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; - tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; - tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); - tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); - tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; + if (connp->conn_ipversion == IPV6_VERSION) { + tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; + tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; + tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); + tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { + tce6.tcp6ConnIfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + tce6.tcp6ConnIfIndex = connp->conn_bound_if; + } /* 
Don't want just anybody seeing these... */ if (ispriv) { tce6.tcp6ConnEntryInfo.ce_snxt = @@ -16041,9 +13070,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; tce6.tcp6ConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce6.tcp6ConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce6.tcp6ConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp6_conn_ctl->b_cont, &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); @@ -16059,21 +13088,21 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) * but don't have IPV6_V6ONLY set. * (i.e. anything an IPv4 peer could connect to) */ - if (tcp->tcp_ipversion == IPV4_VERSION || + if (connp->conn_ipversion == IPV4_VERSION || (tcp->tcp_state <= TCPS_LISTEN && - !tcp->tcp_connp->conn_ipv6_v6only && - IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { - if (tcp->tcp_ipversion == IPV6_VERSION) { + !connp->conn_ipv6_v6only && + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { + if (connp->conn_ipversion == IPV6_VERSION) { tce.tcpConnRemAddress = INADDR_ANY; tce.tcpConnLocalAddress = INADDR_ANY; } else { tce.tcpConnRemAddress = - tcp->tcp_remote; + connp->conn_faddr_v4; tce.tcpConnLocalAddress = - tcp->tcp_ip_src; + connp->conn_laddr_v4; } - tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); - tce.tcpConnRemPort = ntohs(tcp->tcp_fport); + tce.tcpConnLocalPort = ntohs(connp->conn_lport); + tce.tcpConnRemPort = ntohs(connp->conn_fport); /* Don't want just anybody seeing these... */ if (ispriv) { tce.tcpConnEntryInfo.ce_snxt = @@ -16107,9 +13136,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tcp->tcp_state; tce.tcpConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce.tcpConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? 
+ MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce.tcpConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp_conn_ctl->b_cont, &mp_conn_tail, (char *)&tce, sizeof (tce)); @@ -16273,7 +13303,6 @@ tcp_timer(void *arg) tcp_t *listener = tcp->tcp_listener; if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { - ASSERT(tcp->tcp_rq == listener->tcp_rq); /* it's our first timeout */ tcp->tcp_syn_rcvd_timeout = 1; mutex_enter(&listener->tcp_eager_lock); @@ -16295,7 +13324,7 @@ tcp_timer(void *arg) cmn_err(CE_WARN, "High TCP connect timeout " "rate! System (port %d) may be under a " "SYN flood attack!", - BE16_TO_U16(listener->tcp_tcph->th_lport)); + ntohs(listener->tcp_connp->conn_lport)); listener->tcp_ip_addr_cache = kmem_zalloc( IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), @@ -16363,7 +13392,7 @@ tcp_timer(void *arg) * backoff. */ if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } @@ -16415,6 +13444,13 @@ tcp_timer(void *arg) * 3. But 1 and 3 are exclusive. */ if (tcp->tcp_unsent != 0) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + if (tcp->tcp_cwnd == 0) { /* * Set tcp_cwnd to 1 MSS so that a @@ -16477,7 +13513,7 @@ tcp_timer(void *arg) (void) tcp_clean_death(tcp, 0, 24); return; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_timer: strange state (%d) %s", tcp->tcp_state, tcp_display(tcp, NULL, @@ -16485,8 +13521,16 @@ tcp_timer(void *arg) } return; } + if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { /* + * Should not hold the zero-copy messages for too long. 
+ */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + /* * For zero window probe, we need to send indefinitely, * unless we have not heard from the other side for some * time... @@ -16529,11 +13573,13 @@ tcp_timer(void *arg) tcp->tcp_ms_we_have_waited = second_threshold; } } else if (ms > first_threshold) { - if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && - tcp->tcp_xmit_head != NULL) { - tcp->tcp_xmit_head = - tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); - } + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + /* * We have been retransmitting for too long... The RTT * we calculated is probably incorrect. Reinitialize it. @@ -16618,20 +13664,11 @@ tcp_timer(void *arg) if (mp == NULL) { return; } - /* - * Attach credentials to retransmitted initial SYNs. - * In theory we should use the credentials from the connect() - * call to ensure that getpeerucred() on the peer will be correct. - * But we assume that SYN's are not dropped for loopback connections. 
- */ - if (tcp->tcp_state == TCPS_SYN_SENT) { - mblk_setcred(mp, CONN_CRED(tcp->tcp_connp), tcp->tcp_cpid); - } tcp->tcp_csuna = tcp->tcp_snxt; BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } @@ -16639,7 +13676,6 @@ static int tcp_do_unbind(conn_t *connp) { tcp_t *tcp = connp->conn_tcp; - int error = 0; switch (tcp->tcp_state) { case TCPS_BOUND: @@ -16659,41 +13695,36 @@ tcp_do_unbind(conn_t *connp) } mutex_exit(&tcp->tcp_eager_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - tcp->tcp_ipha->ipha_src = 0; - } else { - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - } - V6_SET_ZERO(tcp->tcp_ip_src_v6); - bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; tcp_bind_hash_remove(tcp); tcp->tcp_state = TCPS_IDLE; - tcp->tcp_mdt = B_FALSE; - connp = tcp->tcp_connp; - connp->conn_mdt_ok = B_FALSE; - ipcl_hash_remove(connp); + ip_unbind(connp); bzero(&connp->conn_ports, sizeof (connp->conn_ports)); - return (error); + return (0); } /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. 
*/ static void tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) { - int error = tcp_do_unbind(tcp->tcp_connp); + conn_t *connp = tcp->tcp_connp; + int error; + error = tcp_do_unbind(connp); if (error > 0) { tcp_err_ack(tcp, mp, TSYSERR, error); } else if (error < 0) { tcp_err_ack(tcp, mp, -error, 0); } else { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); mp = mi_tpi_ok_ack_alloc(mp); - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -16764,7 +13795,7 @@ retry: } } if (is_system_labeled() && - (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, + (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, IPPROTO_TCP, B_TRUE)) != 0) { port = i; goto retry; @@ -16796,7 +13827,7 @@ retry: restart = B_TRUE; } if (is_system_labeled() && - (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), + (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { next_priv_port = nextport; goto retry; @@ -16820,11 +13851,10 @@ struct { */ /* ARGSUSED */ static void -tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) +tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; ASSERT(DB_TYPE(mp) != M_IOCTL); /* @@ -16851,7 +13881,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) tcp_wput_flush(tcp, mp); break; default: - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(connp->conn_wq, mp); break; } } @@ -16862,7 +13892,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ void -tcp_output(void *arg, mblk_t *mp, void *arg2) +tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; int hdrlen; @@ -16870,7 +13900,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mblk_t *mp1; uchar_t *rptr; uint32_t snxt; - tcph_t *tcph; + tcpha_t *tcpha; struct datab *db; 
uint32_t suna; uint32_t mss; @@ -16882,7 +13912,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; uint32_t msize; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + ip_xmit_attr_t *ixa; /* * Try and ASSERT the minimum possible references on the @@ -16903,25 +13933,18 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_squeue_bytes -= msize; mutex_exit(&tcp->tcp_non_sq_lock); - /* Check to see if this connection wants to be re-fused. */ - if (tcp->tcp_refuse) { - if (tcp->tcp_ipversion == IPV4_VERSION && - !ipst->ips_ip4_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ipha, - &tcp->tcp_saved_tcph); - } else if (tcp->tcp_ipversion == IPV6_VERSION && - !ipst->ips_ip6_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ip6h, - &tcp->tcp_saved_tcph); - } - } /* Bypass tcp protocol for fused tcp loopback */ if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) return; mss = tcp->tcp_mss; - if (tcp->tcp_xmit_zc_clean) - mp = tcp_zcopy_backoff(tcp, mp, 0); + /* + * If ZEROCOPY has turned off, try not to send any zero-copy message + * down. Do backoff, now. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) + mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); len = (int)(mp->b_wptr - mp->b_rptr); @@ -16977,8 +14000,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) * start again to get back the connection's "self-clock" as * described in VJ's paper. * - * Refer to the comment in tcp_mss_set() for the calculation - * of tcp_cwnd after idle. + * Reinitialize tcp_cwnd after idle. 
*/ if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { @@ -16999,7 +14021,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -17046,43 +14068,43 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ - tcph = tcp->tcp_tcph; - tcph->th_flags[0] = (TH_ACK|TH_PUSH); + tcpha = tcp->tcp_tcpha; + tcpha->tha_flags = (TH_ACK|TH_PUSH); - sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; + sum = len + connp->conn_ht_ulp_len + connp->conn_sum; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); + tcpha->tha_sum = htons(sum); - U32_TO_ABE32(snxt, tcph->th_seq); + tcpha->tha_seq = htonl(snxt); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); BUMP_LOCAL(tcp->tcp_obsegs); /* Update the latest receive window size in TCP header. 
*/ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_last_sent_len = (ushort_t)len; - plen = len + tcp->tcp_hdr_len; + plen = len + connp->conn_ht_iphc_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + ixa = connp->conn_ixa; + ixa->ixa_pktlen = plen; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(plen); } else { - tcp->tcp_ip6h->ip6_plen = htons(plen - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); } /* see if we need to allocate a mblk for the headers */ - hdrlen = tcp->tcp_hdr_len; + hdrlen = connp->conn_ht_iphc_len; rptr = mp1->b_rptr - hdrlen; db = mp1->b_datap; if ((db->db_ref != 2) || rptr < db->db_base || (!OK_32PTR(rptr))) { /* NOTE: we assume allocb returns an OK_32PTR */ - mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp) { freemsg(mp1); goto no_memory; @@ -17090,7 +14112,6 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_cont = mp1; mp1 = mp; /* Leave room for Link Level header */ - /* hdrlen = tcp->tcp_hdr_len; */ rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; mp1->b_wptr = &rptr[hdrlen]; } @@ -17099,16 +14120,16 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) /* Fill in the timestamp option. 
*/ if (tcp->tcp_snd_ts_ok) { U32_TO_BE32((uint32_t)lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* copy header into outgoing packet */ dst = (ipaddr_t *)rptr; - src = (ipaddr_t *)tcp->tcp_iphc; + src = (ipaddr_t *)connp->conn_ht_iphc; dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; @@ -17135,21 +14156,22 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_ecn_ok) { SET_ECT(tcp, rptr); - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcph->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); return; /* @@ -17166,29 +14188,27 @@ slow: tcp_wput_data(tcp, NULL, B_FALSE); } +/* + * This runs at the tail end of accept processing on the squeue of the + * new connection. 
+ */ /* ARGSUSED */ void -tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) +tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; - struct tcp_options *tcpopt; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; - /* socket options */ - uint_t sopp_flags; - ssize_t sopp_rxhiwat; - ssize_t sopp_maxblk; - ushort_t sopp_wroff; - ushort_t sopp_tail; - ushort_t sopp_copyopt; + struct sock_proto_props sopp; - tcpopt = (struct tcp_options *)mp->b_rptr; + /* We should just receive a single mblk that fits a T_discon_ind */ + ASSERT(mp->b_cont == NULL); /* * Drop the eager's ref on the listener, that was placed when - * this eager began life in tcp_conn_request. + * this eager began life in tcp_input_listener. */ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); if (IPCL_IS_NONSTR(connp)) { @@ -17227,15 +14247,12 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * memory allocation failure problems. We know * that the size of the incoming mblk i.e. * stroptions is greater than sizeof - * T_discon_ind. So the reallocb below can't - * fail. + * T_discon_ind. 
*/ - freemsg(mp->b_cont); - mp->b_cont = NULL; ASSERT(DB_REF(mp) == 1); - mp = reallocb(mp, sizeof (struct T_discon_ind), - B_FALSE); - ASSERT(mp != NULL); + ASSERT(MBLKSIZE(mp) >= + sizeof (struct T_discon_ind)); + DB_TYPE(mp) = M_PROTO; ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; @@ -17251,41 +14268,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); putnext(q, mp); - return; } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; return; } - if (tcpopt->to_flags & TCPOPT_BOUNDIF) { - int boundif = tcpopt->to_boundif; - uint_t len = sizeof (int); - - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len, - (uchar_t *)&boundif, NULL, tcp->tcp_cred, NULL); - } - if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) { - uint_t on = 1; - uint_t len = sizeof (uint_t); - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len, - (uchar_t *)&on, NULL, tcp->tcp_cred, NULL); - } - /* - * Set max window size (tcp_recv_hiwater) of the acceptor. + * Set max window size (conn_rcvbuf) of the acceptor. */ if (tcp->tcp_rcv_list == NULL) { /* * Recv queue is empty, tcp_rwnd should not have changed. * That means it should be equal to the listener's tcp_rwnd. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; + connp->conn_rcvbuf = tcp->tcp_rwnd; } else { #ifdef DEBUG mblk_t *tmp; @@ -17300,19 +14297,19 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); #endif /* There is some data, add them back to get the max. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; + connp->conn_rcvbuf = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; } /* * This is the first time we run on the correct * queue after tcp_accept. So fix all the q parameters * here. 
*/ - sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; - sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; + sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - sopp_rxhiwat = tcp->tcp_fused ? - tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : - tcp->tcp_recv_hiwater; + sopp.sopp_rxhiwat = tcp->tcp_fused ? + tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : + connp->conn_rcvbuf; /* * Determine what write offset value to use depending on SACK and @@ -17328,18 +14325,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * since it would reduce the amount of work done by kmem. * Non-fused tcp loopback case is handled separately below. */ - sopp_wroff = 0; + sopp.sopp_wroff = 0; /* * Update the peer's transmit parameters according to * our recently calculated high water mark value. */ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { - sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + + sopp.sopp_wroff = connp->conn_ht_iphc_allocated + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } else { - sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra); + sopp.sopp_wroff = connp->conn_ht_iphc_len + + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } /* @@ -17354,30 +14351,22 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * costs. 
*/ if (tcp->tcp_kssl_ctx != NULL) { - sopp_wroff += SSL3_WROFFSET; + sopp.sopp_wroff += SSL3_WROFFSET; - sopp_flags |= SOCKOPT_TAIL; - sopp_tail = SSL3_MAX_TAIL_LEN; + sopp.sopp_flags |= SOCKOPT_TAIL; + sopp.sopp_tail = SSL3_MAX_TAIL_LEN; - sopp_flags |= SOCKOPT_ZCOPY; - sopp_copyopt = ZCVMUNSAFE; + sopp.sopp_flags |= SOCKOPT_ZCOPY; + sopp.sopp_zcopyflag = ZCVMUNSAFE; - sopp_maxblk = SSL3_MAX_RECORD_LEN; + sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN; } /* Send the options up */ if (IPCL_IS_NONSTR(connp)) { - struct sock_proto_props sopp; - - sopp.sopp_flags = sopp_flags; - sopp.sopp_wroff = sopp_wroff; - sopp.sopp_maxblk = sopp_maxblk; - sopp.sopp_rxhiwat = sopp_rxhiwat; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); - ASSERT(sopp_flags & SOCKOPT_ZCOPY); - sopp.sopp_tail = sopp_tail; - sopp.sopp_zcopyflag = sopp_copyopt; + ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY); } if (tcp->tcp_loopback) { sopp.sopp_flags |= SOCKOPT_LOOPBACK; @@ -17385,34 +14374,40 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } (*connp->conn_upcalls->su_set_proto_props) (connp->conn_upper_handle, &sopp); + freemsg(mp); } else { + /* + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. 
We know + * that the size of the incoming mblk is at least + * stroptions + */ struct stroptions *stropt; - mblk_t *stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI); - if (stropt_mp == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - DB_TYPE(stropt_mp) = M_SETOPTS; - stropt = (struct stroptions *)stropt_mp->b_rptr; - stropt_mp->b_wptr += sizeof (struct stroptions); + + ASSERT(DB_REF(mp) == 1); + ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); + + DB_TYPE(mp) = M_SETOPTS; + stropt = (struct stroptions *)mp->b_rptr; + mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); + stropt = (struct stroptions *)mp->b_rptr; stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_hiwat = sopp_rxhiwat; - stropt->so_wroff = sopp_wroff; - stropt->so_maxblk = sopp_maxblk; + stropt->so_hiwat = sopp.sopp_rxhiwat; + stropt->so_wroff = sopp.sopp_wroff; + stropt->so_maxblk = sopp.sopp_maxblk; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); stropt->so_flags |= SO_TAIL | SO_COPYOPT; - stropt->so_tail = sopp_tail; - stropt->so_copyopt = sopp_copyopt; + stropt->so_tail = sopp.sopp_tail; + stropt->so_copyopt = sopp.sopp_zcopyflag; } /* Send the options up */ - putnext(q, stropt_mp); + putnext(q, mp); } - freemsg(mp); /* * Pass up any data and/or a fin that has been received. 
* @@ -17432,7 +14427,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) (connp->conn_upper_handle, NULL, 0, 0, &error, &push) >= 0) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17463,7 +14458,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* We drain directly in case of fused tcp loopback */ if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17508,12 +14503,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) putnext(q, mp); } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); @@ -17535,14 +14527,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* * The function called through squeue to get behind listener's perimeter to - * send a deffered conn_ind. + * send a deferred conn_ind. */ /* ARGSUSED */ void -tcp_send_pending(void *arg, mblk_t *mp, void *arg2) +tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *listener = connp->conn_tcp; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; struct T_conn_ind *conn_ind; tcp_t *tcp; @@ -17560,29 +14552,34 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) return; } - tcp_ulp_newconn(connp, tcp->tcp_connp, mp); + tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); } -/* ARGSUSED */ +/* + * Common to TPI and sockfs accept code. 
+ */ +/* ARGSUSED2 */ static int tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) { tcp_t *listener, *eager; - mblk_t *opt_mp; - struct tcp_options *tcpopt; + mblk_t *discon_mp; listener = lconnp->conn_tcp; ASSERT(listener->tcp_state == TCPS_LISTEN); eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - ASSERT(eager->tcp_rq != NULL); + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); - opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI); - if (opt_mp == NULL) { + if (discon_mp == NULL) { return (-TPROTO); } - bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options)); eager->tcp_issocket = B_TRUE; econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; @@ -17607,24 +14604,6 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) */ ASSERT(econnp->conn_ref >= 3); - opt_mp->b_datap->db_type = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } - mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -17686,7 +14665,7 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) /* Need to get inside the listener perimeter */ CONN_INC_REF(listener->tcp_connp); SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, - tcp_send_pending, listener->tcp_connp, SQ_FILL, + tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); } no_more_eagers: @@ -17700,8 +14679,8 @@ no_more_eagers: * before sending the conn_ind in tcp_send_conn_ind. * The ref will be dropped in tcp_accept_finish(). */ - SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish, - econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); + SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, + econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); return (0); } @@ -17712,7 +14691,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, { conn_t *lconnp, *econnp; tcp_t *listener, *eager; - tcp_stack_t *tcps; lconnp = (conn_t *)lproto_handle; listener = lconnp->conn_tcp; @@ -17720,7 +14698,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp = (conn_t *)eproto_handle; eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - tcps = eager->tcp_tcps; /* * It is OK to manipulate these fields outside the eager's squeue @@ -17732,19 +14709,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp->conn_upper_handle = sock_handle; econnp->conn_upcalls = lconnp->conn_upcalls; ASSERT(IPCL_IS_NONSTR(econnp)); - /* - * Create helper stream if it is a non-TPI TCP connection. 
- */ - if (ip_create_helper_stream(econnp, tcps->tcps_ldi_ident)) { - ip1dbg(("tcp_accept: create of IP helper stream" - " failed\n")); - return (EPROTO); - } - eager->tcp_rq = econnp->conn_rq; - eager->tcp_wq = econnp->conn_wq; - - ASSERT(eager->tcp_rq != NULL); - return (tcp_accept_common(lconnp, econnp, cr)); } @@ -17752,7 +14716,7 @@ tcp_accept(sock_lower_handle_t lproto_handle, /* * This is the STREAMS entry point for T_CONN_RES coming down on * Acceptor STREAM when sockfs listener does accept processing. - * Read the block comment on top of tcp_conn_request(). + * Read the block comment on top of tcp_input_listener(). */ void tcp_tpi_accept(queue_t *q, mblk_t *mp) @@ -17815,8 +14779,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) econnp = eager->tcp_connp; econnp->conn_dev = (dev_t)RD(q)->q_ptr; econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); - eager->tcp_rq = rq; - eager->tcp_wq = q; + econnp->conn_rq = rq; + econnp->conn_wq = q; rq->q_ptr = econnp; rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ q->q_ptr = econnp; @@ -17836,7 +14800,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) * should already be enough space in the mp that came * down from soaccept(). 
*/ - if (eager->tcp_family == AF_INET) { + if (econnp->conn_family == AF_INET) { sin_t *sin; ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= @@ -17844,8 +14808,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin = (sin_t *)mp->b_wptr; mp->b_wptr += sizeof (sin_t); sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; } else { sin6_t *sin6; @@ -17854,20 +14818,23 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin6 = (sin6_t *)mp->b_wptr; mp->b_wptr += sizeof (sin6_t); sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + if (econnp->conn_ipversion == IPV4_VERSION) { sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); } else { ASSERT(eager->tcp_ip6h != NULL); sin6->sin6_flowinfo = eager->tcp_ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = eager->tcp_ip6h->ip6_src; } - sin6->sin6_scope_id = 0; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } sin6->__sin6_src_id = 0; } @@ -17881,97 +14848,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) } } -static int -tcp_do_getsockname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - if (tcp->tcp_state >= TCPS_BOUND) { - sin->sin_port = tcp->tcp_lport; - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - } - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < 
sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (tcp->tcp_state >= TCPS_BOUND) { - sin6->sin6_port = tcp->tcp_lport; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - } - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - -static int -tcp_do_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - if (tcp->tcp_state < TCPS_SYN_RCVD) - return (ENOTCONN); - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = tcp->tcp_fport; - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, - sin->sin_addr.s_addr); - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = tcp->tcp_fport; - sin6->sin6_addr = tcp->tcp_remote_v6; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV6_VERSION) { - sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - /* * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
*/ @@ -17980,7 +14856,8 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) { void *data; mblk_t *datamp = mp->b_cont; - tcp_t *tcp = Q_TO_TCP(q); + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp = connp->conn_tcp; cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { @@ -17993,10 +14870,14 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = tcp_do_getpeername(tcp, data, &cmdp->cb_len); + if (tcp->tcp_state < TCPS_SYN_RCVD) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); break; case TI_GETMYNAME: - cmdp->cb_error = tcp_do_getsockname(tcp, data, &cmdp->cb_len); + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); break; default: cmdp->cb_error = EINVAL; @@ -18029,14 +14910,14 @@ tcp_wput(queue_t *q, mblk_t *mp) mutex_enter(&tcp->tcp_non_sq_lock); tcp->tcp_squeue_bytes += size; - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - tcp_squeue_flag, SQTAG_TCP_OUTPUT); + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); return; case M_CMD: @@ -18053,7 +14934,7 @@ tcp_wput(queue_t *q, mblk_t *mp) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); @@ -18093,7 +14974,7 @@ tcp_wput(queue_t *q, mblk_t *mp) /* * Most ioctls can be processed right away without going via * squeues - process them right here. Those that do require - * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) + * squeue (currently _SIOCSOCKFALLBACK) * are processed by tcp_wput_ioctl(). 
*/ iocp = (struct iocblk *)mp->b_rptr; @@ -18111,26 +14992,13 @@ tcp_wput(queue_t *q, mblk_t *mp) case ND_SET: /* nd_getset does the necessary checks */ case ND_GET: - if (!nd_getset(q, tcps->tcps_g_nd, mp)) { - CALL_IP_WPUT(connp, q, mp); - return; - } - qreply(q, mp); - return; - case TCP_IOC_DEFAULT_Q: - /* - * Wants to be the default wq. Check the credentials - * first, the rest is executed via squeue. - */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; + if (nd_getset(q, tcps->tcps_g_nd, mp)) { qreply(q, mp); return; } - output_proc = tcp_wput_ioctl; - break; + ip_wput_nondata(q, mp); + return; + default: output_proc = tcp_wput_ioctl; break; @@ -18143,7 +15011,7 @@ tcp_wput(queue_t *q, mblk_t *mp) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, - tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); + NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); } /* @@ -18188,52 +15056,32 @@ tcp_wput_fallback(queue_t *wq, mblk_t *mp) freemsg(mp); } +/* + * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. + */ static boolean_t tcp_zcopy_check(tcp_t *tcp) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; boolean_t zc_enabled = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; if (do_tcpzcopy == 2) zc_enabled = B_TRUE; - else if (tcp->tcp_ipversion == IPV4_VERSION && - IPCL_IS_CONNECTED(connp) && - (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && - connp->conn_dontroute == 0 && - !connp->conn_nexthop_set && - connp->conn_outgoing_ill == NULL && - do_tcpzcopy == 1) { - /* - * the checks above closely resemble the fast path checks - * in tcp_send_data(). 
- */ - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - if (ire->ire_stq != NULL) { - ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; - - zc_enabled = ill && (ill->ill_capabilities & - ILL_CAPAB_ZEROCOPY) && - (ill->ill_zerocopy_capab-> - ill_zerocopy_flags != 0); - } - IRE_REFRELE(ire); - } - mutex_exit(&connp->conn_lock); - } + else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) + zc_enabled = B_TRUE; + tcp->tcp_snd_zcopy_on = zc_enabled; if (!TCP_IS_DETACHED(tcp)) { if (zc_enabled) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMSAFE); TCP_STAT(tcps, tcp_zcopy_on); } else { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMUNSAFE); TCP_STAT(tcps, tcp_zcopy_off); } @@ -18241,99 +15089,84 @@ tcp_zcopy_check(tcp_t *tcp) return (zc_enabled); } -static mblk_t * -tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (do_tcpzcopy == 2) - return (bp); - else if (tcp->tcp_snd_zcopy_on) { - tcp->tcp_snd_zcopy_on = B_FALSE; - if (!TCP_IS_DETACHED(tcp)) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp, - ZCVMUNSAFE); - TCP_STAT(tcps, tcp_zcopy_disable); - } - } - return (tcp_zcopy_backoff(tcp, bp, 0)); -} - /* - * Backoff from a zero-copy mblk by copying data to a new mblk and freeing - * the original desballoca'ed segmapped mblk. + * Backoff from a zero-copy message by copying data to a new allocated + * message and freeing the original desballoca'ed segmapped message. + * + * This function is called by following two callers: + * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free + * the origial desballoca'ed message and notify sockfs. 
This is in re- + * transmit state. + * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need + * to be copied to new message. */ static mblk_t * -tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) +tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) { - mblk_t *head, *tail, *nbp; + mblk_t *nbp; + mblk_t *head = NULL; + mblk_t *tail = NULL; tcp_stack_t *tcps = tcp->tcp_tcps; - if (IS_VMLOANED_MBLK(bp)) { - TCP_STAT(tcps, tcp_zcopy_backoff); - if ((head = copyb(bp)) == NULL) { - /* fail to backoff; leave it for the next backoff */ - tcp->tcp_xmit_zc_clean = B_FALSE; - return (bp); - } - if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { - if (fix_xmitlist) - tcp_zcopy_notify(tcp); - else - head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; - } - nbp = bp->b_cont; - if (fix_xmitlist) { - head->b_prev = bp->b_prev; - head->b_next = bp->b_next; - if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = head; - } - bp->b_next = NULL; - bp->b_prev = NULL; - freeb(bp); - } else { - head = bp; - nbp = bp->b_cont; - } - tail = head; - while (nbp) { - if (IS_VMLOANED_MBLK(nbp)) { + ASSERT(bp != NULL); + while (bp != NULL) { + if (IS_VMLOANED_MBLK(bp)) { TCP_STAT(tcps, tcp_zcopy_backoff); - if ((tail->b_cont = copyb(nbp)) == NULL) { + if ((nbp = copyb(bp)) == NULL) { tcp->tcp_xmit_zc_clean = B_FALSE; - tail->b_cont = nbp; - return (head); + if (tail != NULL) + tail->b_cont = bp; + return ((head == NULL) ? bp : head); } - tail = tail->b_cont; - if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { + + if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { if (fix_xmitlist) tcp_zcopy_notify(tcp); else - tail->b_datap->db_struioflag |= + nbp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; } - bp = nbp; - nbp = nbp->b_cont; + nbp->b_cont = bp->b_cont; + + /* + * Copy saved information and adjust tcp_xmit_tail + * if needed. 
+ */ if (fix_xmitlist) { - tail->b_prev = bp->b_prev; - tail->b_next = bp->b_next; + nbp->b_prev = bp->b_prev; + nbp->b_next = bp->b_next; + if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = tail; + tcp->tcp_xmit_tail = nbp; } - bp->b_next = NULL; + + /* Free the original message. */ bp->b_prev = NULL; + bp->b_next = NULL; freeb(bp); + + bp = nbp; + } + + if (head == NULL) { + head = bp; + } + if (tail == NULL) { + tail = bp; } else { - tail->b_cont = nbp; - tail = nbp; - nbp = nbp->b_cont; + tail->b_cont = bp; + tail = bp; } + + /* Move forward. */ + bp = bp->b_cont; } + if (fix_xmitlist) { tcp->tcp_xmit_last = tail; tcp->tcp_xmit_zc_clean = B_TRUE; } + return (head); } @@ -18341,7 +15174,7 @@ static void tcp_zcopy_notify(tcp_t *tcp) { struct stdata *stp; - conn_t *connp; + conn_t *connp; if (tcp->tcp_detached) return; @@ -18351,323 +15184,149 @@ tcp_zcopy_notify(tcp_t *tcp) (connp->conn_upper_handle); return; } - stp = STREAM(tcp->tcp_rq); + stp = STREAM(connp->conn_rq); mutex_enter(&stp->sd_lock); stp->sd_flag |= STZCNOTIFY; cv_broadcast(&stp->sd_zcopy_wait); mutex_exit(&stp->sd_lock); } -static boolean_t -tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) +/* + * Update the TCP connection according to change of LSO capability. 
+ */ +static void +tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - - if ((ire != NULL) && - (((dst != NULL) && (ire->ire_addr == *dst)) || ((dst == NULL) && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &tcp->tcp_ip6h->ip6_dst))) && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - } else { - boolean_t cached = B_FALSE; - ts_label_t *tsl; - - /* force a recheck later on */ - tcp->tcp_ire_ill_check_done = B_FALSE; - - TCP_DBGSTAT(tcps, tcp_ire_null1); - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - - if (ire != NULL) - IRE_REFRELE_NOTR(ire); - - tsl = crgetlabel(CONN_CRED(connp)); - ire = (dst ? - ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) : - ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, - connp->conn_zoneid, tsl, ipst)); + /* + * We check against IPv4 header length to preserve the old behavior + * of only enabling LSO when there are no IP options. + * But this restriction might not be necessary at all. Before removing + * it, need to verify how LSO is handled for source routing case, with + * which IP does software checksum. + * + * For IPv6, whenever any extension header is needed, LSO is supressed. + */ + if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? + IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) + return; - if (ire == NULL) { - TCP_STAT(tcps, tcp_ire_null); - return (B_FALSE); - } + /* + * Either the LSO capability newly became usable, or it has changed. 
+ */ + if (ixa->ixa_flags & IXAF_LSO_CAPAB) { + ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; - IRE_REFHOLD_NOTR(ire); + ASSERT(lsoc->ill_lso_max > 0); + tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - TCP_CHECK_IREINFO(tcp, ire); - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); /* - * We can continue to use the ire but since it was - * not cached, we should drop the extra reference. + * If LSO to be enabled, notify the STREAM header with larger + * data block. */ - if (!cached) - IRE_REFRELE_NOTR(ire); + if (!tcp->tcp_lso) + tcp->tcp_maxpsz_multiplier = 0; + + tcp->tcp_lso = B_TRUE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); + } else { /* LSO capability is not usable any more. */ + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); /* - * Rampart note: no need to select a new label here, since - * labels are not allowed to change during the life of a TCP - * connection. + * If LSO to be disabled, notify the STREAM header with smaller + * data block. And need to restore fragsize to PMTU. */ + if (tcp->tcp_lso) { + tcp->tcp_maxpsz_multiplier = + tcp->tcp_tcps->tcps_maxpsz_multiplier; + ixa->ixa_fragsize = ixa->ixa_pmtu; + tcp->tcp_lso = B_FALSE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); + } } - *irep = ire; - - return (B_TRUE); + (void) tcp_maxpsz_set(tcp, B_TRUE); } /* - * Called from tcp_send() or tcp_send_data() to find workable IRE. - * - * 0 = success; - * 1 = failed to find ire and ill. + * Update the TCP connection according to change of ZEROCOPY capability. 
*/ -static boolean_t -tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) +static void +tcp_update_zcopy(tcp_t *tcp) { - ipha_t *ipha; - ipaddr_t dst; - ire_t *ire; - ill_t *ill; - mblk_t *ire_fp_mp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - if (mp != NULL) - ipha = (ipha_t *)mp->b_rptr; - else - ipha = tcp->tcp_ipha; - dst = ipha->ipha_dst; - - if (!tcp_send_find_ire(tcp, &dst, &ire)) - return (B_FALSE); - - if ((ire->ire_flags & RTF_MULTIRT) || - (ire->ire_stq == NULL) || - (ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || - ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) || - MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) { - TCP_STAT(tcps, tcp_ip_ire_send); - IRE_REFRELE(ire); - return (B_FALSE); + if (tcp->tcp_snd_zcopy_on) { + tcp->tcp_snd_zcopy_on = B_FALSE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMUNSAFE); + TCP_STAT(tcps, tcp_zcopy_off); + } + } else { + tcp->tcp_snd_zcopy_on = B_TRUE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMSAFE); + TCP_STAT(tcps, tcp_zcopy_on); + } } +} - ill = ire_to_ill(ire); - ASSERT(ill != NULL); +/* + * Notify function registered with ip_xmit_attr_t. It's called in the squeue + * so it's safe to update the TCP connection. 
+ */ +/* ARGSUSED1 */ +static void +tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + tcp_t *tcp = (tcp_t *)arg; + conn_t *connp = tcp->tcp_connp; - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; + switch (ntype) { + case IXAN_LSO: + tcp_update_lso(tcp, connp->conn_ixa); + break; + case IXAN_PMTU: + tcp_update_pmtu(tcp, B_FALSE); + break; + case IXAN_ZCOPY: + tcp_update_zcopy(tcp); + break; + default: + break; } - - *irep = ire; - *illp = ill; - - return (B_TRUE); } static void -tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) +tcp_send_data(tcp_t *tcp, mblk_t *mp) { - ipha_t *ipha; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - ire_t *ire; - uint16_t *up; - ill_t *ill; conn_t *connp = tcp->tcp_connp; - uint32_t hcksum_txflags = 0; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - cred_t *cr; - pid_t cpid; - - ASSERT(DB_TYPE(mp) == M_DATA); /* - * Here we need to handle the overloading of the cred_t for - * both getpeerucred and TX. - * If this is a SYN then the caller already set db_credp so - * that getpeerucred will work. But if TX is in use we might have - * a conn_effective_cred which is different, and we need to use that - * cred to make TX use the correct label and label dependent route. + * Check here to avoid sending zero-copy message down to IP when + * ZEROCOPY capability has turned off. We only need to deal with + * the race condition between sockfs and the notification here. + * Since we have tried to backoff the tcp_xmit_head when turning + * zero-copy off and new messages in tcp_output(), we simply drop + * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean + * is not true. 
*/ - if (is_system_labeled()) { - cr = msg_getcred(mp, &cpid); - if (cr == NULL || connp->conn_effective_cred != NULL) - mblk_setcred(mp, CONN_CRED(connp), cpid); - } - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - ASSERT(q != NULL); - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - /* - * Drop off fast path for IPv6 and also if options are present or - * we need to resolve a TS label. - */ - if (tcp->tcp_ipversion != IPV4_VERSION || - !IPCL_IS_CONNECTED(connp) || - !CONN_IS_LSO_MD_FASTPATH(connp) || - (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || - !connp->conn_ulp_labeled || - ipha->ipha_ident == IP_HDR_INCLUDED || - ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_disable(tcp, mp); - TCP_STAT(tcps, tcp_ip_send); - CALL_IP_WPUT(connp, q, mp); - return; - } - - if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_backoff(tcp, mp, 0); - CALL_IP_WPUT(connp, q, mp); + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && + !tcp->tcp_xmit_zc_clean) { + ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); + freemsg(mp); return; } - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - /* - * Check to see if we need to re-enable LSO/MDT for this connection - * because it was previously disabled due to changes in the ill; - * note that by doing it here, this re-enabling only applies when - * the packet is not dispatched through CALL_IP_WPUT(). - * - * That means for IPv4, it is worth re-enabling LSO/MDT for the fastpath - * case, since that's how we ended up here. 
For IPv6, we do the - * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. - */ - if (connp->conn_lso_ok && !tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Restore LSO for this connection, so that next time around - * it is eligible to go through tcp_lsosend() path again. - */ - TCP_STAT(tcps, tcp_lso_enabled); - tcp->tcp_lso = B_TRUE; - ip1dbg(("tcp_send_data: reenabling LSO for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } else if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { - /* - * Restore MDT for this connection, so that next time around - * it is eligible to go through tcp_multisend() path again. - */ - TCP_STAT(tcps, tcp_mdt_conn_resumed1); - tcp->tcp_mdt = B_TRUE; - ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } - - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - /* - * we shouldn't need to reset ipha as the mp containing - * ipha should never be a zero-copy mp. - */ - } - - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* pseudo-header checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* Software checksum? 
*/ - if (DB_CKSUMFLAGS(mp) == 0) { - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); - } - - /* Calculate IP header checksum if hardware isn't capable */ - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. 
- */ - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, connp, mp, 0, NULL); - } - - IRE_REFRELE(ire); + ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); + (void) conn_ip_output(mp, connp->conn_ixa); } /* @@ -18731,15 +15390,13 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) int tcpstate; int usable = 0; mblk_t *xmit_tail; - queue_t *q = tcp->tcp_wq; int32_t mss; int32_t num_sack_blk = 0; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; - int mdt_thres; int rc; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *connp = tcp->tcp_connp; tcpstate = tcp->tcp_state; if (mp == NULL) { @@ -18771,7 +15428,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); #else - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_wput_data: data after ordrel, %s\n", @@ -18781,12 +15438,12 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) #endif /* DEBUG */ } if (tcp->tcp_snd_zcopy_aware && - (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) + (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) tcp_zcopy_notify(tcp); freemsg(mp); mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -18886,12 +15543,12 @@ data_null: opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; mss = tcp->tcp_mss - opt_len; - tcp_hdr_len = tcp->tcp_hdr_len + opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; + total_hdr_len = connp->conn_ht_iphc_len + opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; } else { mss = tcp->tcp_mss; - tcp_hdr_len = 
tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && @@ -18913,7 +15570,7 @@ data_null: * In the special case when cwnd is zero, which can only * happen if the connection is ECN capable, return now. * New segments is sent using tcp_timer(). The timer - * is set in tcp_rput_data(). + * is set in tcp_input_data(). */ if (tcp->tcp_cwnd == 0) { /* @@ -19023,66 +15680,12 @@ data_null: } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - /* - * Determine if it's worthwhile to attempt LSO or MDT, based on: - * - * 1. Simple TCP/IP{v4,v6} (no options). - * 2. IPSEC/IPQoS processing is not needed for the TCP connection. - * 3. If the TCP connection is in ESTABLISHED state. - * 4. The TCP is not detached. - * - * If any of the above conditions have changed during the - * connection, stop using LSO/MDT and restore the stream head - * parameters accordingly. 
- */ - ipst = tcps->tcps_netstack->netstack_ip; - - if ((tcp->tcp_lso || tcp->tcp_mdt) && - ((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || - tcp->tcp_state != TCPS_ESTABLISHED || - TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) || - CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || - IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { - if (tcp->tcp_lso) { - tcp->tcp_connp->conn_lso_ok = B_FALSE; - tcp->tcp_lso = B_FALSE; - } else { - tcp->tcp_connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - } - - /* Anything other than detached is considered pathological */ - if (!TCP_IS_DETACHED(tcp)) { - if (tcp->tcp_lso) - TCP_STAT(tcps, tcp_lso_disabled); - else - TCP_STAT(tcps, tcp_mdt_conn_halted1); - (void) tcp_maxpsz_set(tcp, B_TRUE); - } - } - - /* Use MDT if sendable amount is greater than the threshold */ - if (tcp->tcp_mdt && - (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && - (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && - MBLKL(xmit_tail->b_cont) > mdt_thres)) && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID)) { - ASSERT(tcp->tcp_connp->conn_mdt_ok); - rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, mdt_thres); - } else { - rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, INT_MAX); - } + /* Send the packet. 
*/ + rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, + num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, + local_time); /* Pretend that all we were trying to send really got sent */ if (rc < 0 && tail_unsent < 0) { @@ -19131,39 +15734,41 @@ done:; tcp->tcp_unsent += len; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { - if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } - } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { - tcp_setqfull(tcp); + } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { + if (!(tcp->tcp_detached)) + tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); } /* - * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the - * outgoing TCP header with the template header, as well as other - * options such as time-stamp, ECN and/or SACK. + * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header + * with the template header, as well as other options such as time-stamp, + * ECN and/or SACK. 
*/ static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) { - tcph_t *tcp_tmpl, *tcp_h; + tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; int hdrlen; + conn_t *connp = tcp->tcp_connp; ASSERT(OK_32PTR(rptr)); /* Template header */ - tcp_tmpl = tcp->tcp_tcph; + tcp_tmpl = tcp->tcp_tcpha; /* Header of outgoing packet */ - tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); /* dst and src are opaque 32-bit fields, used for copying */ dst = (uint32_t *)rptr; - src = (uint32_t *)tcp->tcp_iphc; - hdrlen = tcp->tcp_hdr_len; + src = (uint32_t *)connp->conn_ht_iphc; + hdrlen = connp->conn_ht_iphc_len; /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { @@ -19172,7 +15777,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* @@ -19208,16 +15813,16 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) SET_ECT(tcp, rptr); if (tcp->tcp_ecn_echo_on) - tcp_h->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcp_h->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = rptr + tcp->tcp_hdr_len; + uchar_t *wptr = rptr + connp->conn_ht_iphc_len; sack_blk_t *tmp; int32_t i; @@ -19235,1536 +15840,62 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcp_h->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } } /* - * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach - * the destination address and SAP attribute, and if necessary, the 
- * hardware checksum offload attribute to a Multidata message. - */ -static int -tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, - const uint32_t start, const uint32_t stuff, const uint32_t end, - const uint32_t flags, tcp_stack_t *tcps) -{ - /* Add global destination address & SAP attribute */ - if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global physical " - "destination address+SAP\n")); - - if (dlmp != NULL) - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - /* Add global hwcksum attribute */ - if (hwcksum && - !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " - "checksum attribute\n")); - - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - return (0); -} - -/* - * Smaller and private version of pdescinfo_t used specifically for TCP, - * which allows for only two payload spans per packet. - */ -typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; - -/* - * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit - * scheme, and returns one the following: + * tcp_send() is called by tcp_wput_data() and returns one of the following: * * -1 = failed allocation. * 0 = success; burst count reached, or usable send window is too small, * and that we'd rather wait until later before sending again. 
*/ static int -tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; - multidata_t *mmd; - uint_t obsegs, obbytes, hdr_frag_sz; - uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; - int num_burst_seg, max_pld; - pdesc_t *pkt; - tcp_pdescinfo_t tcp_pkt_info; - pdescinfo_t *pkt_info; - int pbuf_idx, pbuf_idx_nxt; - int seg_len, len, spill, af; - boolean_t add_buffer, zcopy, clusterwide; - boolean_t rconfirm = B_FALSE; - boolean_t done = B_FALSE; - uint32_t cksum; - uint32_t hwcksum_flags; - ire_t *ire = NULL; - ill_t *ill; - ipha_t *ipha; - ip6_t *ip6h; - ipaddr_t src, dst; - ill_zerocopy_capab_t *zc_cap = NULL; - uint16_t *up; - int err; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - int usable_mmd, tail_unsent_mmd; - uint_t snxt_mmd, obsegs_mmd, obbytes_mmd; - mblk_t *xmit_tail_mmd; - netstackid_t stack_id; - -#ifdef _BIG_ENDIAN -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) -#else -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) -#endif - -#define PREP_NEW_MULTIDATA() { \ - mmd = NULL; \ - md_mp = md_hbuf = NULL; \ - cur_hdr_off = 0; \ - max_pld = tcp->tcp_mdt_max_pld; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - add_buffer = B_TRUE; \ - zcopy = B_FALSE; \ -} - -#define PREP_NEW_PBUF() { \ - md_pbuf = md_pbuf_nxt = NULL; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - cur_pld_off = 0; \ - first_snxt = *snxt; \ - ASSERT(*tail_unsent > 0); \ - base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ -} - - ASSERT(mdt_thres >= mss); - ASSERT(*usable > 0 && *usable > mdt_thres); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(!TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID); - 
ASSERT((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); - - connp = tcp->tcp_connp; - ASSERT(connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(connp)); - ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); - - stack_id = connp->conn_netstack->netstack_stackid; - - usable_mmd = tail_unsent_mmd = 0; - snxt_mmd = obsegs_mmd = obbytes_mmd = 0; - xmit_tail_mmd = NULL; - /* - * Note that tcp will only declare at most 2 payload spans per - * packet, which is much lower than the maximum allowable number - * of packet spans per Multidata. For this reason, we use the - * privately declared and smaller descriptor info structure, in - * order to save some stack space. - */ - pkt_info = (pdescinfo_t *)&tcp_pkt_info; - - af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; - if (af == AF_INET) { - dst = tcp->tcp_ipha->ipha_dst; - src = tcp->tcp_ipha->ipha_src; - ASSERT(!CLASSD(dst)); - } - ASSERT(af == AF_INET || - !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); - - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - md_mp_head = NULL; - PREP_NEW_MULTIDATA(); - - /* - * Before we go on further, make sure there is an IRE that we can - * use, and that the ILL supports MDT. Otherwise, there's no point - * in proceeding any further, and we should just hand everything - * off to the legacy path. - */ - if (!tcp_send_find_ire(tcp, (af == AF_INET) ? &dst : NULL, &ire)) - goto legacy_send_no_md; - - ASSERT(ire != NULL); - ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); - ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); - ASSERT(af == AF_INET || ire->ire_nce != NULL); - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - /* - * If we do support loopback for MDT (which requires modifications - * to the receiving paths), the following assertions should go away, - * and we would be sending the Multidata to loopback conn later on. 
- */ - ASSERT(!IRE_IS_LOCAL(ire)); - ASSERT(ire->ire_stq != NULL); - - ill = ire_to_ill(ire); - ASSERT(ill != NULL); - ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); - - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; - } - - /* - * If the underlying interface conditions have changed, or if the - * new interface does not support MDT, go back to legacy path. - */ - if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { - /* don't go through this path anymore for this connection */ - TCP_STAT(tcps, tcp_mdt_conn_halted2); - tcp->tcp_mdt = B_FALSE; - ip1dbg(("tcp_multisend: disabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) - zc_cap = ill->ill_zerocopy_capab; - - /* - * Check if we can take tcp fast-path. Note that "incomplete" - * ire's (where the link-layer for next hop is not resolved - * or where the fast-path header in nce_fp_mp is not available - * yet) are sent down the legacy (slow) path. - * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA - */ - if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* go to legacy path if interface doesn't support zerocopy */ - if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && - (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* does the interface support hardware checksum offload? 
*/ - hwcksum_flags = 0; - if (ILL_HCKSUM_CAPABLE(ill) && - (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | - HCKSUM_IPHDRCKSUM)) && dohwcksum) { - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_IPHDRCKSUM) - hwcksum_flags = HCK_IPV4_HDRCKSUM; - - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) - hwcksum_flags |= HCK_FULLCKSUM; - else if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_INET_PARTIAL) - hwcksum_flags |= HCK_PARTIALCKSUM; - } - - /* - * Each header fragment consists of the leading extra space, - * followed by the TCP/IP header, and the trailing extra space. - * We make sure that each header fragment begins on a 32-bit - * aligned memory address (tcp_mdt_hdr_head is already 32-bit - * aligned in tcp_mdt_update). - */ - hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + - tcp->tcp_mdt_hdr_tail), 4); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - - /* - * Here we create one or more Multidata messages, each made up of - * one header buffer and up to N payload buffers. This entire - * operation is done within two loops: - * - * The outer loop mostly deals with creating the Multidata message, - * as well as the header buffer that gets added to it. It also - * links the Multidata messages together such that all of them can - * be sent down to the lower layer in a single putnext call; this - * linking behavior depends on the tcp_mdt_chain tunable. - * - * The inner loop takes an existing Multidata message, and adds - * one or more (up to tcp_mdt_max_pld) payload buffers to it. 
It - * packetizes those buffers by filling up the corresponding header - * buffer fragments with the proper IP and TCP headers, and by - * describing the layout of each packet in the packet descriptors - * that get added to the Multidata. - */ - do { - /* - * If usable send window is too small, or data blocks in - * transmit list are smaller than our threshold (i.e. app - * performs large writes followed by small ones), we hand - * off the control over to the legacy path. Note that we'll - * get back the control once it encounters a large block. - */ - if (*usable < mss || (*tail_unsent <= mdt_thres && - (*xmit_tail)->b_cont != NULL && - MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, md_mp_head, - obsegs, obbytes, &rconfirm); - } - /* - * Pass control over to tcp_send(), but tell it to - * return to us once a large-size transmission is - * possible. - */ - TCP_STAT(tcps, tcp_mdt_legacy_small); - if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, snxt, - tail_unsent, xmit_tail, local_time, - mdt_thres)) <= 0) { - /* burst count reached, or alloc failed */ - IRE_REFRELE(ire); - return (err); - } - - /* tcp_send() may have sent everything, so check */ - if (*usable <= 0) { - IRE_REFRELE(ire); - return (0); - } - - TCP_STAT(tcps, tcp_mdt_legacy_ret); - /* - * We may have delivered the Multidata, so make sure - * to re-initialize before the next round. - */ - md_mp_head = NULL; - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - PREP_NEW_MULTIDATA(); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - } - /* - * Record current values for parameters we may need to pass - * to tcp_send() or tcp_multisend_data(). 
We checkpoint at - * each iteration of the outer loop (each multidata message - * creation). If we have a failure in the inner loop, we send - * any complete multidata messages we have before reverting - * to using the traditional non-md path. - */ - snxt_mmd = *snxt; - usable_mmd = *usable; - xmit_tail_mmd = *xmit_tail; - tail_unsent_mmd = *tail_unsent; - obsegs_mmd = obsegs; - obbytes_mmd = obbytes; - - /* - * max_pld limits the number of mblks in tcp's transmit - * queue that can be added to a Multidata message. Once - * this counter reaches zero, no more additional mblks - * can be added to it. What happens afterwards depends - * on whether or not we are set to chain the Multidata - * messages. If we are to link them together, reset - * max_pld to its original value (tcp_mdt_max_pld) and - * prepare to create a new Multidata message which will - * get linked to md_mp_head. Else, leave it alone and - * let the inner loop break on its own. - */ - if (tcp_mdt_chain && max_pld == 0) - PREP_NEW_MULTIDATA(); - - /* adding a payload buffer; re-initialize values */ - if (add_buffer) - PREP_NEW_PBUF(); - - /* - * If we don't have a Multidata, either because we just - * (re)entered this outer loop, or after we branched off - * to tcp_send above, setup the Multidata and header - * buffer to be used. - */ - if (md_mp == NULL) { - int md_hbuflen; - uint32_t start, stuff; - - /* - * Calculate Multidata header buffer size large enough - * to hold all of the headers that can possibly be - * sent at this moment. We'd rather over-estimate - * the size than running out of space; this is okay - * since this buffer is small anyway. - */ - md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; - - /* - * Start and stuff offset for partial hardware - * checksum offload; these are currently for IPv4. - * For full checksum offload, they are set to zero. 
- */ - if ((hwcksum_flags & HCK_PARTIALCKSUM)) { - if (af == AF_INET) { - start = IP_SIMPLE_HDR_LENGTH; - stuff = IP_SIMPLE_HDR_LENGTH + - TCP_CHECKSUM_OFFSET; - } else { - start = IPV6_HDR_LEN; - stuff = IPV6_HDR_LEN + - TCP_CHECKSUM_OFFSET; - } - } else { - start = stuff = 0; - } - - /* - * Create the header buffer, Multidata, as well as - * any necessary attributes (destination address, - * SAP and hardware checksum offload) that should - * be associated with the Multidata message. - */ - ASSERT(cur_hdr_off == 0); - if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || - ((md_hbuf->b_wptr += md_hbuflen), - (mmd = mmd_alloc(md_hbuf, &md_mp, - KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, - /* fastpath mblk */ - ire->ire_nce->nce_res_mp, - /* hardware checksum enabled */ - (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), - /* hardware checksum offsets */ - start, stuff, 0, - /* hardware checksum flag */ - hwcksum_flags, tcps) != 0)) { -legacy_send: - /* - * We arrive here from a failure within the - * inner (packetizer) loop or we fail one of - * the conditionals above. We restore the - * previously checkpointed values for: - * xmit_tail - * usable - * tail_unsent - * snxt - * obbytes - * obsegs - * We should then be able to dispatch any - * complete multidata before reverting to the - * traditional path with consistent parameters - * (the inner loop updates these as it - * iterates). - */ - *xmit_tail = xmit_tail_mmd; - *usable = usable_mmd; - *tail_unsent = tail_unsent_mmd; - *snxt = snxt_mmd; - obbytes = obbytes_mmd; - obsegs = obsegs_mmd; - if (md_mp != NULL) { - /* Unlink message from the chain */ - if (md_mp_head != NULL) { - err = (intptr_t)rmvb(md_mp_head, - md_mp); - /* - * We can't assert that rmvb - * did not return -1, since we - * may get here before linkb - * happens. We do, however, - * check if we just removed the - * only element in the list. 
- */ - if (err == 0) - md_mp_head = NULL; - } - /* md_hbuf gets freed automatically */ - TCP_STAT(tcps, tcp_mdt_discarded); - freeb(md_mp); - } else { - /* Either allocb or mmd_alloc failed */ - TCP_STAT(tcps, tcp_mdt_allocfail); - if (md_hbuf != NULL) - freeb(md_hbuf); - } - - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, - md_mp_head, obsegs, obbytes, - &rconfirm); - } -legacy_send_no_md: - if (ire != NULL) - IRE_REFRELE(ire); - /* - * Too bad; let the legacy path handle this. - * We specify INT_MAX for the threshold, since - * we gave up with the Multidata processings - * and let the old path have it all. - */ - TCP_STAT(tcps, tcp_mdt_legacy_all); - return (tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, - snxt, tail_unsent, xmit_tail, local_time, - INT_MAX)); - } - - /* link to any existing ones, if applicable */ - TCP_STAT(tcps, tcp_mdt_allocd); - if (md_mp_head == NULL) { - md_mp_head = md_mp; - } else if (tcp_mdt_chain) { - TCP_STAT(tcps, tcp_mdt_linked); - linkb(md_mp_head, md_mp); - } - } - - ASSERT(md_mp_head != NULL); - ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); - ASSERT(md_mp != NULL && mmd != NULL); - ASSERT(md_hbuf != NULL); - - /* - * Packetize the transmittable portion of the data block; - * each data block is essentially added to the Multidata - * as a payload buffer. We also deal with adding more - * than one payload buffers, which happens when the remaining - * packetized portion of the current payload buffer is less - * than MSS, while the next data block in transmit queue - * has enough data to make up for one. This "spillover" - * case essentially creates a split-packet, where portions - * of the packet's payload fragments may span across two - * virtually discontiguous address blocks. 
- */ - seg_len = mss; - do { - len = seg_len; - - /* one must remain NULL for DTRACE_IP_FASTPATH */ - ipha = NULL; - ip6h = NULL; - - ASSERT(len > 0); - ASSERT(max_pld >= 0); - ASSERT(!add_buffer || cur_pld_off == 0); - - /* - * First time around for this payload buffer; note - * in the case of a spillover, the following has - * been done prior to adding the split-packet - * descriptor to Multidata, and we don't want to - * repeat the process. - */ - if (add_buffer) { - ASSERT(mmd != NULL); - ASSERT(md_pbuf == NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); - - /* - * Have we reached the limit? We'd get to - * this case when we're not chaining the - * Multidata messages together, and since - * we're done, terminate this loop. - */ - if (max_pld == 0) - break; /* done */ - - if ((md_pbuf = dupb(*xmit_tail)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - md_pbuf->b_rptr += base_pld_off; - - /* - * Add a payload buffer to the Multidata; this - * operation must not fail, or otherwise our - * logic in this routine is broken. There - * is no memory allocation done by the - * routine, so any returned failure simply - * tells us that we've done something wrong. - * - * A failure tells us that either we're adding - * the same payload buffer more than once, or - * we're trying to add more buffers than - * allowed (max_pld calculation is wrong). - * None of the above cases should happen, and - * we panic because either there's horrible - * heap corruption, and/or programming mistake. 
- */ - pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); - if (pbuf_idx < 0) { - cmn_err(CE_PANIC, "tcp_multisend: " - "payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf, pbuf_idx); - } - - ASSERT(max_pld > 0); - --max_pld; - add_buffer = B_FALSE; - } - - ASSERT(md_mp_head != NULL); - ASSERT(md_pbuf != NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx != -1); - ASSERT(pbuf_idx_nxt == -1); - ASSERT(*usable > 0); - - /* - * We spillover to the next payload buffer only - * if all of the following is true: - * - * 1. There is not enough data on the current - * payload buffer to make up `len', - * 2. We are allowed to send `len', - * 3. The next payload buffer length is large - * enough to accomodate `spill'. - */ - if ((spill = len - *tail_unsent) > 0 && - *usable >= len && - MBLKL((*xmit_tail)->b_cont) >= spill && - max_pld > 0) { - md_pbuf_nxt = dupb((*xmit_tail)->b_cont); - if (md_pbuf_nxt == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf_nxt); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - /* - * See comments above on the first call to - * mmd_addpldbuf for explanation on the panic. - */ - pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); - if (pbuf_idx_nxt < 0) { - panic("tcp_multisend: " - "next payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf_nxt, pbuf_idx_nxt); - } - - ASSERT(max_pld > 0); - --max_pld; - } else if (spill > 0) { - /* - * If there's a spillover, but the following - * xmit_tail couldn't give us enough octets - * to reach "len", then stop the current - * Multidata creation and let the legacy - * tcp_send() path take over. 
We don't want - * to send the tiny segment as part of this - * Multidata for performance reasons; instead, - * we let the legacy path deal with grouping - * it with the subsequent small mblks. - */ - if (*usable >= len && - MBLKL((*xmit_tail)->b_cont) < spill) { - max_pld = 0; - break; /* done */ - } - - /* - * We can't spillover, and we are near - * the end of the current payload buffer, - * so send what's left. - */ - ASSERT(*tail_unsent > 0); - len = *tail_unsent; - } - - /* tail_unsent is negated if there is a spillover */ - *tail_unsent -= len; - *usable -= len; - ASSERT(*usable >= 0); - - if (*usable < mss) - seg_len = *usable; - /* - * Sender SWS avoidance; see comments in tcp_send(); - * everything else is the same, except that we only - * do this here if there is no more data to be sent - * following the current xmit_tail. We don't check - * for 1-byte urgent data because we shouldn't get - * here if TCP_URG_VALID is set. - */ - if (*usable > 0 && *usable < mss && - ((md_pbuf_nxt == NULL && - (*xmit_tail)->b_cont == NULL) || - (md_pbuf_nxt != NULL && - (*xmit_tail)->b_cont->b_cont == NULL)) && - seg_len < (tcp->tcp_max_swnd >> 1) && - (tcp->tcp_unsent - - ((*snxt + len) - tcp->tcp_snxt)) > seg_len && - !tcp->tcp_zero_win_probe) { - if ((*snxt + len) == tcp->tcp_snxt && - (*snxt + len) == tcp->tcp_suna) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - done = B_TRUE; - } - - /* - * Prime pump for IP's checksumming on our behalf; - * include the adjustment for a source route if any. - * Do this only for software/partial hardware checksum - * offload, as this field gets zeroed out later for - * the full hardware checksum offload case. 
- */ - if (!(hwcksum_flags & HCK_FULLCKSUM)) { - cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - cksum = (cksum >> 16) + (cksum & 0xFFFF); - U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); - } - - U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); - *snxt += len; - - tcp->tcp_tcph->th_flags[0] = TH_ACK; - /* - * We set the PUSH bit only if TCP has no more buffered - * data to be transmitted (or if sender SWS avoidance - * takes place), as opposed to setting it for every - * last packet in the burst. - */ - if (done || - (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) - tcp->tcp_tcph->th_flags[0] |= TH_PUSH; - - /* - * Set FIN bit if this is our last segment; snxt - * already includes its length, and it will not - * be adjusted after this point. - */ - if (tcp->tcp_valid_bits == TCP_FSS_VALID && - *snxt == tcp->tcp_fss) { - if (!tcp->tcp_fin_acked) { - tcp->tcp_tcph->th_flags[0] |= TH_FIN; - BUMP_MIB(&tcps->tcps_mib, - tcpOutControl); - } - if (!tcp->tcp_fin_sent) { - tcp->tcp_fin_sent = B_TRUE; - /* - * tcp state must be ESTABLISHED - * in order for us to get here in - * the first place. - */ - tcp->tcp_state = TCPS_FIN_WAIT_1; - - /* - * Upon returning from this routine, - * tcp_wput_data() will set tcp_snxt - * to be equal to snxt + tcp_fin_sent. - * This is essentially the same as - * setting it to tcp_fss + 1. 
- */ - } - } - - tcp->tcp_last_sent_len = (ushort_t)len; - - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_length = htons(len); - else - tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); - - pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); - - /* setup header fragment */ - PDESC_HDR_ADD(pkt_info, - md_hbuf->b_rptr + cur_hdr_off, /* base */ - tcp->tcp_mdt_hdr_head, /* head room */ - tcp_hdr_len, /* len */ - tcp->tcp_mdt_hdr_tail); /* tail room */ - - ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == - hdr_frag_sz); - ASSERT(MBLKIN(md_hbuf, - (pkt_info->hdr_base - md_hbuf->b_rptr), - PDESC_HDRSIZE(pkt_info))); - - /* setup first payload fragment */ - PDESC_PLD_INIT(pkt_info); - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr + cur_pld_off, /* start */ - tcp->tcp_last_sent_len); /* len */ - - /* create a split-packet in case of a spillover */ - if (md_pbuf_nxt != NULL) { - ASSERT(spill > 0); - ASSERT(pbuf_idx_nxt > pbuf_idx); - ASSERT(!add_buffer); - - md_pbuf = md_pbuf_nxt; - md_pbuf_nxt = NULL; - pbuf_idx = pbuf_idx_nxt; - pbuf_idx_nxt = -1; - cur_pld_off = spill; - - /* trim out first payload fragment */ - PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); - - /* setup second payload fragment */ - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr, /* start */ - spill); /* len */ - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT - * estimation. We can only record one - * timestamp per mblk so we do it when - * we reach the end of the payload - * buffer. Also we only take a new - * timestamp sample when the previous - * timed data from the same mblk has - * been ack'ed. 
- */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)first_snxt; - } - - first_snxt = *snxt - spill; - - /* - * Advance xmit_tail; usable could be 0 by - * the time we got here, but we made sure - * above that we would only spillover to - * the next data block if usable includes - * the spilled-over amount prior to the - * subtraction. Therefore, we are sure - * that xmit_tail->b_cont can't be NULL. - */ - ASSERT((*xmit_tail)->b_cont != NULL); - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail) - spill; - } else { - cur_pld_off += tcp->tcp_last_sent_len; - } - - /* - * Fill in the header using the template header, and - * add options such as time-stamp, ECN and/or SACK, - * as needed. - */ - tcp_fill_header(tcp, pkt_info->hdr_rptr, - (clock_t)local_time, num_sack_blk); - - /* take care of some IP header businesses */ - if (af == AF_INET) { - ipha = (ipha_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ipha)); - ASSERT(PDESC_HDRL(pkt_info) >= - IP_SIMPLE_HDR_LENGTH); - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - /* - * Assign ident value for current packet; see - * related comments in ip_wput_ire() about the - * contract private interface with clustering - * group. 
- */ - clusterwide = B_FALSE; - if (cl_inet_ipident != NULL) { - ASSERT(cl_inet_isclusterwide != NULL); - if ((*cl_inet_isclusterwide)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, NULL)) { - ipha->ipha_ident = - (*cl_inet_ipident)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, - (uint8_t *)(uintptr_t)dst, - NULL); - clusterwide = B_TRUE; - } - } - - if (!clusterwide) { - ipha->ipha_ident = (uint16_t) - atomic_add_32_nv( - &ire->ire_ident, 1); - } -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | - (ipha->ipha_ident >> 8); -#endif - } else { - ip6h = (ip6_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ip6h)); - ASSERT(IPVER(ip6h) == IPV6_VERSION); - ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); - ASSERT(PDESC_HDRL(pkt_info) >= - (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + - TCP_CHECKSUM_SIZE)); - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - - if (tcp->tcp_ip_forward_progress) { - rconfirm = B_TRUE; - tcp->tcp_ip_forward_progress = B_FALSE; - } - } - - /* at least one payload span, and at most two */ - ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); - - /* add the packet descriptor to Multidata */ - if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates - * that we have passed in invalid pkt_info - * or parameters to mmd_addpdesc, which must - * not happen. - * - * EINVAL is a result of failure on boundary - * checks against the pkt_info contents. It - * should not happen, and we panic because - * either there's horrible heap corruption, - * and/or programming mistake. 
- */ - if (err != ENOMEM) { - cmn_err(CE_PANIC, "tcp_multisend: " - "pdesc logic error detected for " - "tcp %p mmd %p pinfo %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)pkt_info, err); - } - TCP_STAT(tcps, tcp_mdt_addpdescfail); - goto legacy_send; /* out_of_mem */ - } - ASSERT(pkt != NULL); - - /* calculate IP header and TCP checksums */ - if (af == AF_INET) { - /* calculate pseudo-header checksum */ - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - - /* offset for TCP header checksum */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - } else { - up = (uint16_t *)&ip6h->ip6_src; - - /* calculate pseudo-header checksum */ - cksum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - cksum = (cksum & 0xffff) + (cksum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); - } - - if (hwcksum_flags & HCK_FULLCKSUM) { - /* clear checksum field for hardware */ - *up = 0; - } else if (hwcksum_flags & HCK_PARTIALCKSUM) { - uint32_t sum; - - /* pseudo-header checksumming */ - sum = *up + cksum + IP_TCP_CSUM_COMP; - sum = (sum & 0xFFFF) + (sum >> 16); - *up = (sum & 0xFFFF) + (sum >> 16); - } else { - /* software checksumming */ - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - tcp->tcp_hdr_len + tcp->tcp_last_sent_len); - *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, - cksum + IP_TCP_CSUM_COMP); - if (*up == 0) - *up = 0xFFFF; - } - - /* IPv4 header checksum */ - if (af == AF_INET) { - if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { - ipha->ipha_hdr_checksum = 0; - } else { - IP_HDR_CKSUM(ipha, cksum, - ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - } - - if (af == AF_INET && - HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) || - af == AF_INET6 && - HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) { - mblk_t *mp, *mp1; - uchar_t *hdr_rptr, *hdr_wptr; - uchar_t 
*pld_rptr, *pld_wptr; - - /* - * We reconstruct a pseudo packet for the hooks - * framework using mmd_transform_link(). - * If it is a split packet we pullup the - * payload. FW_HOOKS expects a pkt comprising - * of two mblks: a header and the payload. - */ - if ((mp = mmd_transform_link(pkt)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; - } - - if (pkt_info->pld_cnt > 1) { - /* split payload, more than one pld */ - if ((mp1 = msgpullup(mp->b_cont, -1)) == - NULL) { - freemsg(mp); - TCP_STAT(tcps, - tcp_mdt_allocfail); - goto legacy_send; - } - freemsg(mp->b_cont); - mp->b_cont = mp1; - } else { - mp1 = mp->b_cont; - } - ASSERT(mp1 != NULL && mp1->b_cont == NULL); - - /* - * Remember the message offsets. This is so we - * can detect changes when we return from the - * FW_HOOKS callbacks. - */ - hdr_rptr = mp->b_rptr; - hdr_wptr = mp->b_wptr; - pld_rptr = mp->b_cont->b_rptr; - pld_wptr = mp->b_cont->b_wptr; - - if (af == AF_INET) { - DTRACE_PROBE4( - ip4__physical__out__start, - ill_t *, NULL, - ill_t *, ill, - ipha_t *, ipha, - mblk_t *, mp); - FW_HOOKS( - ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip4__physical__out__end, - mblk_t *, mp); - } else { - DTRACE_PROBE4( - ip6__physical__out_start, - ill_t *, NULL, - ill_t *, ill, - ip6_t *, ip6h, - mblk_t *, mp); - FW_HOOKS6( - ipst->ips_ip6_physical_out_event, - ipst->ips_ipv6firewall_physical_out, - NULL, ill, ip6h, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip6__physical__out__end, - mblk_t *, mp); - } - - if (mp == NULL || - (mp1 = mp->b_cont) == NULL || - mp->b_rptr != hdr_rptr || - mp->b_wptr != hdr_wptr || - mp1->b_rptr != pld_rptr || - mp1->b_wptr != pld_wptr || - mp1->b_cont != NULL) { - /* - * We abandon multidata processing and - * return to the normal path, either - * when a packet is blocked, or when - * the boundaries of header buffer or - * payload buffer have been changed by - * FW_HOOKS[6]. 
- */ - if (mp != NULL) - freemsg(mp); - goto legacy_send; - } - /* Finished with the pseudo packet */ - freemsg(mp); - } - DTRACE_IP_FASTPATH(md_hbuf, pkt_info->hdr_rptr, - ill, ipha, ip6h); - /* advance header offset */ - cur_hdr_off += hdr_frag_sz; - - obbytes += tcp->tcp_last_sent_len; - ++obsegs; - } while (!done && *usable > 0 && --num_burst_seg > 0 && - *tail_unsent > 0); - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT estimation. We can only - * record one timestamp per mblk so we do it when we - * reach the end of the payload buffer. Also we only - * take a new timestamp sample when the previous timed - * data from the same mblk has been ack'ed. - */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; - } - - ASSERT(*tail_unsent >= 0); - if (*tail_unsent > 0) { - /* - * We got here because we broke out of the above - * loop due to of one of the following cases: - * - * 1. len < adjusted MSS (i.e. small), - * 2. Sender SWS avoidance, - * 3. max_pld is zero. - * - * We are done for this Multidata, so trim our - * last payload buffer (if any) accordingly. - */ - if (md_pbuf != NULL) - md_pbuf->b_wptr -= *tail_unsent; - } else if (*usable > 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - add_buffer = B_TRUE; - } - } while (!done && *usable > 0 && num_burst_seg > 0 && - (tcp_mdt_chain || max_pld > 0)); - - if (md_mp_head != NULL) { - /* send everything down */ - tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, - &rconfirm); - } - -#undef PREP_NEW_MULTIDATA -#undef PREP_NEW_PBUF -#undef IPVER - - IRE_REFRELE(ire); - return (0); -} - -/* - * A wrapper function for sending one or more Multidata messages down to - * the module below ip; this routine does not release the reference of the - * IRE (caller does that). This routine is analogous to tcp_send_data(). 
- */ -static void -tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, - const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) +tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, + const int tcp_hdr_len, const int num_sack_blk, int *usable, + uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) { - uint64_t delta; - nce_t *nce; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL && ill != NULL); - ASSERT(ire->ire_stq != NULL); - ASSERT(md_mp_head != NULL); - ASSERT(rconfirm != NULL); - - /* adjust MIBs and IRE timestamp */ - DTRACE_PROBE2(tcp__trace__send, mblk_t *, md_mp_head, tcp_t *, tcp); - tcp->tcp_obsegs += obsegs; - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes); - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs); - - if (tcp->tcp_ipversion == IPV4_VERSION) { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs); - } else { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs); - } - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, obbytes); - - ire->ire_ob_pkt_count += obsegs; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); - ire->ire_last_used_time = lbolt; - - if ((tcp->tcp_ipversion == IPV4_VERSION && - ipst->ips_ip4_observe.he_interested) || - (tcp->tcp_ipversion == IPV6_VERSION && - ipst->ips_ip6_observe.he_interested)) { - multidata_t *dlmdp = mmd_getmultidata(md_mp_head); - pdesc_t *dl_pkt; - pdescinfo_t pinfo; - mblk_t *nmp; - zoneid_t szone = tcp->tcp_connp->conn_zoneid; - - for (dl_pkt = mmd_getfirstpdesc(dlmdp, &pinfo); - (dl_pkt != NULL); - dl_pkt = mmd_getnextpdesc(dl_pkt, &pinfo)) { - if ((nmp = mmd_transform_link(dl_pkt)) == NULL) - continue; - ipobs_hook(nmp, IPOBS_HOOK_OUTBOUND, szone, - 
ALL_ZONES, ill, ipst); - freemsg(nmp); - } - } - - /* send it down */ - putnext(ire->ire_stq, md_mp_head); - - /* we're done for TCP/IPv4 */ - if (tcp->tcp_ipversion == IPV4_VERSION) - return; - - nce = ire->ire_nce; - - ASSERT(nce != NULL); - ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); - ASSERT(nce->nce_state != ND_INCOMPLETE); - - /* reachability confirmation? */ - if (*rconfirm) { - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state != ND_REACHABLE) { - mutex_enter(&nce->nce_lock); - nce->nce_state = ND_REACHABLE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - (void) untimeout(nce->nce_timeout_id); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to REACHABLE\n", - AF_INET6, &ire->ire_addr_v6); - } - } - /* reset transport reachability confirmation */ - *rconfirm = B_FALSE; - } - - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - ip1dbg(("tcp_multisend_data: delta = %" PRId64 - " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); - - if (delta > (uint64_t)ill->ill_reachable_time) { - mutex_enter(&nce->nce_lock); - switch (nce->nce_state) { - case ND_REACHABLE: - case ND_STALE: - /* - * ND_REACHABLE is identical to ND_STALE in this - * specific case. If reachable time has expired for - * this neighbor (delta is greater than reachable - * time), conceptually, the neighbor cache is no - * longer in REACHABLE state, but already in STALE - * state. So the correct transition here is to - * ND_DELAY. 
- */ - nce->nce_state = ND_DELAY; - mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, - ipst->ips_delay_first_probe_time); - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to DELAY\n", - AF_INET6, &ire->ire_addr_v6); - } - break; - case ND_DELAY: - case ND_PROBE: - mutex_exit(&nce->nce_lock); - /* Timers have already started */ - break; - case ND_UNREACHABLE: - /* - * ndp timer has detected that this nce is - * unreachable and initiated deleting this nce - * and all its associated IREs. This is a race - * where we found the ire before it was deleted - * and have just sent out a packet using this - * unreachable nce. - */ - mutex_exit(&nce->nce_lock); - break; - default: - ASSERT(0); - } - } -} - -/* - * Derived from tcp_send_data(). - */ -static void -tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, - int num_lso_seg) -{ - ipha_t *ipha; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint32_t hcksum_txflags = 0; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - uint16_t *up; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - ASSERT(tcp->tcp_connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)); - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, - num_lso_seg); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - } - - if (ILL_HCKSUM_CAPABLE(ill) 
&& dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* - * Since the TCP checksum should be recalculated by h/w, we can just - * zero the checksum field for HCK_FULLCKSUM, or calculate partial - * pseudo-header checksum for HCK_PARTIALCKSUM. - * The partial pseudo-header excludes TCP length, that was calculated - * in tcp_send(), so to zero *up before further processing. - */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - *up = 0; - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* - * Append LSO flags and mss to the mp. - */ - lso_info_set(mp, mss, HW_LSO); - - ipha->ipha_fragment_offset_and_flags |= - (uint32_t)htons(ire->ire_frag_flag); - - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, - ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - if (ire_fp_mp_len != 0) - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - if 
(ire_fp_mp_len != 0) - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL); - } -} - -/* - * tcp_send() is called by tcp_wput_data() for non-Multidata transmission - * scheme, and returns one of the following: - * - * -1 = failed allocation. - * 0 = success; burst count reached, or usable send window is too small, - * and that we'd rather wait until later before sending again. - * 1 = success; we are called from tcp_multisend(), and both usable send - * window and tail_unsent are greater than the MDT threshold, and thus - * Multidata Transmit should be used instead. - */ -static int -tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - int num_burst_seg = tcp->tcp_snd_burst; - ire_t *ire = NULL; - ill_t *ill = NULL; - mblk_t *ire_fp_mp = NULL; - uint_t ire_fp_mp_len = 0; + int num_burst_seg = tcp->tcp_snd_burst; int num_lso_seg = 1; uint_t lso_usable; boolean_t do_lso_send = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* - * Check LSO capability before any further work. And the similar check - * need to be done in for(;;) loop. - * LSO will be deployed when therer is more than one mss of available - * data and a burst transmission is allowed. + * Check LSO possibility. The value of tcp->tcp_lso indicates whether + * the underlying connection is LSO capable. Will check whether having + * enough available data to initiate LSO transmission in the for(){} + * loops. */ - if (tcp->tcp_lso && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID) && - num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { - /* - * Try to find usable IRE/ILL and do basic check to the ILL. 
- * Double check LSO usability before going further, since the - * underlying interface could have been changed. In case of any - * change of LSO capability, set tcp_ire_ill_check_done to - * B_FALSE to force to check the ILL with the next send. - */ - if (tcp_send_find_ire_ill(tcp, NULL, &ire, &ill) && - tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Enable LSO with this transmission. - * Since IRE has been hold in tcp_send_find_ire_ill(), - * IRE_REFRELE(ire) should be called before return. - */ + if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) do_lso_send = B_TRUE; - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - /* Round up to multiple of 4 */ - ire_fp_mp_len = ((ire_fp_mp_len + 3) / 4) * 4; - } else { - tcp->tcp_lso = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - do_lso_send = B_FALSE; - ill = NULL; - } - } for (;;) { struct datab *db; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t sum; mblk_t *mp, *mp1; uchar_t *rptr; int len; /* - * If we're called by tcp_multisend(), and the amount of - * sendable data as well as the size of current xmit_tail - * is beyond the MDT threshold, return to the caller and - * let the large data transmit be done using MDT. + * Burst count reached, return successfully. */ - if (*usable > 0 && *usable > mdt_thres && - (*tail_unsent > mdt_thres || (*tail_unsent == 0 && - MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { - ASSERT(tcp->tcp_mdt); - return (1); /* success; do large send */ - } - if (num_burst_seg == 0) - break; /* success; burst count reached */ + break; /* - * Calculate the maximum payload length we can send in *one* + * Calculate the maximum payload length we can send at one * time. */ if (do_lso_send) { /* - * Check whether need to do LSO any more. + * Check whether be able to to do LSO for the current + * available data. 
*/ if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { lso_usable = MIN(tcp->tcp_lso_max, *usable); @@ -20787,7 +15918,10 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); - +#ifdef DEBUG + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, + do_lso_send); +#endif /* * Adjust num_burst_seg here. */ @@ -20817,7 +15951,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * If the retransmit timer is not running * we start it so that we will retransmit - * in the case when the the receiver has + * in the case when the receiver has * decremented the window. */ if (*snxt == tcp->tcp_snxt && @@ -20838,7 +15972,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } } - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; /* * The reason to adjust len here is that we need to set flags @@ -20849,19 +15983,25 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *usable -= len; /* Approximate - can be adjusted later */ if (*usable > 0) - tcph->th_flags[0] = TH_ACK; + tcpha->tha_flags = TH_ACK; else - tcph->th_flags[0] = (TH_ACK | TH_PUSH); + tcpha->tha_flags = (TH_ACK | TH_PUSH); /* - * Prime pump for IP's checksumming on our behalf + * Prime pump for IP's checksumming on our behalf. * Include the adjustment for a source route if any. + * In case of LSO, the partial pseudo-header checksum should + * exclusive TCP length, so zero tha_sum before IP calculate + * pseudo-header checksum for partial checksum offload. 
*/ - sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - - U32_TO_ABE32(*snxt, tcph->th_seq); + if (do_lso_send) { + sum = 0; + } else { + sum = len + tcp_hdr_len + connp->conn_sum; + sum = (sum >> 16) + (sum & 0xFFFF); + } + tcpha->tha_sum = htons(sum); + tcpha->tha_seq = htonl(*snxt); /* * Branch off to tcp_xmit_mp() if any of the VALID bits is @@ -20907,8 +16047,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, (*xmit_tail)->b_rptr = prev_rptr; if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); } mp1 = mp->b_cont; @@ -20927,7 +16065,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); continue; } @@ -20942,18 +16080,18 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *tail_unsent -= len; if (len <= mss) /* LSO is unusable */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else + } else { tcp->tcp_ip6h->ip6_plen = - htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); + htons(len - IPV6_HDR_LEN); + } + mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp->b_rptr = rptr; @@ -20983,21 +16121,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (len <= mss) /* LSO is unusable (!do_lso_send) */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else - 
tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + } else { + tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); + } mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } - len = tcp_hdr_len; + len = total_hdr_len; /* * There are four reasons to allocate a new hdr mblk: * 1) The bytes above us are in use by another packet @@ -21008,24 +16146,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, rptr = mp->b_rptr - len; if (!OK_32PTR(rptr) || ((db = mp->b_datap), db->db_ref != 2) || - rptr < db->db_base + ire_fp_mp_len) { + rptr < db->db_base) { /* NOTE: we assume allocb returns an OK_32PTR */ must_alloc:; - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp1->b_cont = mp; mp = mp1; /* Leave room for Link Level header */ - len = tcp_hdr_len; - rptr = - &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len]; + len = total_hdr_len; + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_wptr = &rptr[len]; } @@ -21057,18 +16192,17 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Excess data in mblk; can we split it? - * If MDT is enabled for the connection, + * If LSO is enabled for the connection, * keep on splitting as this is a transient * send path. */ - if (!do_lso_send && !tcp->tcp_mdt && - (spill + nmpsz > 0)) { + if (!do_lso_send && (spill + nmpsz > 0)) { /* * Don't split if stream head was * told to break up larger writes * into smaller ones. 
*/ - if (tcp->tcp_maxpsz > 0) + if (tcp->tcp_maxpsz_multiplier > 0) break; /* @@ -21096,8 +16230,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (mp1 == NULL) { *tail_unsent = spill; freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } } @@ -21119,11 +16251,12 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Adjust the checksum */ - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + + ixa->ixa_ip_hdr_length); sum += spill; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - if (tcp->tcp_ipversion == IPV4_VERSION) { + tcpha->tha_sum = htons(sum); + if (connp->conn_ipversion == IPV4_VERSION) { sum = ntohs( ((ipha_t *)rptr)->ipha_length) + spill; @@ -21136,311 +16269,55 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, ((ip6_t *)rptr)->ip6_plen = htons(sum); } + ixa->ixa_pktlen += spill; *tail_unsent = 0; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + ixa->ixa_flags &= ~IXAF_REACH_CONF; } + /* + * Append LSO information, both flags and mss, to the mp. + */ if (do_lso_send) { - tcp_lsosend_data(tcp, mp, ire, ill, mss, - num_lso_seg); - tcp->tcp_obsegs += num_lso_seg; + lso_info_set(mp, mss, HW_LSO); + ixa->ixa_fragsize = IP_MAXPACKET; + ixa->ixa_extra_ident = num_lso_seg - 1; + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, + boolean_t, B_TRUE); + + tcp_send_data(tcp, mp); + + /* + * Restore values of ixa_fragsize and ixa_extra_ident. 
+ */ + ixa->ixa_fragsize = ixa->ixa_pmtu; + ixa->ixa_extra_ident = 0; + tcp->tcp_obsegs += num_lso_seg; TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); BUMP_LOCAL(tcp->tcp_obsegs); } } - if (ire != NULL) - IRE_REFRELE(ire); return (0); } -/* Unlink and return any mblk that looks like it contains a MDT info */ -static mblk_t * -tcp_mdt_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? */ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - return (mp); -} - -/* MDT info update routine, called when IP notifies us about MDT */ -static void -tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) -{ - boolean_t prev_state; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort MDT on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new driver doesn't support MDT, or in situation - * where MDT usage on the link-layer has been switched off. - * IP would not have sent us the initial MDT_IOC_INFO_UPDATE - * if the link-layer doesn't support MDT, and if it does, it - * will indicate that the feature is to be turned on. - */ - prev_state = tcp->tcp_mdt; - tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); - if (!tcp->tcp_mdt && !first) { - TCP_STAT(tcps, tcp_mdt_conn_halted3); - ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - - /* - * We currently only support MDT on simple TCP/{IPv4,IPv6}, - * so disable MDT otherwise. The checks are done here - * and in tcp_wput_data(). 
- */ - if (tcp->tcp_mdt && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) - tcp->tcp_mdt = B_FALSE; - - if (tcp->tcp_mdt) { - if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { - cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " - "version (%d), expected version is %d", - mdt_capab->ill_mdt_version, MDT_VERSION_2); - tcp->tcp_mdt = B_FALSE; - return; - } - - /* - * We need the driver to be able to handle at least three - * spans per packet in order for tcp MDT to be utilized. - * The first is for the header portion, while the rest are - * needed to handle a packet that straddles across two - * virtually non-contiguous buffers; a typical tcp packet - * therefore consists of only two spans. Note that we take - * a zero as "don't care". - */ - if (mdt_capab->ill_mdt_span_limit > 0 && - mdt_capab->ill_mdt_span_limit < 3) { - tcp->tcp_mdt = B_FALSE; - return; - } - - /* a zero means driver wants default value */ - tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, - tcps->tcps_mdt_max_pbufs); - if (tcp->tcp_mdt_max_pld == 0) - tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs; - - /* ensure 32-bit alignment */ - tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min, - mdt_capab->ill_mdt_hdr_head), 4); - tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min, - mdt_capab->ill_mdt_hdr_tail), 4); - - if (!first && !prev_state) { - TCP_STAT(tcps, tcp_mdt_conn_resumed2); - ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - } -} - -/* Unlink and return any mblk that looks like it contains a LSO info */ -static mblk_t * -tcp_lso_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? 
*/ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != LSO_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - - return (mp); -} - -/* LSO info update routine, called when IP notifies us about LSO */ -static void -tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort LSO on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new NIC/driver doesn't support LSO, or in situation - * where LSO usage on the link-layer has been switched off. - * IP would not have sent us the initial LSO_IOC_INFO_UPDATE - * if the link-layer doesn't support LSO, and if it does, it - * will indicate that the feature is to be turned on. - */ - tcp->tcp_lso = (lso_capab->ill_lso_on != 0); - TCP_STAT(tcps, tcp_lso_enabled); - - /* - * We currently only support LSO on simple TCP/IPv4, - * so disable LSO otherwise. The checks are done here - * and in tcp_wput_data(). - */ - if (tcp->tcp_lso && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION)) { - tcp->tcp_lso = B_FALSE; - TCP_STAT(tcps, tcp_lso_disabled); - } else { - tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, - lso_capab->ill_lso_max); - } -} - -static void -tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) -{ - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL); - - /* - * We may be in the fastpath here, and although we essentially do - * similar checks as in ip_bind_connected{_v6}/ip_xxinfo_return, - * we try to keep things as brief as possible. 
After all, these - * are only best-effort checks, and we do more thorough ones prior - * to calling tcp_send()/tcp_multisend(). - */ - if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) && - check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && - ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && - !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && - CONN_IS_LSO_MD_FASTPATH(connp)) { - if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_lso_ok = B_TRUE; - - ASSERT(ill->ill_lso_capab != NULL); - if (!ill->ill_lso_capab->ill_lso_on) { - ill->ill_lso_capab->ill_lso_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "LSO for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_lso_update(tcp, ill->ill_lso_capab); - } else if (ipst->ips_ip_multidata_outbound && - ILL_MDT_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_mdt_ok = B_TRUE; - - ASSERT(ill->ill_mdt_capab != NULL); - if (!ill->ill_mdt_capab->ill_mdt_on) { - ill->ill_mdt_capab->ill_mdt_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "MDT for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); - } - } - - /* - * The goal is to reduce the number of generated tcp segments by - * setting the maxpsz multiplier to 0; this will have an affect on - * tcp_maxpsz_set(). With this behavior, tcp will pack more data - * into each packet, up to SMSS bytes. Doing this reduces the number - * of outbound segments and incoming ACKs, thus allowing for better - * network and system performance. In contrast the legacy behavior - * may result in sending less than SMSS size, because the last mblk - * for some packets may have more data than needed to make up SMSS, - * and the legacy code refused to "split" it. 
- * - * We apply the new behavior on following situations: - * - * 1) Loopback connections, - * 2) Connections in which the remote peer is not on local subnet, - * 3) Local subnet connections over the bge interface (see below). - * - * Ideally, we would like this behavior to apply for interfaces other - * than bge. However, doing so would negatively impact drivers which - * perform dynamic mapping and unmapping of DMA resources, which are - * increased by setting the maxpsz multiplier to 0 (more mblks per - * packet will be generated by tcp). The bge driver does not suffer - * from this, as it copies the mblks into pre-mapped buffers, and - * therefore does not require more I/O resources than before. - * - * Otherwise, this behavior is present on all network interfaces when - * the destination endpoint is non-local, since reducing the number - * of packets in general is good for the network. - * - * TODO We need to remove this hard-coded conditional for bge once - * a better "self-tuning" mechanism, or a way to comprehend - * the driver transmit strategy is devised. Until the solution - * is found and well understood, we live with this hack. - */ - if (!tcp_static_maxpsz && - (tcp->tcp_loopback || !tcp->tcp_localnet || - (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { - /* override the default value */ - tcp->tcp_maxpsz = 0; - - ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " - "interface %s\n", (void *)connp, tcp->tcp_maxpsz, - ill != NULL ? ill->ill_name : ipif_loopback_name)); - } - - /* set the stream head parameters accordingly */ - (void) tcp_maxpsz_set(tcp, B_TRUE); -} - /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp) { uchar_t fval = *mp->b_rptr; mblk_t *tail; - queue_t *q = tcp->tcp_wq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* TODO: How should flush interact with urgent data? 
*/ if ((fval & FLUSHW) && tcp->tcp_xmit_head && @@ -21473,7 +16350,7 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) } /* * We have no unsent data, so unsent must be less than - * tcp_xmit_lowater, so re-enable flow. + * conn_sndlowat, so re-enable flow. */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { @@ -21501,12 +16378,12 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) { - mblk_t *mp1; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + mblk_t *mp1; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - queue_t *q = tcp->tcp_wq; - int error; - uint_t addrlen; + uint_t addrlen; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* Make sure it is one of ours. */ switch (iocp->ioc_cmd) { @@ -21514,7 +16391,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) case TI_GETPEERNAME: break; default: - CALL_IP_WPUT(tcp->tcp_connp, q, mp); + ip_wput_nondata(q, mp); return; } switch (mi_copy_state(q, mp, &mp1)) { @@ -21541,43 +16418,56 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) } STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + if (STRUCT_FGET(sb, maxlen) < addrlen) { mi_copy_done(q, mp, EINVAL); return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - if (mp1 == NULL) - return; - switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = tcp_do_getsockname(tcp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = tcp_do_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); + if (tcp->tcp_state < TCPS_SYN_RCVD) { + mi_copy_done(q, mp, ENOTCONN); + return; + } break; } + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; } + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } static void tcp_use_pure_tpi(tcp_t *tcp) { + conn_t *connp = tcp->tcp_connp; + #ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq; + tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; #else - tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; + tcp->tcp_acceptor_id = connp->conn_dev; #endif /* * Insert this socket into the acceptor hash. 
@@ -21595,11 +16485,11 @@ tcp_use_pure_tpi(tcp_t *tcp) */ /* ARGSUSED */ static void -tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) +tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_wq; struct iocblk *iocp; ASSERT(DB_TYPE(mp) == M_IOCTL); @@ -21617,17 +16507,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { - case TCP_IOC_DEFAULT_Q: - /* Wants to be the default wq. */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); - return; - } - tcp_def_q_set(tcp, mp); - return; case _SIOCSOCKFALLBACK: /* * Either sockmod is about to be popped and the socket @@ -21650,7 +16529,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) qreply(q, mp); return; } - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(q, mp); } /* @@ -21658,14 +16537,14 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ static void -tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) +tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; union T_primitives *tprim = (union T_primitives *)mp->b_rptr; - uchar_t *rptr; - t_scalar_t type; - cred_t *cr; + uchar_t *rptr; + t_scalar_t type; + cred_t *cr; /* * Try and ASSERT the minimum possible references on the @@ -21684,7 +16563,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; if (type == T_EXDATA_REQ) { - tcp_output_urgent(connp, mp, arg2); + tcp_output_urgent(connp, mp, arg2, NULL); } else if (type != T_DATA_REQ) { 
goto non_urgent_data; } else { @@ -21695,7 +16574,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) } return; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); } @@ -21776,17 +16655,10 @@ non_urgent_data: * for subsequent processing by ip_restart_optmgmt(), which * will do the CONN_DEC_REF(). */ - CONN_INC_REF(connp); if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { - if (svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } else { - if (tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } break; @@ -21804,7 +16676,7 @@ non_urgent_data: * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, T_ORDREL_REQ out of " @@ -21818,7 +16690,7 @@ non_urgent_data: tcp_addr_req(tcp, mp); break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, bogus TPI msg, type %d", tprim->type); @@ -21844,19 +16716,6 @@ tcp_wsrv(queue_t *q) TCP_STAT(tcps, tcp_wsrv_called); } -/* Non overlapping byte exchanger */ -static void -tcp_xchg(uchar_t *a, uchar_t *b, int len) -{ - uchar_t uch; - - while (len-- > 0) { - uch = a[len]; - a[len] = b[len]; - b[len] = uch; - } -} - /* * Send out a control packet on the tcp connection specified. This routine * is typically called where we need a simple ACK or RST generated. 
@@ -21865,50 +16724,51 @@ static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) { uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; ipha_t *ipha = NULL; ip6_t *ip6h = NULL; uint32_t sum; - int tcp_hdr_len; - int tcp_ip_hdr_len; + int total_hdr_len; + int ip_hdr_len; mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Save sum for use in source route later. */ - ASSERT(tcp != NULL); - sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; + sum = connp->conn_ht_ulp_len + connp->conn_sum; + total_hdr_len = connp->conn_ht_iphc_len; + ip_hdr_len = ixa->ixa_ip_hdr_length; /* If a text string is passed in with the request, pass it to strlog. */ - if (str != NULL && tcp->tcp_debug) { + if (str != NULL && connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", str, seq, ack, ctl); } - mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra, + mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, BPRI_MED); if (mp == NULL) { return; } rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_rptr = rptr; - mp->b_wptr = &rptr[tcp_hdr_len]; - bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); + mp->b_wptr = &rptr[total_hdr_len]; + bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); + + ixa->ixa_pktlen = total_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha = (ipha_t *)rptr; - ipha->ipha_length = htons(tcp_hdr_len); + ipha->ipha_length = htons(total_hdr_len); } else { ip6h = (ip6_t *)rptr; - ASSERT(tcp != NULL); - ip6h->ip6_plen = htons(tcp->tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } - tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; - tcph->th_flags[0] = (uint8_t)ctl; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + tcpha->tha_flags = 
(uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); @@ -21917,43 +16777,45 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) */ if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { - mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; + mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; *(mp->b_wptr) = TCPOPT_EOL; - if (tcp->tcp_ipversion == IPV4_VERSION) { - ipha->ipha_length = htons(tcp_hdr_len - + + ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; + + if (connp->conn_ipversion == IPV4_VERSION) { + ipha->ipha_length = htons(total_hdr_len - TCPOPT_REAL_TS_LEN); } else { - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - - TCPOPT_REAL_TS_LEN); + ip6h->ip6_plen = htons(total_hdr_len - + IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); } - tcph->th_offset_and_rsrvd[0] -= (3 << 4); + tcpha->tha_offset_and_reserved -= (3 << 4); sum -= TCPOPT_REAL_TS_LEN; } } if (ctl & TH_ACK) { if (tcp->tcp_snd_ts_ok) { U32_TO_BE32(lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } BUMP_LOCAL(tcp->tcp_obsegs); - U32_TO_BE32(seq, tcph->th_seq); - U32_TO_BE32(ack, tcph->th_ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_ack = htonl(ack); /* * Include the adjustment for a source route if any. 
*/ sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_BE16(sum, tcph->th_sum); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcpha->tha_sum = htons(sum); + tcp_send_data(tcp, mp); } /* @@ -21991,115 +16853,32 @@ tcp_send_rst_chk(tcp_stack_t *tcps) } /* - * Send down the advice IP ioctl to tell IP to mark an IRE temporary. - */ -static void -tcp_ip_ire_mark_advice(tcp_t *tcp) -{ - mblk_t *mp; - ipic_t *ipic; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); - } else { - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - if (mp == NULL) - return; - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* - * Return an IP advice ioctl mblk and set ipic to be the pointer - * to the advice structure. - */ -static mblk_t * -tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) -{ - struct iocblk *ioc; - mblk_t *mp, *mp1; - - mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); - if (mp == NULL) - return (NULL); - bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); - *ipic = (ipic_t *)mp->b_rptr; - (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; - (*ipic)->ipic_addr_offset = sizeof (ipic_t); - - bcopy(addr, *ipic + 1, addr_len); - - (*ipic)->ipic_addr_length = addr_len; - mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; - - mp1 = mkiocb(IP_IOCTL); - if (mp1 == NULL) { - freemsg(mp); - return (NULL); - } - mp1->b_cont = mp; - ioc = (struct iocblk *)mp1->b_rptr; - ioc->ioc_count = sizeof (ipic_t) + addr_len; - - return (mp1); -} - -/* * Generate a reset based on an inbound packet, connp is set by caller * when RST is in response to an unexpected inbound packet for which * there is active tcp state in the system. * * IPSEC NOTE : Try to send the reply with the same protection as it came - * in. We still have the ipsec_mp that the packet was attached to. 
Thus - * the packet will go out at the same level of protection as it came in by - * converting the IPSEC_IN to IPSEC_OUT. + * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. + * That way the packet will go out at the same level of protection as it + * came in with. */ static void -tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, - uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, + ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) { ipha_t *ipha = NULL; ip6_t *ip6h = NULL; ushort_t len; - tcph_t *tcph; + tcpha_t *tcpha; int i; - mblk_t *ipsec_mp; - boolean_t mctl_present; - ipic_t *ipic; ipaddr_t v4addr; in6_addr_t v6addr; - int addr_len; - void *addr; - queue_t *q = tcps->tcps_g_q; - tcp_t *tcp; - cred_t *cr; - pid_t pid; - mblk_t *nmp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - if (tcps->tcps_g_q == NULL) { - /* - * For non-zero stackids the default queue isn't created - * until the first open, thus there can be a need to send - * a reset before then. But we can't do that, hence we just - * drop the packet. Later during boot, when the default queue - * has been setup, a retransmitted packet from the peer - * will result in a reset. 
- */ - ASSERT(tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID); - freemsg(mp); - return; - } - - if (connp != NULL) - tcp = connp->conn_tcp; - else - tcp = Q_TO_TCP(q); + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; + ip_xmit_attr_t ixas, *ixa; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; + boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ + ushort_t port; if (!tcp_send_rst_chk(tcps)) { tcps->tcps_rst_unsent++; @@ -22107,16 +16886,41 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, return; } - if (mp->b_datap->db_type == M_CTL) { - ipsec_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; + /* + * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other + * options from the listener. In that case the caller must ensure that + * we are running on the listener = connp squeue. + * + * We get a safe copy of conn_ixa so we don't need to restore anything + * we or ip_output_simple might change in the ixa. + */ + if (connp != NULL) { + ASSERT(connp->conn_on_sqp); + + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + tcps->tcps_rst_unsent++; + freemsg(mp); + return; + } + need_refrele = B_TRUE; } else { - ipsec_mp = mp; - mctl_present = B_FALSE; + bzero(&ixas, sizeof (ixas)); + ixa = &ixas; + /* + * IXAF_VERIFY_SOURCE is overkill since we know the + * packet was for us. 
+ */ + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; + ixa->ixa_protocol = IPPROTO_TCP; + ixa->ixa_zoneid = ira->ira_zoneid; + ixa->ixa_ifindex = 0; + ixa->ixa_ipst = ipst; + ixa->ixa_cred = kcred; + ixa->ixa_cpid = NOPID; } - if (str && q && tcps->tcps_dbg) { + if (str && tcps->tcps_dbg) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " "flags 0x%x", @@ -22126,20 +16930,12 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, mblk_t *mp1 = copyb(mp); freemsg(mp); mp = mp1; - if (!mp) { - if (mctl_present) - freeb(ipsec_mp); - return; - } else { - if (mctl_present) { - ipsec_mp->b_cont = mp; - } else { - ipsec_mp = mp; - } - } + if (mp == NULL) + goto done; } else if (mp->b_cont) { freemsg(mp->b_cont); mp->b_cont = NULL; + DB_CKSUMFLAGS(mp) = 0; } /* * We skip reversing source route here. @@ -22159,18 +16955,20 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, */ if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || CLASSD(ipha->ipha_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } } else { ip6h = (ip6_t *)mp->b_rptr; if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } /* Remove any extension headers assuming partial overlay */ @@ -22185,13 +16983,13 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ip6h->ip6_nxt = IPPROTO_TCP; } } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if (tcph->th_flags[0] & TH_RST) { - freemsg(ipsec_mp); - return; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + if (tcpha->tha_flags & TH_RST) { + freemsg(mp); + goto done; } - tcph->th_offset_and_rsrvd[0] = (5 << 4); - len = ip_hdr_len + sizeof (tcph_t); + 
tcpha->tha_offset_and_reserved = (5 << 4); + len = ip_hdr_len + sizeof (tcpha_t); mp->b_wptr = &mp->b_rptr[len]; if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { ipha->ipha_length = htons(len); @@ -22201,108 +16999,79 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ipha->ipha_dst = v4addr; ipha->ipha_ident = 0; ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - addr_len = IP_ADDR_LEN; - addr = &v4addr; + ixa->ixa_flags |= IXAF_IS_IPV4; + ixa->ixa_ip_hdr_length = ip_hdr_len; } else { - /* No ip6i_t in this case */ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); /* Swap addresses */ v6addr = ip6h->ip6_src; ip6h->ip6_src = ip6h->ip6_dst; ip6h->ip6_dst = v6addr; ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; - addr_len = IPV6_ADDR_LEN; - addr = &v6addr; - } - tcp_xchg(tcph->th_fport, tcph->th_lport, 2); - U32_TO_BE32(ack, tcph->th_ack); - U32_TO_BE32(seq, tcph->th_seq); - U16_TO_BE16(0, tcph->th_win); - U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); - tcph->th_flags[0] = (uint8_t)ctl; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ira->ira_ruifindex; + } + ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; + } + ixa->ixa_pktlen = len; + + /* Swap the ports */ + port = tcpha->tha_fport; + tcpha->tha_fport = tcpha->tha_lport; + tcpha->tha_lport = port; + + tcpha->tha_ack = htonl(ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_win = 0; + tcpha->tha_sum = htons(sizeof (tcpha_t)); + tcpha->tha_flags = (uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } - /* IP trusts us to set up labels when required. 
*/ - if (is_system_labeled() && (cr = msg_getcred(mp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - int err; - - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) - err = tsol_check_label(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - else - err = tsol_check_label_v6(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - if (mctl_present) - ipsec_mp->b_cont = mp; - else - ipsec_mp = mp; - if (err != 0) { - freemsg(ipsec_mp); - return; - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - } else { - ip6h = (ip6_t *)mp->b_rptr; - } + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; } + ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ - if (mctl_present) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h, zoneid)) { - return; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the RST. + */ + if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + goto done; } + } else { + /* + * This is in clear. The RST message we are building + * here should go out in clear, independent of our policy. + */ + ixa->ixa_flags |= IXAF_NO_IPSEC; } - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - - /* Add the zoneid so ip_output routes it properly */ - if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) { - freemsg(ipsec_mp); - return; - } - ipsec_mp = nmp; /* * NOTE: one might consider tracing a TCP packet here, but * this function has no active TCP state and no tcp structure * that has a trace buffer. 
If we traced here, we would have * to keep a local trace buffer in tcp_record_trace(). - * - * TSol note: The mblk that contains the incoming packet was - * reused by tcp_xmit_listener_reset, so it already contains - * the right credentials and we don't need to call mblk_setcred. - * Also the conn's cred is not right since it is associated - * with tcps_g_q. */ - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); - /* - * Tell IP to mark the IRE used for this destination temporary. - * This way, we can limit our exposure to DoS attack because IP - * creates an IRE for each destination. If there are too many, - * the time to do any routing lookup will be extremely long. And - * the lookup can be in interrupt context. - * - * Note that in normal circumstances, this marking should not - * affect anything. It would be nice if only 1 message is - * needed to inform IP that the IRE created for this RST should - * not be added to the cache table. But there is currently - * not such communication mechanism between TCP and IP. So - * the best we can do now is to send the advice ioctl to IP - * to mark the IRE temporary. 
- */ - if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); + (void) ip_output_simple(mp, ixa); +done: + ixa_cleanup(ixa); + if (need_refrele) { + ASSERT(ixa != &ixas); + ixa_refrele(ixa); } } @@ -22313,9 +17082,11 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, static int tcp_xmit_end(tcp_t *tcp) { - ipic_t *ipic; - mblk_t *mp; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + iulp_t uinfo; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_state < TCPS_SYN_RCVD || tcp->tcp_state > TCPS_CLOSE_WAIT) { @@ -22337,7 +17108,7 @@ tcp_xmit_end(tcp_t *tcp) tcp->tcp_fss, B_FALSE, NULL, B_FALSE); if (mp) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } else { /* * Couldn't allocate msg. Pretend we got it out. @@ -22373,66 +17144,49 @@ tcp_xmit_end(tcp_t *tcp) return (0); /* - * NOTE: should not update if source routes i.e. if tcp_remote if - * different from the destination. + * We do not have a good algorithm to update ssthresh at this time. + * So don't do any update. + */ + bzero(&uinfo, sizeof (uinfo)); + uinfo.iulp_rtt = tcp->tcp_rtt_sa; + uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + + /* + * Note that uinfo is kept for conn_faddr in the DCE. Could update even + * if source routed but we don't. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { + if (connp->conn_ipversion == IPV4_VERSION) { + if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); + (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); } else { - if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, + uint_t ifindex; + + if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &tcp->tcp_ip6h->ip6_dst))) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - - /* Record route attributes in the IRE for use by future connections. */ - if (mp == NULL) - return (0); - - /* - * We do not have a good algorithm to update ssthresh at this time. - * So don't do any update. - */ - ipic->ipic_rtt = tcp->tcp_rtt_sa; - ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); - - return (0); -} + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { + ip_xmit_attr_t *ixa = connp->conn_ixa; -/* ARGSUSED */ -void -tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - mblk_t *mp1; - tcp_t *tcp = connp->conn_tcp; - tcp_xmit_reset_event_t *eventp; - - ASSERT(mp->b_datap->db_type == M_PROTO && - MBLKL(mp) == sizeof (tcp_xmit_reset_event_t)); + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (ixa->ixa_nce != NULL) { + ifindex = ixa->ixa_nce->nce_common->ncec_ill-> + ill_phyint->phyint_ifindex; + } else { + return (0); + } + } - if (tcp->tcp_state != TCPS_LISTEN) { - freemsg(mp); - return; + (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, + ipst); } - - mp1 = mp->b_cont; - mp->b_cont = NULL; - eventp = (tcp_xmit_reset_event_t *)mp->b_rptr; - ASSERT(eventp->tcp_xre_tcps->tcps_netstack == - connp->conn_netstack); - - tcp_xmit_listeners_reset(mp1, eventp->tcp_xre_iphdrlen, - eventp->tcp_xre_zoneid, eventp->tcp_xre_tcps, 
connp); - freemsg(mp); + return (0); } /* @@ -22442,45 +17196,25 @@ tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) * Note that we are reusing the incoming mp to construct the outgoing RST. */ void -tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, + conn_t *connp) { uchar_t *rptr; uint32_t seg_len; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; uint32_t seg_ack; uint_t flags; - mblk_t *ipsec_mp; ipha_t *ipha; ip6_t *ip6h; - boolean_t mctl_present = B_FALSE; - boolean_t check = B_TRUE; boolean_t policy_present; + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; TCP_STAT(tcps, tcp_no_listener); - ipsec_mp = mp; - - if (mp->b_datap->db_type == M_CTL) { - ipsec_in_t *ii; - - mctl_present = B_TRUE; - mp = mp->b_cont; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (ii->ipsec_in_dont_check) { - check = B_FALSE; - if (!ii->ipsec_in_secure) { - freeb(ipsec_mp); - mctl_present = B_FALSE; - ipsec_mp = mp; - } - } - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t *)mp->b_rptr; @@ -22491,41 +17225,39 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, ip6h = (ip6_t *)mp->b_rptr; } - if (check && policy_present) { + if (policy_present) { /* * The conn_t parameter is NULL because we already know * nobody's home. 
*/ - ipsec_mp = ipsec_check_global_policy( - ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present, - tcps->tcps_netstack); - if (ipsec_mp == NULL) + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, + ira, ns); + if (mp == NULL) return; } - if (is_system_labeled() && !tsol_can_reply_error(mp)) { + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { DTRACE_PROBE2( tx__ip__log__error__nolistener__tcp, char *, "Could not reply with RST to mp(1)", mblk_t *, mp); ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); - freemsg(ipsec_mp); + freemsg(mp); return; } rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = BE32_TO_U32(tcph->th_seq); - seg_ack = BE32_TO_U32(tcph->th_ack); - flags = tcph->th_flags[0]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); + flags = tcpha->tha_flags; - seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); + seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); if (flags & TH_RST) { - freemsg(ipsec_mp); + freemsg(mp); } else if (flags & TH_ACK) { - tcp_xmit_early_reset("no tcp, reset", - ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps, - connp); + tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, + ira, ipst, connp); } else { if (flags & TH_SYN) { seg_len++; @@ -22537,14 +17269,13 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, * segment is neither. Just drop it on the * floor. 
*/ - freemsg(ipsec_mp); + freemsg(mp); tcps->tcps_rst_unsent++; return; } - tcp_xmit_early_reset("no tcp, reset/ack", - ipsec_mp, 0, seg_seq + seg_len, - TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps, connp); + tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, + seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); } } @@ -22573,14 +17304,16 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, mblk_t *mp1; mblk_t *mp2; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; int32_t num_sack_blk = 0; int32_t sack_opt_len = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* Allocate for our maximum TCP header + link-level */ - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, + BPRI_MED); if (!mp1) return (NULL); data_length = 0; @@ -22646,15 +17379,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; - U32_TO_ABE32(seq, tcph->th_seq); + mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; + tcpha->tha_seq = htonl(seq); /* * Use tcp_unsent to determine if the PUSH bit should be used assumes @@ -22729,14 +17461,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[0] = TCPOPT_MAXSEG; wptr[1] = TCPOPT_MAXSEG_LEN; wptr += 2; - u1 = tcp->tcp_if_mtu - - (tcp->tcp_ipversion == IPV4_VERSION ? 
+ u1 = tcp->tcp_initial_pmtu - + (connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH; U16_TO_BE16(u1, wptr); mp1->b_wptr = wptr + 2; /* Update the offset to cover the additional word */ - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); /* * Note that the following way of filling in @@ -22763,7 +17495,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, ASSERT(tcp->tcp_ts_recent == 0); U32_TO_BE32(0L, wptr); mp1->b_wptr += TCPOPT_REAL_TS_LEN; - tcph->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += (3 << 4); } @@ -22819,7 +17551,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_WS_LEN; wptr[3] = (uchar_t)tcp->tcp_rcv_ws; mp1->b_wptr += TCPOPT_REAL_WS_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } if (tcp->tcp_snd_sack_ok) { @@ -22829,7 +17561,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_SACK_PERMITTED; wptr[3] = TCPOPT_SACK_OK_LEN; mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } /* allocb() of adequate mblk assures space */ @@ -22840,9 +17572,9 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, * Get IP set to checksum on our behalf * Include the adjustment for a source route if any. 
*/ - u1 += tcp->tcp_sum; + u1 += connp->conn_sum; u1 = (u1 >> 16) + (u1 & 0xFFFF); - U16_TO_BE16(u1, tcph->th_sum); + tcpha->tha_sum = htons(u1); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && @@ -22878,10 +17610,10 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, u1 < (uint32_t)(64 * 1024)) { flags |= TH_URG; BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); - U32_TO_ABE16(u1, tcph->th_urp); + tcpha->tha_urp = htons(u1); } } - tcph->th_flags[0] = (uchar_t)flags; + tcpha->tha_flags = (uchar_t)flags; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -22890,14 +17622,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } } if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -22915,33 +17647,34 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); data_length += (int)(mp1->b_wptr - rptr); - if (tcp->tcp_ipversion == IPV4_VERSION) { + + ixa->ixa_pktlen = data_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { ((ipha_t *)rptr)->ipha_length = htons(data_length); } else { - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 
- sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(data_length - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); } /* * Prime pump for IP * Include the adjustment for a source route if any. */ - data_length -= tcp->tcp_ip_hdr_len; - data_length += tcp->tcp_sum; + data_length -= ixa->ixa_ip_hdr_length; + data_length += connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23012,7 +17745,7 @@ tcp_ack_timer(void *arg) BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } } @@ -23023,6 +17756,7 @@ tcp_ack_mp(tcp_t *tcp) { uint32_t seq_no; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * There are a few cases to be considered while setting the sequence no. 
@@ -23058,12 +17792,13 @@ tcp_ack_mp(tcp_t *tcp) /* Generate a simple ACK */ int data_length; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; mblk_t *mp1; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; int32_t num_sack_blk = 0; int32_t sack_opt_len; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Allocate space for TCP + IP headers @@ -23074,34 +17809,34 @@ tcp_ack_mp(tcp_t *tcp) tcp->tcp_num_sack_blk); sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; - tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; + total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; } else { - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } - mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp1) return (NULL); /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); /* copy in prototype TCP + IP header */ rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp_hdr_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); + mp1->b_wptr = rptr + total_hdr_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; /* Set the TCP sequence number. */ - U32_TO_ABE32(seq_no, tcph->th_seq); + tcpha->tha_seq = htonl(seq_no); /* Set up the TCP flag field. 
*/ - tcph->th_flags[0] = (uchar_t)TH_ACK; + tcpha->tha_flags = (uchar_t)TH_ACK; if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -23111,14 +17846,15 @@ tcp_ack_mp(tcp_t *tcp) uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -23136,34 +17872,33 @@ tcp_ack_mp(tcp_t *tcp) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) - << 4); + tcpha->tha_offset_and_reserved += + ((num_sack_blk * 2 + 1) << 4); } - if (tcp->tcp_ipversion == IPV4_VERSION) { - ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); + ixa->ixa_pktlen = total_hdr_len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); } else { - /* Check for ip6i_t header in sticky hdrs */ - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? - sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } /* * Prime pump for checksum calculation in IP. Include the * adjustment for a source route if any. 
*/ - data_length = tcp_tcp_hdr_len + tcp->tcp_sum; + data_length = tcp_hdr_len + connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23183,6 +17918,8 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) tcp_t **tcpp; tcp_t *tcpnext; tcp_t *tcphash; + conn_t *connp = tcp->tcp_connp; + conn_t *connext; if (tcp->tcp_ptpbhn != NULL) { ASSERT(!caller_holds_lock); @@ -23199,7 +17936,7 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) if (tcphash != NULL) { /* Look for an entry using the same port */ while ((tcphash = tcpp[0]) != NULL && - tcp->tcp_lport != tcphash->tcp_lport) + connp->conn_lport != tcphash->tcp_connp->conn_lport) tcpp = &(tcphash->tcp_bind_hash); /* The port was not found, just add to the end */ @@ -23219,14 +17956,19 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) * INADDR_ANY. 
*/ tcpnext = tcphash; + connext = tcpnext->tcp_connp; tcphash = NULL; - if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { - while ((tcpnext = tcpp[0]) != NULL && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) - tcpp = &(tcpnext->tcp_bind_hash_port); - - if (tcpnext) { + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { + while ((tcpnext = tcpp[0]) != NULL) { + connext = tcpnext->tcp_connp; + if (!V6_OR_V4_INADDR_ANY( + connext->conn_bound_addr_v6)) + tcpp = &(tcpnext->tcp_bind_hash_port); + else + break; + } + if (tcpnext != NULL) { tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; tcphash = tcpnext->tcp_bind_hash; if (tcphash != NULL) { @@ -23263,6 +18005,7 @@ tcp_bind_hash_remove(tcp_t *tcp) tcp_t *tcpnext; kmutex_t *lockp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_ptpbhn == NULL) return; @@ -23271,8 +18014,9 @@ tcp_bind_hash_remove(tcp_t *tcp) * Extract the lock pointer in case there are concurrent * hash_remove's for this instance. 
*/ - ASSERT(tcp->tcp_lport != 0); - lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; + ASSERT(connp->conn_lport != 0); + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)].tf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -23548,7 +18292,7 @@ tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, *sys_errorp = 0; *do_disconnectp = 0; - error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, + error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, opt_offset, cr, &tcp_opt_obj, NULL, &is_absreq_failure); @@ -23663,238 +18407,6 @@ tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) return (0); } -/* ARGSUSED */ -static int -tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) -{ - bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); - return (0); -} - -/* - * Make sure we wait until the default queue is setup, yet allow - * tcp_g_q_create() to open a TCP stream. - * We need to allow tcp_g_q_create() do do an open - * of tcp, hence we compare curhread. - * All others have to wait until the tcps_g_q has been - * setup. 
- */ -void -tcp_g_q_setup(tcp_stack_t *tcps) -{ - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - if (tcps->tcps_g_q_creator == NULL) { - /* This thread will set it up */ - tcps->tcps_g_q_creator = curthread; - mutex_exit(&tcps->tcps_g_q_lock); - tcp_g_q_create(tcps); - mutex_enter(&tcps->tcps_g_q_lock); - ASSERT(tcps->tcps_g_q_creator == curthread); - tcps->tcps_g_q_creator = NULL; - cv_signal(&tcps->tcps_g_q_cv); - ASSERT(tcps->tcps_g_q != NULL); - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - /* Everybody but the creator has to wait */ - if (tcps->tcps_g_q_creator != curthread) { - while (tcps->tcps_g_q == NULL) - cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock); - } - mutex_exit(&tcps->tcps_g_q_lock); -} - -#define IP "ip" - -#define TCP6DEV "/devices/pseudo/tcp6@0:tcp6" - -/* - * Create a default tcp queue here instead of in strplumb - */ -void -tcp_g_q_create(tcp_stack_t *tcps) -{ - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - int rval; - cred_t *cr; - major_t IP_MAJ; - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_create()\n"); -#endif - - IP_MAJ = ddi_name_to_major(IP); - - ASSERT(tcps->tcps_g_q_creator == curthread); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = zone_get_kcred(netstackid_to_zoneid( - tcps->tcps_netstack->netstack_stackid)); - ASSERT(cr != NULL); - /* - * We set the tcp default queue to IPv6 because IPv4 falls - * back to IPv6 when it can't find a client, but - * IPv6 does not fall back to IPv4. 
- */ - error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: open of TCP6DEV failed error %d\n", - error); -#endif - goto out; - } - - /* - * This ioctl causes the tcp framework to cache a pointer to - * this stream, so we don't want to close the stream after - * this operation. - * Use the kernel credentials that are for the zone we're in. - */ - error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, - (intptr_t)0, FKIOCTL, cr, &rval); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed " - "error %d\n", error); -#endif - goto out; - } - tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */ - lh = NULL; -out: - /* Close layered handles */ - if (li) - ldi_ident_release(li); - /* Keep cred around until _inactive needs it */ - tcps->tcps_g_q_cr = cr; -} - -/* - * We keep tcp_g_q set until all other tcp_t's in the zone - * has gone away, and then when tcp_g_q_inactive() is called - * we clear it. - */ -void -tcp_g_q_destroy(tcp_stack_t *tcps) -{ -#ifdef NS_DEBUG - (void) printf("tcp_g_q_destroy()for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - - if (tcps->tcps_g_q == NULL) { - return; /* Nothing to cleanup */ - } - /* - * Drop reference corresponding to the default queue. - * This reference was added from tcp_open when the default queue - * was created, hence we compensate for this extra drop in - * tcp_g_q_close. If the refcnt drops to zero here it means - * the default queue was the last one to be open, in which - * case, then tcp_g_q_inactive will be - * called as a result of the refrele. - */ - TCPS_REFRELE(tcps); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * Run by tcp_q_q_inactive using a taskq. 
- */ -static void -tcp_g_q_close(void *arg) -{ - tcp_stack_t *tcps = arg; - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - cred_t *cr; - major_t IP_MAJ; - - IP_MAJ = ddi_name_to_major(IP); - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n", - tcps->tcps_netstack->netstack_stackid, - tcps->tcps_netstack->netstack_refcnt); -#endif - lh = tcps->tcps_g_q_lh; - if (lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 1); - ASSERT(tcps->tcps_g_q != NULL); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_inactive: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = tcps->tcps_g_q_cr; - tcps->tcps_g_q_cr = NULL; - ASSERT(cr != NULL); - - /* - * Make sure we can break the recursion when tcp_close decrements - * the reference count causing g_q_inactive to be called again. - */ - tcps->tcps_g_q_lh = NULL; - - /* close the default queue */ - (void) ldi_close(lh, FREAD|FWRITE, cr); - /* - * At this point in time tcps and the rest of netstack_t might - * have been deleted. - */ - tcps = NULL; - - /* Close layered handles */ - ldi_ident_release(li); - crfree(cr); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * - * Have to ensure that the ldi routines are not used by an - * interrupt thread by using a taskq. 
- */ -void -tcp_g_q_inactive(tcp_stack_t *tcps) -{ - if (tcps->tcps_g_q_lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 0); - TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */ - - if (servicing_interrupt()) { - (void) taskq_dispatch(tcp_taskq, tcp_g_q_close, - (void *) tcps, TQ_SLEEP); - } else { - tcp_g_q_close(tcps); - } -} - /* * Called by IP when IP is loaded into the kernel */ @@ -23909,10 +18421,6 @@ tcp_ddi_g_init(void) sizeof (tcp_sack_info_t), 0, tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); - tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", - TCP_MAX_COMBINED_HEADER_LENGTH, 0, - tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); - mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); /* Initialize the random number generator */ @@ -23923,9 +18431,6 @@ tcp_ddi_g_init(void) tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); - tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, - TASKQ_PREPOPULATE); - tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); /* @@ -23933,8 +18438,7 @@ tcp_ddi_g_init(void) * destroyed in the kernel, so we can maintain the * set of tcp_stack_t's. 
*/ - netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown, - tcp_stack_fini); + netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); } @@ -23956,8 +18460,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_netstack = ns; /* Initialize locks */ - mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); @@ -24018,6 +18520,11 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) major = mod_name_to_major(INET_NAME); error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); ASSERT(error == 0); + tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); + ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); + cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); + return (tcps); } @@ -24035,22 +18542,8 @@ tcp_ddi_g_destroy(void) kmem_cache_destroy(tcp_timercache); kmem_cache_destroy(tcp_sack_info_cache); - kmem_cache_destroy(tcp_iphc_cache); netstack_unregister(NS_TCP); - taskq_destroy(tcp_taskq); -} - -/* - * Shut down the TCP stack instance. 
- */ -/* ARGSUSED */ -static void -tcp_stack_shutdown(netstackid_t stackid, void *arg) -{ - tcp_stack_t *tcps = (tcp_stack_t *)arg; - - tcp_g_q_destroy(tcps); } /* @@ -24062,17 +18555,16 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcp_stack_t *tcps = (tcp_stack_t *)arg; int i; + freeb(tcps->tcps_ixa_cleanup_mp); + tcps->tcps_ixa_cleanup_mp = NULL; + cv_destroy(&tcps->tcps_ixa_cleanup_cv); + mutex_destroy(&tcps->tcps_ixa_cleanup_lock); + nd_free(&tcps->tcps_g_nd); kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); tcps->tcps_params = NULL; kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); tcps->tcps_wroff_xtra_param = NULL; - kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_head_param = NULL; - kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_tail_param = NULL; - kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_max_pbufs_param = NULL; for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); @@ -24091,8 +18583,6 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcps->tcps_acceptor_fanout = NULL; mutex_destroy(&tcps->tcps_iss_key_lock); - mutex_destroy(&tcps->tcps_g_q_lock); - cv_destroy(&tcps->tcps_g_q_cv); mutex_destroy(&tcps->tcps_epriv_port_lock); ip_drop_unregister(&tcps->tcps_dropper); @@ -24120,6 +18610,7 @@ tcp_iss_init(tcp_t *tcp) struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; uint32_t answer[4]; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); tcp->tcp_iss = tcps->tcps_iss_incr_extra; @@ -24128,16 +18619,9 @@ tcp_iss_init(tcp_t *tcp) mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &arg.src); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, - 
&arg.dst); - } else { - arg.src = tcp->tcp_ip6h->ip6_src; - arg.dst = tcp->tcp_ip6h->ip6_dst; - } + arg.ports = connp->conn_ports; + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; @@ -24220,27 +18704,16 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; cl_tcpi.cl_tcpi_version = CL_TCPI_V1; - cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; + cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; cl_tcpi.cl_tcpi_state = tcp->tcp_state; - cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; - cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; - /* - * The macros tcp_laddr and tcp_faddr give the IPv4 - * addresses. They are copied implicitly below as - * mapped addresses. - */ - cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - cl_tcpi.cl_tcpi_faddr = - tcp->tcp_ipha->ipha_dst; - } else { - cl_tcpi.cl_tcpi_faddr_v6 = - tcp->tcp_ip6h->ip6_dst; - } + cl_tcpi.cl_tcpi_lport = connp->conn_lport; + cl_tcpi.cl_tcpi_fport = connp->conn_fport; + cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; + cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; /* * If the callback returns non-zero @@ -24302,35 +18775,35 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, /* * Check if a tcp structure matches the info in acp. */ -#define TCP_AC_ADDR_MATCH(acp, tcp) \ +#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ (((acp)->ac_local.ss_family == AF_INET) ? 
\ ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ - TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ + TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ - TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ + TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ (TCP_AC_V4LPORT((acp)) == 0 || \ - TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V4RPORT((acp)) == 0 || \ - TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ - (acp)->ac_end >= (tcp)->tcp_state) : \ + TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ + (acp)->ac_end >= (tcp)->tcp_state) : \ ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ - &(tcp)->tcp_ip_src_v6)) && \ + &(connp)->conn_laddr_v6)) && \ (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ - &(tcp)->tcp_remote_v6)) && \ + &(connp)->conn_faddr_v6)) && \ (TCP_AC_V6LPORT((acp)) == 0 || \ - TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V6RPORT((acp)) == 0 || \ - TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ + TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ (acp)->ac_end >= (tcp)->tcp_state)) -#define TCP_AC_MATCH(acp, tcp) \ +#define TCP_AC_MATCH(acp, connp, tcp) \ (((acp)->ac_zoneid == ALL_ZONES || \ - (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ - TCP_AC_ADDR_MATCH(acp, tcp) : 0) + (acp)->ac_zoneid == (connp)->conn_zoneid) ? 
\ + TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) /* * Build a message containing a tcp_ioc_abort_conn_t structure @@ -24346,8 +18819,6 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (mp == NULL) return (NULL); - mp->b_datap->db_type = M_CTL; - *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + sizeof (uint32_t)); @@ -24359,17 +18830,17 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (acp->ac_local.ss_family == AF_INET) { tacp->ac_local.ss_family = AF_INET; tacp->ac_remote.ss_family = AF_INET; - TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; - TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; - TCP_AC_V4LPORT(tacp) = tp->tcp_lport; - TCP_AC_V4RPORT(tacp) = tp->tcp_fport; + TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; + TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; + TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; } else { tacp->ac_local.ss_family = AF_INET6; tacp->ac_remote.ss_family = AF_INET6; - TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; - TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; - TCP_AC_V6LPORT(tacp) = tp->tcp_lport; - TCP_AC_V6RPORT(tacp) = tp->tcp_fport; + TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; + TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; + TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; } mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); return (mp); @@ -24419,14 +18890,32 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) } /* - * Called inside tcp_rput when a message built using + * Called using SQ_FILL when a message built using * tcp_ioctl_abort_build_msg is put into a queue. * Note that when we get here there is no wildcard in acp any more. 
*/ +/* ARGSUSED2 */ static void -tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) +tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { - tcp_ioc_abort_conn_t *acp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + tcp_ioc_abort_conn_t *acp; + + /* + * Don't accept any input on a closed tcp as this TCP logically does + * not exist on the system. Don't proceed further with this TCP. + * For eg. this packet could trigger another close of this tcp + * which would be disastrous for tcp_refcnt. tcp_close_detached / + * tcp_clean_death / tcp_closei_local must be called at most once + * on a TCP. + */ + if (tcp->tcp_state == TCPS_CLOSED || + tcp->tcp_state == TCPS_BOUND) { + freemsg(mp); + return; + } acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); if (tcp->tcp_state <= acp->ac_end) { @@ -24468,12 +18957,17 @@ startover: for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { tcp = tconnp->conn_tcp; - if (TCP_AC_MATCH(acp, tcp)) { - CONN_INC_REF(tcp->tcp_connp); + /* + * We are missing a check on sin6_scope_id for linklocals here, + * but current usage is just for aborting based on zoneid + * for shared-IP zones. 
+ */ + if (TCP_AC_MATCH(acp, tconnp, tcp)) { + CONN_INC_REF(tconnp); mp = tcp_ioctl_abort_build_msg(acp, tcp); if (mp == NULL) { err = ENOMEM; - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(tconnp); break; } mp->b_prev = (mblk_t *)tcp; @@ -24501,8 +18995,9 @@ startover: listhead = listhead->b_next; tcp = (tcp_t *)mp->b_prev; mp->b_next = mp->b_prev = NULL; - SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input, - tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET); + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, + tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_ABORT_BUCKET); } *count += nmatch; @@ -24669,7 +19164,7 @@ out: */ void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, - uint32_t seg_ack, int seg_len, tcph_t *tcph) + uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -24677,17 +19172,18 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, tcp_opt_t tcpopt; uint_t flags; uint32_t new_swnd = 0; - conn_t *connp; + conn_t *nconnp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - flags = (unsigned int)tcph->th_flags[0] & 0xFF; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + flags = (unsigned int)tcpha->tha_flags & 0xFF; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 
0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); goto done; @@ -24770,17 +19266,10 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; + arg.ports = connp->conn_ports; /* We use MAPPED addresses in tcp_iss_init */ - arg.src = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED( - tcp->tcp_ipha->ipha_dst, - &arg.dst); - } else { - arg.dst = - tcp->tcp_ip6h->ip6_dst; - } + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); @@ -24813,21 +19302,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, */ if (tcp_clean_death(tcp, 0, 27) == -1) goto done; - /* - * We will come back to tcp_rput_data - * on the global queue. Packets destined - * for the global queue will be checked - * with global policy. But the policy for - * this packet has already been checked as - * this was destined for the detached - * connection. We need to bypass policy - * check this time by attaching a dummy - * ipsec_in with ipsec_in_dont_check set. 
- */ - connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) { + nconnp = ipcl_classify(mp, ira, ipst); + if (nconnp != NULL) { TCP_STAT(tcps, tcp_time_wait_syn_success); - tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); + /* Drops ref on nconnp */ + tcp_reinput(nconnp, mp, ira, ipst); return; } goto done; @@ -24905,11 +19384,6 @@ process_ack: tcp->tcp_rnxt, TH_ACK); } done: - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - TCP_STAT(tcps, tcp_time_wait_syn_fail); - } freemsg(mp); } @@ -24965,11 +19439,12 @@ tcp_timer_callback(void *arg) tcpt = (tcp_timer_t *)mp->b_rptr; connp = tcpt->connp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, - SQ_FILL, SQTAG_TCP_TIMER); + NULL, SQ_FILL, SQTAG_TCP_TIMER); } +/* ARGSUSED */ static void -tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) +tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { tcp_timer_t *tcpt; conn_t *connp = (conn_t *)arg; @@ -24983,7 +19458,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) * If the TCP has reached the closed state, don't proceed any * further. This TCP logically does not exist on the system. * tcpt_proc could for example access queues, that have already - * been qprocoff'ed off. Also see comments at the start of tcp_input + * been qprocoff'ed off. 
*/ if (tcp->tcp_state != TCPS_CLOSED) { (*tcpt->tcpt_proc)(connp); @@ -25148,26 +19623,9 @@ tcp_setqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_TRUE); - tcp->tcp_flow_stopped = B_TRUE; - } else { - queue_t *q = tcp->tcp_wq; - - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); - if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - tcp->tcp_flow_stopped = B_TRUE; - mutex_exit(QLOCK(q)); - TCP_STAT(tcps, tcp_flwctl_on); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_setqfull(connp, &tcp->tcp_flow_stopped); + if (tcp->tcp_flow_stopped) + TCP_STAT(tcps, tcp_flwctl_on); } void @@ -25177,27 +19635,7 @@ tcp_clrqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_FALSE); - tcp->tcp_flow_stopped = B_FALSE; - } else { - queue_t *q = tcp->tcp_wq; - - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); - if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - tcp->tcp_flow_stopped = B_FALSE; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_clrqfull(connp, &tcp->tcp_flow_stopped); } /* @@ -25246,10 +19684,7 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) tcp_stat_t template = { { "tcp_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, - { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, - { "tcp_ip_output", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 }, { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, @@ -25287,37 +19722,14 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) { 
"tcp_timermp_freed", KSTAT_DATA_UINT64 }, { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ire_null1", KSTAT_DATA_UINT64 }, - { "tcp_ire_null", KSTAT_DATA_UINT64 }, - { "tcp_ip_send", KSTAT_DATA_UINT64 }, - { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, - { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, - { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, @@ -25490,7 +19902,7 @@ tcp_kstat_update(kstat_t *kp, int rw) connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; switch (tcp_snmp_state(tcp)) { case 
MIB2_TCP_established: @@ -25565,48 +19977,6 @@ tcp_kstat_update(kstat_t *kp, int rw) return (0); } -void -tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) -{ - uint16_t hdr_len; - ipha_t *ipha; - uint8_t *nexthdrp; - tcph_t *tcph; - tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - - /* Already has an eager */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - TCP_STAT(tcps, tcp_reinput_syn); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER); - return; - } - - switch (IPH_HDR_VERSION(mp->b_rptr)) { - case IPV4_VERSION: - ipha = (ipha_t *)mp->b_rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - break; - case IPV6_VERSION: - if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, - &hdr_len, &nexthdrp)) { - CONN_DEC_REF(connp); - freemsg(mp); - return; - } - break; - } - - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - } - - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_REINPUT); -} - static int tcp_squeue_switch(int val) { @@ -25653,278 +20023,20 @@ tcp_squeue_add(squeue_t *sqp) tcp_time_wait->tcp_free_list_cnt = 0; } -static int -tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error, cred_t *cr, pid_t pid) +/* + * On a labeled system we have some protocols above TCP, such as RPC, which + * appear to assume that every mblk in a chain has a db_credp. + */ +static void +tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *ire_mp = NULL; - mblk_t *syn_mp; - mblk_t *mdti; - mblk_t *lsoi; - int retval; - tcph_t *tcph; - cred_t *ecr; - ts_label_t *tsl; - uint32_t mss; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (error == 0) { - /* - * Adapt Multidata information, if any. The - * following tcp_mdt_update routine will free - * the message. 
- */ - if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) { - tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> - b_rptr)->mdt_capab, B_TRUE); - freemsg(mdti); - } - - /* - * Check to update LSO information with tcp, and - * tcp_lso_update routine will free the message. - */ - if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) { - tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> - b_rptr)->lso_capab); - freemsg(lsoi); - } - - /* Get the IRE, if we had requested for it */ - if (mp != NULL) - ire_mp = tcp_ire_mp(&mp); - - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - CL_INET_CONNECT(tcp->tcp_connp, tcp, B_TRUE, retval); - if (retval != 0) { - error = EADDRINUSE; - goto bind_failed; - } - } else { - if (ire_mp != NULL) - freeb(ire_mp); - goto after_syn_sent; - } - - retval = tcp_adapt_ire(tcp, ire_mp); - if (ire_mp != NULL) - freeb(ire_mp); - if (retval == 0) { - error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? - ENETUNREACH : EADDRNOTAVAIL); - goto ipcl_rm; - } - /* - * Don't let an endpoint connect to itself. - * Also checked in tcp_connect() but that - * check can't handle the case when the - * local IP address is INADDR_ANY. - */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if ((tcp->tcp_ipha->ipha_dst == - tcp->tcp_ipha->ipha_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } else { - if (IN6_ARE_ADDR_EQUAL( - &tcp->tcp_ip6h->ip6_dst, - &tcp->tcp_ip6h->ip6_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } - ASSERT(tcp->tcp_state == TCPS_SYN_SENT); - /* - * This should not be possible! Just for - * defensive coding... 
- */ - if (tcp->tcp_state != TCPS_SYN_SENT) - goto after_syn_sent; - - if (is_system_labeled() && - !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { - error = EHOSTUNREACH; - goto ipcl_rm; - } - - /* - * tcp_adapt_ire() does not adjust - * for TCP/IP header length. - */ - mss = tcp->tcp_mss - tcp->tcp_hdr_len; - - /* - * Just make sure our rwnd is at - * least tcp_recv_hiwat_mss * MSS - * large, and round up to the nearest - * MSS. - * - * We do the round up here because - * we need to get the interface - * MTU first before we can do the - * round up. - */ - tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), - tcps->tcps_recv_hiwat_minmss * mss); - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; - tcp_set_ws_value(tcp); - U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), - tcp->tcp_tcph->th_win); - if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) - tcp->tcp_snd_ws_ok = B_TRUE; - - /* - * Set tcp_snd_ts_ok to true - * so that tcp_xmit_mp will - * include the timestamp - * option in the SYN segment. - */ - if (tcps->tcps_tstamp_always || - (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { - tcp->tcp_snd_ts_ok = B_TRUE; - } - - /* - * tcp_snd_sack_ok can be set in - * tcp_adapt_ire() if the sack metric - * is set. So check it here also. - */ - if (tcps->tcps_sack_permitted == 2 || - tcp->tcp_snd_sack_ok) { - if (tcp->tcp_sack_info == NULL) { - tcp->tcp_sack_info = - kmem_cache_alloc(tcp_sack_info_cache, - KM_SLEEP); - } - tcp->tcp_snd_sack_ok = B_TRUE; - } + ASSERT(is_system_labeled()); + ASSERT(ira->ira_cred != NULL); - /* - * Should we use ECN? Note that the current - * default value (SunOS 5.9) of tcp_ecn_permitted - * is 1. The reason for doing this is that there - * are equipments out there that will drop ECN - * enabled IP packets. Setting it to 1 avoids - * compatibility problems. 
- */ - if (tcps->tcps_ecn_permitted == 2) - tcp->tcp_ecn_ok = B_TRUE; - - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, - tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (syn_mp) { - /* - * cr contains the cred from the thread calling - * connect(). - * - * If no thread cred is available, use the - * socket creator's cred instead. If still no - * cred, drop the request rather than risk a - * panic on production systems. - */ - if (cr == NULL) { - cr = CONN_CRED(connp); - pid = tcp->tcp_cpid; - ASSERT(cr != NULL); - if (cr != NULL) { - mblk_setcred(syn_mp, cr, pid); - } else { - error = ECONNABORTED; - goto ipcl_rm; - } - - /* - * If an effective security label exists for - * the connection, create a copy of the thread's - * cred but with the effective label attached. - */ - } else if (is_system_labeled() && - connp->conn_effective_cred != NULL && - (tsl = crgetlabel(connp-> - conn_effective_cred)) != NULL) { - if ((ecr = copycred_from_tslabel(cr, - tsl, KM_NOSLEEP)) == NULL) { - error = ENOMEM; - goto ipcl_rm; - } - mblk_setcred(syn_mp, ecr, pid); - crfree(ecr); - - /* - * Default to using the thread's cred unchanged. - */ - } else { - mblk_setcred(syn_mp, cr, pid); - } - - /* - * We must bump the generation before sending the syn - * to ensure that we use the right generation in case - * this thread issues a "connected" up call. - */ - SOCK_CONNID_BUMP(tcp->tcp_connid); - - tcp_send_data(tcp, tcp->tcp_wq, syn_mp); - } - after_syn_sent: - if (mp != NULL) { - ASSERT(mp->b_cont == NULL); - freeb(mp); - } - return (error); - } else { - /* error */ - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_post_ip_bind: error == %d", error); - } - if (mp != NULL) { - freeb(mp); - } + while (mp != NULL) { + mblk_setcred(mp, ira->ira_cred, NOPID); + mp = mp->b_cont; } - -ipcl_rm: - /* - * Need to unbind with classifier since we were just - * told that our bind succeeded. a.k.a error == 0 at the entry. 
- */ - tcp->tcp_hard_bound = B_FALSE; - tcp->tcp_hard_binding = B_FALSE; - - ipcl_hash_remove(connp); - -bind_failed: - tcp->tcp_state = TCPS_IDLE; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = 0; - else - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - /* - * Copy of the src addr. in tcp_t is needed since - * the lookup funcs. can only look at tcp_t - */ - V6_SET_ZERO(tcp->tcp_ip_src_v6); - - tcph = tcp->tcp_tcph; - tcph->th_lport[0] = 0; - tcph->th_lport[1] = 0; - tcp_bind_hash_remove(tcp); - bzero(&connp->u_port, sizeof (connp->u_port)); - /* blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - - conn_delete_ire(tcp->tcp_connp, NULL); - - return (error); } static int @@ -25936,16 +20048,16 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, boolean_t user_specified; in_port_t allocated_port; in_port_t requested_port = *requested_port_ptr; - conn_t *connp; + conn_t *connp = tcp->tcp_connp; zone_t *zone; tcp_stack_t *tcps = tcp->tcp_tcps; - in6_addr_t v6addr = tcp->tcp_ip_src_v6; + in6_addr_t v6addr = connp->conn_laddr_v6; /* * XXX It's up to the caller to specify bind_to_req_port_only or not. */ - if (cr == NULL) - cr = tcp->tcp_cred; + ASSERT(cr != NULL); + /* * Get a valid port (within the anonymous range and should not * be a privileged one) to use if the user has not given a port. @@ -25961,7 +20073,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlptype = mlptSingle; mlp_port = requested_port; if (requested_port == 0) { - requested_port = tcp->tcp_anon_priv_bind ? + requested_port = connp->conn_anon_priv_bind ? tcp_get_next_priv_port(tcp) : tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); @@ -25975,7 +20087,6 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, * this socket and RPC is MLP in this zone, then give him an * anonymous MLP. 
*/ - connp = tcp->tcp_connp; if (connp->conn_anon_mlp && is_system_labeled()) { zone = crgetzone(cr); addrtype = tsol_mlp_addr_type( @@ -26016,7 +20127,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (priv) { if (secpolicy_net_privaddr(cr, requested_port, IPPROTO_TCP) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for port %d", @@ -26044,7 +20155,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (mlptype != mlptSingle) { if (secpolicy_net_bindmlp(cr) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for multilevel port %d", @@ -26068,7 +20179,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlpzone = tsol_mlp_findzone(IPPROTO_TCP, htons(mlp_port)); if (connp->conn_zoneid != mlpzone) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: attempt to bind port " @@ -26083,10 +20194,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (!user_specified) { int err; - err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, requested_port, B_TRUE); if (err != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: cannot establish anon " @@ -26101,17 +20212,18 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, } allocated_port = tcp_bindi(tcp, requested_port, &v6addr, - tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); + connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, + user_specified); if (allocated_port == 0) { connp->conn_mlp_type = mlptSingle; if (connp->conn_anon_port) { connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + (void) tsol_mlp_anon(zone, mlptype, 
connp->conn_proto, requested_port, B_FALSE); } if (bind_to_req_port_only) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: requested addr busy"); @@ -26119,7 +20231,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (-TADDRBUSY); } else { /* If we are out of ports, fail the bind. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: out of ports?"); @@ -26133,6 +20245,9 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (0); } +/* + * Check the address and check/pick a local port number. + */ static int tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, boolean_t bind_to_req_port_only) @@ -26140,18 +20255,22 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; sin_t *sin; sin6_t *sin6; - in_port_t requested_port; + in_port_t requested_port; ipaddr_t v4addr; in6_addr_t v6addr; - uint_t ipversion; - int error = 0; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint_t scopeid = 0; + int error = 0; + ip_xmit_attr_t *ixa = connp->conn_ixa; ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); if (tcp->tcp_state == TCPS_BOUND) { return (0); } else if (tcp->tcp_state > TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26161,7 +20280,7 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, ASSERT(sa != NULL && len != 0); if (!OK_32PTR((char *)sa)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address parameter, " @@ -26171,38 +20290,48 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, 
cred_t *cr, return (-TPROTO); } + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + switch (len) { case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; - /* - * With sockets sockfs will accept bogus sin_family in - * bind() and replace it with the family used in the socket - * call. - */ - if (sin->sin_family != AF_INET || - tcp->tcp_family != AF_INET) { - return (EAFNOSUPPORT); - } requested_port = ntohs(sin->sin_port); - ipversion = IPV4_VERSION; v4addr = sin->sin_addr.s_addr; IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, + B_FALSE); + } break; case sizeof (sin6_t): /* Complete IPv6 address */ sin6 = (sin6_t *)sa; - if (sin6->sin6_family != AF_INET6 || - tcp->tcp_family != AF_INET6) { - return (EAFNOSUPPORT); - } - requested_port = ntohs(sin6->sin6_port); - ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? - IPV4_VERSION : IPV6_VERSION; v6addr = sin6->sin6_addr; + requested_port = ntohs(sin6->sin6_port); + if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6addr, + zoneid, ipst, B_FALSE, scopeid); + } + } break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address length, %d", len); } @@ -26210,34 +20339,32 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, /* return (-TBADADDR); */ } - tcp->tcp_bound_source_v6 = v6addr; + /* Is the local address a valid unicast address? 
*/ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); - /* Check for change in ipversion */ - if (tcp->tcp_ipversion != ipversion) { - ASSERT(tcp->tcp_family == AF_INET6); - error = (ipversion == IPV6_VERSION) ? - tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); - if (error) { - return (ENOMEM); - } - } - - /* - * Initialize family specific fields. Copy of the src addr. - * in tcp_t is needed for the lookup funcs. - */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp->tcp_ip6h->ip6_src = v6addr; + connp->conn_bound_addr_v6 = v6addr; + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - tcp->tcp_ip_src_v6 = v6addr; + + connp->conn_laddr_v6 = v6addr; + connp->conn_saddr_v6 = v6addr; bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; error = tcp_bind_select_lport(tcp, &requested_port, bind_to_req_port_only, cr); - + if (error != 0) { + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + } return (error); } @@ -26253,7 +20380,7 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; if (tcp->tcp_state >= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26265,19 +20392,8 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (error); ASSERT(tcp->tcp_state == TCPS_BOUND); - tcp->tcp_conn_req_max = 0; - - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, 0, B_FALSE); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - 
error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, 0, B_FALSE); - } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (0); } int @@ -26337,7 +20453,14 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, ipaddr_t *dstaddrp; in_port_t dstport; uint_t srcid; - int error = 0; + int error; + uint32_t mss; + mblk_t *syn_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + int32_t oldstate; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + oldstate = tcp->tcp_state; switch (len) { default: @@ -26351,7 +20474,7 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (sin->sin_port == 0) { return (-TBADADDR); } - if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { + if (connp->conn_ipv6_v6only) { return (EAFNOSUPPORT); } break; @@ -26365,23 +20488,18 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, } /* * If we're connecting to an IPv4-mapped IPv6 address, we need to - * make sure that the template IP header in the tcp structure is an - * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We + * make sure that the conn_ipversion is IPV4_VERSION. We * need to this before we call tcp_bindi() so that the port lookup * code will look for ports in the correct port space (IPv4 and * IPv6 have separate port spaces). 
*/ - if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV6_VERSION && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - int err = 0; + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); - err = tcp_header_init_ipv4(tcp); - if (err != 0) { - error = ENOMEM; - goto connect_failed; - } - if (tcp->tcp_lport != 0) - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_ipversion = IPV4_VERSION; } switch (tcp->tcp_state) { @@ -26399,43 +20517,147 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, */ /* FALLTHRU */ case TCPS_BOUND: - if (tcp->tcp_family == AF_INET6) { - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - return (tcp_connect_ipv6(tcp, - &sin6->sin6_addr, - sin6->sin6_port, sin6->sin6_flowinfo, - sin6->__sin6_src_id, sin6->sin6_scope_id, - cr, pid)); - } + break; + default: + return (-TOUTSTATE); + } + + /* + * We update our cred/cpid based on the caller of connect + */ + if (connp->conn_cred != cr) { + crhold(cr); + crfree(connp->conn_cred); + connp->conn_cred = cr; + } + connp->conn_cpid = pid; + + /* Cache things in the ixa without any refhold */ + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + if (connp->conn_family == AF_INET6) { + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, + sin6->sin6_port, sin6->sin6_flowinfo, + sin6->__sin6_src_id, sin6->sin6_scope_id); + } else { /* * Destination adress is mapped IPv6 address. * Source bound address should be unspecified or * IPv6 mapped address as well. 
*/ if (!IN6_IS_ADDR_UNSPECIFIED( - &tcp->tcp_bound_source_v6) && - !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { + &connp->conn_bound_addr_v6) && + !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { return (EADDRNOTAVAIL); } dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); dstport = sin6->sin6_port; srcid = sin6->__sin6_src_id; - } else { - dstaddrp = &sin->sin_addr.s_addr; - dstport = sin->sin_port; - srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, + srcid); } + } else { + dstaddrp = &sin->sin_addr.s_addr; + dstport = sin->sin_port; + srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); + } - error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr, - pid); - break; - default: - return (-TOUTSTATE); + if (error != 0) + goto connect_failed; + + CL_INET_CONNECT(connp, B_TRUE, error); + if (error != 0) + goto connect_failed; + + /* connect succeeded */ + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); + tcp->tcp_active_open = 1; + + /* + * tcp_set_destination() does not adjust for TCP/IP header length. + */ + mss = tcp->tcp_mss - connp->conn_ht_iphc_len; + + /* + * Just make sure our rwnd is at least rcvbuf * MSS large, and round up + * to the nearest MSS. + * + * We do the round up here because we need to get the interface MTU + * first before we can do the round up. + */ + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), + tcps->tcps_recv_hiwat_minmss * mss); + connp->conn_rcvbuf = tcp->tcp_rwnd; + tcp_set_ws_value(tcp); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) + tcp->tcp_snd_ws_ok = B_TRUE; + + /* + * Set tcp_snd_ts_ok to true + * so that tcp_xmit_mp will + * include the timestamp + * option in the SYN segment. 
+ */ + if (tcps->tcps_tstamp_always || + (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { + tcp->tcp_snd_ts_ok = B_TRUE; } + /* - * Note: Code below is the "failure" case + * tcp_snd_sack_ok can be set in + * tcp_set_destination() if the sack metric + * is set. So check it here also. + */ + if (tcps->tcps_sack_permitted == 2 || + tcp->tcp_snd_sack_ok) { + if (tcp->tcp_sack_info == NULL) { + tcp->tcp_sack_info = kmem_cache_alloc( + tcp_sack_info_cache, KM_SLEEP); + } + tcp->tcp_snd_sack_ok = B_TRUE; + } + + /* + * Should we use ECN? Note that the current + * default value (SunOS 5.9) of tcp_ecn_permitted + * is 1. The reason for doing this is that there + * are equipments out there that will drop ECN + * enabled IP packets. Setting it to 1 avoids + * compatibility problems. */ + if (tcps->tcps_ecn_permitted == 2) + tcp->tcp_ecn_ok = B_TRUE; + + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, + tcp->tcp_iss, B_FALSE, NULL, B_FALSE); + if (syn_mp != NULL) { + /* + * We must bump the generation before sending the syn + * to ensure that we use the right generation in case + * this thread issues a "connected" up call. 
+ */ + SOCK_CONNID_BUMP(tcp->tcp_connid); + tcp_send_data(tcp, syn_mp); + } + + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (0); + connect_failed: + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + tcp->tcp_state = oldstate; if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); return (error); @@ -26446,7 +20668,6 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; squeue_t *sqp = connp->conn_sqp; int error; @@ -26455,7 +20676,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { return (error); } @@ -26493,7 +20714,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, } } - if (tcp->tcp_loopback) { + if (connp->conn_tcp->tcp_loopback) { struct sock_proto_props sopp; sopp.sopp_flags = SOCKOPT_LOOPBACK; @@ -26521,7 +20742,7 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return (NULL); } - connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp); + connp = tcp_create_common(credp, isv6, B_TRUE, errorp); if (connp == NULL) { return (NULL); } @@ -26578,8 +20799,8 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, connp->conn_upcalls = sock_upcalls; connp->conn_upper_handle = sock_handle; - ASSERT(connp->conn_tcp->tcp_recv_hiwater != 0 && - connp->conn_tcp->tcp_recv_hiwater == connp->conn_tcp->tcp_rwnd); + ASSERT(connp->conn_rcvbuf != 0 && + connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); } @@ -26663,7 +20884,7 @@ 
tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, /* * Squeue Flow Control */ - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -26680,12 +20901,11 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, CONN_INC_REF(connp); if (msg->msg_flags & MSG_OOB) { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_output_urgent, connp, tcp_squeue_flag, - SQTAG_TCP_OUTPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, - connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } return (0); @@ -26698,9 +20918,9 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ void -tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) +tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; uint32_t msize; @@ -26739,7 +20959,7 @@ tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) tcp_wput_data(tcp, mp, B_TRUE); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) @@ -26752,24 +20972,24 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, ASSERT(cr != NULL); ASSERT(tcp != NULL); + if (tcp->tcp_state < TCPS_SYN_RCVD) + return (ENOTCONN); - return (tcp_do_getpeername(tcp, addr, addrlenp)); + return (conn_getpeername(connp, addr, addrlenp)); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); ASSERT(connp->conn_upper_handle != NULL); - - return (tcp_do_getsockname(tcp, addr, addrlenp)); + return (conn_getsockname(connp, addr, addrlenp)); } /* @@ -26809,8 +21029,8 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, RD(q)->q_ptr = WR(q)->q_ptr = connp; - connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q); - connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q); + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); WR(q)->q_qinfo = &tcp_sock_winit; @@ -26830,11 +21050,11 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, stropt_mp->b_wptr += sizeof (struct stroptions); stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : + stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); if (tcp->tcp_snd_sack_ok) stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = tcp->tcp_recv_hiwater; + stropt->so_hiwat = connp->conn_rcvbuf; stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); putnext(RD(q), stropt_mp); @@ -26845,15 +21065,17 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen); - error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen); + (void) tcp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, CRED()); + error = tcp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, CRED()); if (error != 0) faddrlen = 0; opts = 0; - if (tcp->tcp_oobinline) + if (connp->conn_oobinline) opts |= SO_OOBINLINE; - if (tcp->tcp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; /* @@ -26868,6 +21090,7 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, while ((mp = tcp->tcp_rcv_list) != NULL) { tcp->tcp_rcv_list = 
mp->b_next; mp->b_next = NULL; + /* We never do fallback for kernel RPC */ putnext(q, mp); } tcp->tcp_rcv_last_head = NULL; @@ -26908,7 +21131,7 @@ tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) * Sockfs guarantees that the listener will not be closed * during fallback. So we can safely use the listener's queue. */ - putnext(listener->tcp_rq, mp); + putnext(listener->tcp_connp->conn_rq, mp); } int @@ -26987,7 +21210,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* ARGSUSED */ static void -tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) +tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -27002,7 +21225,7 @@ tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_shutdown_output() out of state %s", @@ -27036,7 +21259,7 @@ tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, - connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); + connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, SOCK_OPCTL_SHUT_SEND, 0); @@ -27109,7 +21332,7 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, */ goto do_listen; } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_listen: bad state, %d", tcp->tcp_state); } @@ -27121,15 +21344,14 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, sin6_t *sin6; ASSERT(IPCL_IS_NONSTR(connp)); - /* Do an implicit bind: Request for a generic port. 
*/ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { len = sizeof (sin_t); sin = (sin_t *)&addr; *sin = sin_null; sin->sin_family = AF_INET; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); len = sizeof (sin6_t); sin6 = (sin6_t *)&addr; *sin6 = sin6_null; @@ -27171,23 +21393,42 @@ do_listen: } /* - * We can call ip_bind directly, the processing continues - * in tcp_post_ip_bind(). - * * We need to make sure that the conn_recv is set to a non-null * value before we insert the conn into the classifier table. * This is to avoid a race with an incoming packet which does an * ipcl_classify(). + * We initially set it to tcp_input_listener_unbound to try to + * pick a good squeue for the listener when the first SYN arrives. + * tcp_input_listener_unbound sets it to tcp_input_listener on that + * first SYN. */ - connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_laddr_v4(connp, NULL, - IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE); - } else { - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE); + connp->conn_recv = tcp_input_listener_unbound; + + /* Insert the listener in the classifier table */ + error = ip_laddr_fanout_insert(connp); + if (error != 0) { + /* Undo the bind - release the port number */ + tcp->tcp_state = TCPS_IDLE; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_ports = 0; + + if (connp->conn_anon_port) { + zone_t *zone; + + zone = crgetzone(cr); + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, connp->conn_mlp_type, + connp->conn_proto, connp->conn_lport, B_FALSE); + } + connp->conn_mlp_type = mlptSingle; + + tcp_bind_hash_remove(tcp); + return (error); } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (error); } void @@ -27222,7 +21463,7 @@ 
tcp_clr_flowctrl(sock_lower_handle_t proto_handle) if (tcp->tcp_fused) { tcp_fuse_backenable(tcp); } else { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above * ESTABLISHED state and the increase of the rcv window @@ -27253,10 +21494,28 @@ tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: - case TCP_IOC_DEFAULT_Q: case _SIOCSOCKFALLBACK: case TCP_IOC_ABORT_CONN: case TI_GETPEERNAME: diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 3ee909cc4d..313b024943 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -69,50 +69,6 @@ boolean_t do_tcp_fusion = B_TRUE; /* - * Return true if this connection needs some IP functionality - */ -static boolean_t -tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns) -{ - ipsec_stack_t *ipss = ns->netstack_ipsec; - - /* - * If ire is not cached, do not use fusion - */ - if (tcp->tcp_connp->conn_ire_cache == NULL) { - /* - * There is no need to hold conn_lock here because when called - * from tcp_fuse() there can be no window where conn_ire_cache - * can change. This is not true when called from - * tcp_fuse_output() as conn_ire_cache can become null just - * after the check. 
It will be necessary to recheck for a NULL - * conn_ire_cache in tcp_fuse_output() to avoid passing a - * stale ill pointer to FW_HOOKS. - */ - return (B_TRUE); - } - if (tcp->tcp_ipversion == IPV4_VERSION) { - if (tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) - return (B_TRUE); - if (CONN_OUTBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss)) - return (B_TRUE); - if (CONN_INBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss)) - return (B_TRUE); - } else { - if (tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) - return (B_TRUE); - if (CONN_OUTBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss)) - return (B_TRUE); - if (CONN_INBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss)) - return (B_TRUE); - } - if (!CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)) - return (B_TRUE); - return (B_FALSE); -} - - -/* * This routine gets called by the eager tcp upon changing state from * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself * and the active connect tcp such that the regular tcp processings @@ -124,10 +80,10 @@ tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns) * same squeue as the one given to the active connect tcp during open. 
*/ void -tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) +tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha) { - conn_t *peer_connp, *connp = tcp->tcp_connp; - tcp_t *peer_tcp; + conn_t *peer_connp, *connp = tcp->tcp_connp; + tcp_t *peer_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; netstack_t *ns; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; @@ -136,20 +92,16 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) ASSERT(tcp->tcp_loopback); ASSERT(tcp->tcp_loopback_peer == NULL); /* - * We need to inherit tcp_recv_hiwater of the listener tcp, + * We need to inherit conn_rcvbuf of the listener tcp, * but we can't really use tcp_listener since we get here after - * sending up T_CONN_IND and tcp_wput_accept() may be called + * sending up T_CONN_IND and tcp_tli_accept() may be called * independently, at which point tcp_listener is cleared; * this is why we use tcp_saved_listener. The listener itself * is guaranteed to be around until tcp_accept_finish() is called * on this eager -- this won't happen until we're done since we're * inside the eager's perimeter now. - * - * We can also get called in the case were a connection needs - * to be re-fused. In this case tcp_saved_listener will be - * NULL but tcp_refuse will be true. */ - ASSERT(tcp->tcp_saved_listener != NULL || tcp->tcp_refuse); + ASSERT(tcp->tcp_saved_listener != NULL); /* * Lookup peer endpoint; search for the remote endpoint having * the reversed address-port quadruplet in ESTABLISHED state, @@ -157,12 +109,12 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) * is applied accordingly for loopback address, but not for * local address since we want fusion to happen across Zones. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, - (ipha_t *)iphdr, tcph, ipst); + (ipha_t *)iphdr, tcpha, ipst); } else { peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, - (ip6_t *)iphdr, tcph, ipst); + (ip6_t *)iphdr, tcpha, ipst); } /* @@ -202,28 +154,20 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) /* * Fuse the endpoints; we perform further checks against both * tcp endpoints to ensure that a fusion is allowed to happen. - * In particular we bail out for non-simple TCP/IP or if IPsec/ - * IPQoS policy/kernel SSL exists. We also need to check if - * the connection is quiescent to cover the case when we are - * trying to re-enable fusion after IPobservability is turned off. + * In particular we bail out if kernel SSL exists. */ ns = tcps->tcps_netstack; ipst = ns->netstack_ip; if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && - !tcp_loopback_needs_ip(tcp, ns) && - !tcp_loopback_needs_ip(peer_tcp, ns) && - tcp->tcp_kssl_ent == NULL && - tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL && - !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst)) { + (tcp->tcp_kssl_ent == NULL) && (tcp->tcp_xmit_head == NULL) && + (peer_tcp->tcp_xmit_head == NULL)) { mblk_t *mp; - queue_t *peer_rq = peer_tcp->tcp_rq; + queue_t *peer_rq = peer_connp->conn_rq; ASSERT(!TCP_IS_DETACHED(peer_tcp)); - ASSERT(tcp->tcp_fused_sigurg_mp == NULL || - (!IPCL_IS_NONSTR(connp) && tcp->tcp_refuse)); - ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL || - (!IPCL_IS_NONSTR(peer_connp) && peer_tcp->tcp_refuse)); + ASSERT(tcp->tcp_fused_sigurg_mp == NULL); + ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); ASSERT(tcp->tcp_kssl_ctx == NULL); /* @@ -272,54 +216,40 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) tcp_timers_stop(tcp); tcp_timers_stop(peer_tcp); - if (!tcp->tcp_refuse) { - /* - * Set receive buffer and max packet size for the - * active open tcp. 
- * eager's values will be set in tcp_accept_finish. - */ - - (void) tcp_rwnd_set(peer_tcp, - peer_tcp->tcp_recv_hiwater); + /* + * Set receive buffer and max packet size for the + * active open tcp. + * eager's values will be set in tcp_accept_finish. + */ + (void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf); - /* - * Set the write offset value to zero since we won't - * be needing any room for TCP/IP headers. - */ - if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { - struct stroptions *stropt; + /* + * Set the write offset value to zero since we won't + * be needing any room for TCP/IP headers. + */ + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { + struct stroptions *stropt; - DB_TYPE(mp) = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); + DB_TYPE(mp) = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); - stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_WROFF; - stropt->so_wroff = 0; + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_WROFF; + stropt->so_wroff = 0; - /* Send the options up */ - putnext(peer_rq, mp); - } else { - struct sock_proto_props sopp; + /* Send the options up */ + putnext(peer_rq, mp); + } else { + struct sock_proto_props sopp; - /* The peer is a non-STREAMS end point */ - ASSERT(IPCL_IS_TCP(peer_connp)); + /* The peer is a non-STREAMS end point */ + ASSERT(IPCL_IS_TCP(peer_connp)); - sopp.sopp_flags = SOCKOPT_WROFF; - sopp.sopp_wroff = 0; - (*peer_connp->conn_upcalls->su_set_proto_props) - (peer_connp->conn_upper_handle, &sopp); - } - } else { - /* - * Endpoints are being re-fused, so options will not - * be sent up. In case of STREAMS, free the stroptions - * mblk. 
- */ - if (!IPCL_IS_NONSTR(connp)) - freemsg(mp); + sopp.sopp_flags = SOCKOPT_WROFF; + sopp.sopp_wroff = 0; + (*peer_connp->conn_upcalls->su_set_proto_props) + (peer_connp->conn_upper_handle, &sopp); } - tcp->tcp_refuse = B_FALSE; - peer_tcp->tcp_refuse = B_FALSE; } else { TCP_STAT(tcps, tcp_fusion_unqualified); } @@ -374,12 +304,12 @@ tcp_unfuse(tcp_t *tcp) * when called from tcp_rcv_drain(). */ if (!TCP_IS_DETACHED(tcp)) { - (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, + (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, &tcp->tcp_fused_sigurg_mp); } if (!TCP_IS_DETACHED(peer_tcp)) { - (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, - &peer_tcp->tcp_fused_sigurg_mp); + (void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq, + peer_tcp, &peer_tcp->tcp_fused_sigurg_mp); } /* Lift up any flow-control conditions */ @@ -398,12 +328,12 @@ tcp_unfuse(tcp_t *tcp) mutex_exit(&peer_tcp->tcp_non_sq_lock); /* - * Update th_seq and th_ack in the header template + * Update tha_seq and tha_ack in the header template */ - U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq); - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); - U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq); - U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); + peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt); + peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt); /* Unfuse the endpoints */ peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; @@ -509,59 +439,28 @@ tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) boolean_t tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) { - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - boolean_t flow_stopped, peer_data_queued = B_FALSE; - boolean_t urgent = (DB_TYPE(mp) != M_DATA); - boolean_t push = B_TRUE; - mblk_t *mp1 = mp; - ill_t *ilp, *olp; - ipif_t *iifp, *oifp; - ipha_t *ipha; - ip6_t *ip6h; - tcph_t *tcph; - uint_t 
ip_hdr_len; - uint32_t seq; - uint32_t recv_size = send_size; + conn_t *connp = tcp->tcp_connp; + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + conn_t *peer_connp = peer_tcp->tcp_connp; + boolean_t flow_stopped, peer_data_queued = B_FALSE; + boolean_t urgent = (DB_TYPE(mp) != M_DATA); + boolean_t push = B_TRUE; + mblk_t *mp1 = mp; + uint_t ip_hdr_len; + uint32_t recv_size = send_size; tcp_stack_t *tcps = tcp->tcp_tcps; netstack_t *ns = tcps->tcps_netstack; ip_stack_t *ipst = ns->netstack_ip; + ipsec_stack_t *ipss = ns->netstack_ipsec; + iaflags_t ixaflags = connp->conn_ixa->ixa_flags; + boolean_t do_ipsec, hooks_out, hooks_in, ipobs_enabled; ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); - ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); + ASSERT(connp->conn_sqp == peer_connp->conn_sqp); ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - /* If this connection requires IP, unfuse and use regular path */ - if (tcp_loopback_needs_ip(tcp, ns) || - tcp_loopback_needs_ip(peer_tcp, ns) || - IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst) || - (tcp->tcp_ipversion == IPV4_VERSION && - ipst->ips_ip4_observe.he_interested) || - (tcp->tcp_ipversion == IPV6_VERSION && - ipst->ips_ip6_observe.he_interested)) { - TCP_STAT(tcps, tcp_fusion_aborted); - tcp->tcp_refuse = B_TRUE; - peer_tcp->tcp_refuse = B_TRUE; - - bcopy(peer_tcp->tcp_tcph, &tcp->tcp_saved_tcph, - sizeof (tcph_t)); - bcopy(tcp->tcp_tcph, &peer_tcp->tcp_saved_tcph, - sizeof (tcph_t)); - if (tcp->tcp_ipversion == IPV4_VERSION) { - bcopy(peer_tcp->tcp_ipha, &tcp->tcp_saved_ipha, - sizeof (ipha_t)); - bcopy(tcp->tcp_ipha, &peer_tcp->tcp_saved_ipha, - sizeof (ipha_t)); - } else { - bcopy(peer_tcp->tcp_ip6h, &tcp->tcp_saved_ip6h, - sizeof (ip6_t)); - bcopy(tcp->tcp_ip6h, &peer_tcp->tcp_saved_ip6h, - sizeof (ip6_t)); - } - goto unfuse; - } - if (send_size == 0) { freemsg(mp); return (B_TRUE); @@ -578,123 +477,74 @@ 
tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) mp1 = mp->b_cont; } - if (tcp->tcp_ipversion == IPV4_VERSION && - (HOOKS4_INTERESTED_LOOPBACK_IN(ipst) || - HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) || - tcp->tcp_ipversion == IPV6_VERSION && - (HOOKS6_INTERESTED_LOOPBACK_IN(ipst) || - HOOKS6_INTERESTED_LOOPBACK_OUT(ipst))) { - /* - * Build ip and tcp header to satisfy FW_HOOKS. - * We only build it when any hook is present. - */ + /* + * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before + * further processes. + */ + if (!ip_output_verify_local(connp->conn_ixa)) + goto unfuse; + + /* + * Build IP and TCP header in case we have something that needs the + * headers. Those cases are: + * 1. IPsec + * 2. IPobs + * 3. FW_HOOKS + * + * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection + * and back to regular path. + */ + if (ixaflags & IXAF_IS_IPV4) { + do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || + CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss); + + hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst); + hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst); + ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0); + } else { + do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || + CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss); + + hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst); + hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst); + ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0); + } + + /* We do logical 'or' for efficiency */ + if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) { if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL, tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL) /* If tcp_xmit_mp fails, use regular path */ goto unfuse; /* - * The ipif and ill can be safely referenced under the - * protection of conn_lock - see head of function comment for - * conn_get_held_ipif(). It is necessary to check that both - * the ipif and ill can be looked up (i.e. not condemned). If - * not, bail out and unfuse this connection. 
+ * Leave all IP relevant processes to ip_output_process_local(), + * which handles IPsec, IPobs, and FW_HOOKS. */ - mutex_enter(&peer_tcp->tcp_connp->conn_lock); - if ((peer_tcp->tcp_connp->conn_ire_cache == NULL) || - (peer_tcp->tcp_connp->conn_ire_cache->ire_marks & - IRE_MARK_CONDEMNED) || - ((oifp = peer_tcp->tcp_connp->conn_ire_cache->ire_ipif) - == NULL) || - (!IPIF_CAN_LOOKUP(oifp)) || - ((olp = oifp->ipif_ill) == NULL) || - (ill_check_and_refhold(olp) != 0)) { - mutex_exit(&peer_tcp->tcp_connp->conn_lock); - goto unfuse; - } - mutex_exit(&peer_tcp->tcp_connp->conn_lock); - - /* PFHooks: LOOPBACK_OUT */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - ipha = (ipha_t *)mp1->b_rptr; - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, olp, - ipha_t *, ipha, mblk_t *, mp1); - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, olp, ipha, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp1); - } else { - ip6h = (ip6_t *)mp1->b_rptr; - - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, olp, - ip6_t *, ip6h, mblk_t *, mp1); - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, olp, ip6h, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp1); - } - ill_refrele(olp); + mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out, + hooks_in, do_ipsec ? peer_connp : NULL); + /* If the message is dropped for any reason. */ if (mp1 == NULL) goto unfuse; /* - * The ipif and ill can be safely referenced under the - * protection of conn_lock - see head of function comment for - * conn_get_held_ipif(). It is necessary to check that both - * the ipif and ill can be looked up (i.e. not condemned). If - * not, bail out and unfuse this connection. + * Data length might have been changed by FW_HOOKS. + * We assume that the first mblk contains the TCP/IP headers. 
*/ - mutex_enter(&tcp->tcp_connp->conn_lock); - if ((tcp->tcp_connp->conn_ire_cache == NULL) || - (tcp->tcp_connp->conn_ire_cache->ire_marks & - IRE_MARK_CONDEMNED) || - ((iifp = tcp->tcp_connp->conn_ire_cache->ire_ipif) - == NULL) || - (!IPIF_CAN_LOOKUP(iifp)) || - ((ilp = iifp->ipif_ill) == NULL) || - (ill_check_and_refhold(ilp) != 0)) { - mutex_exit(&tcp->tcp_connp->conn_lock); - goto unfuse; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - - /* PFHooks: LOOPBACK_IN */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - DTRACE_PROBE4(ip4__loopback__in__start, - ill_t *, ilp, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp1); - FW_HOOKS(ipst->ips_ip4_loopback_in_event, - ipst->ips_ipv4firewall_loopback_in, - ilp, NULL, ipha, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp1); - ill_refrele(ilp); - if (mp1 == NULL) - goto unfuse; - - ip_hdr_len = IPH_HDR_LENGTH(ipha); - } else { - DTRACE_PROBE4(ip6__loopback__in__start, - ill_t *, ilp, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, mp1); - FW_HOOKS6(ipst->ips_ip6_loopback_in_event, - ipst->ips_ipv6firewall_loopback_in, - ilp, NULL, ip6h, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp1); - ill_refrele(ilp); - if (mp1 == NULL) - goto unfuse; - - ip_hdr_len = ip_hdr_length_v6(mp1, ip6h); - } + if (hooks_in || hooks_out) { + tcpha_t *tcpha; + + ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ? + IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) : + ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr); - /* Data length might be changed by FW_HOOKS */ - tcph = (tcph_t *)&mp1->b_rptr[ip_hdr_len]; - seq = ABE32_TO_U32(tcph->th_seq); - recv_size += seq - tcp->tcp_snxt; + tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len]; + ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <= + mp1->b_wptr); + recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt; + + } /* * The message duplicated by tcp_xmit_mp is freed. @@ -712,7 +562,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) * detached we use tcp_rcv_enqueue() instead. 
Queued data will be * drained when the accept completes (in tcp_accept_finish()). */ - if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + if (IPCL_IS_NONSTR(peer_connp) && !TCP_IS_DETACHED(peer_tcp)) { int error; int flags = 0; @@ -720,18 +570,18 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) if ((tcp->tcp_valid_bits & TCP_URG_VALID) && (tcp->tcp_urg == tcp->tcp_snxt)) { flags = MSG_OOB; - (*peer_tcp->tcp_connp->conn_upcalls->su_signal_oob) - (peer_tcp->tcp_connp->conn_upper_handle, 0); + (*peer_connp->conn_upcalls->su_signal_oob) + (peer_connp->conn_upper_handle, 0); tcp->tcp_valid_bits &= ~TCP_URG_VALID; } - if ((*peer_tcp->tcp_connp->conn_upcalls->su_recv)( - peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size, + if ((*peer_connp->conn_upcalls->su_recv)( + peer_connp->conn_upper_handle, mp, recv_size, flags, &error, &push) < 0) { ASSERT(error != EOPNOTSUPP); peer_data_queued = B_TRUE; } } else { - if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + if (IPCL_IS_NONSTR(peer_connp) && (tcp->tcp_valid_bits & TCP_URG_VALID) && (tcp->tcp_urg == tcp->tcp_snxt)) { /* @@ -744,7 +594,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) return (B_TRUE); } - tcp_rcv_enqueue(peer_tcp, mp, recv_size); + tcp_rcv_enqueue(peer_tcp, mp, recv_size, + tcp->tcp_connp->conn_cred); /* In case it wrapped around and also to keep it constant */ peer_tcp->tcp_rwnd += recv_size; @@ -764,22 +615,21 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) mutex_enter(&tcp->tcp_non_sq_lock); flow_stopped = tcp->tcp_flow_stopped; if ((TCP_IS_DETACHED(peer_tcp) && - (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_recv_hiwater)) || + (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) || (!TCP_IS_DETACHED(peer_tcp) && - !IPCL_IS_NONSTR(peer_tcp->tcp_connp) && - !canputnext(peer_tcp->tcp_rq))) { + !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) { peer_data_queued = B_TRUE; } if (!flow_stopped && (peer_data_queued || - (TCP_UNSENT_BYTES(tcp) >= 
tcp->tcp_xmit_hiwater))) { + (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) { tcp_setqfull(tcp); flow_stopped = B_TRUE; TCP_STAT(tcps, tcp_fusion_flowctl); DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp, uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt); } else if (flow_stopped && !peer_data_queued && - (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)) { + (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) { tcp_clrqfull(tcp); TCP_STAT(tcps, tcp_fusion_backenabled); flow_stopped = B_FALSE; @@ -818,13 +668,14 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) /* * For TLI-based streams, a thread in tcp_accept_swap() * can race with us. That thread will ensure that the - * correct peer_tcp->tcp_rq is globally visible before - * peer_tcp->tcp_detached is visible as clear, but we - * must also ensure that the load of tcp_rq cannot be - * reordered to be before the tcp_detached check. + * correct peer_connp->conn_rq is globally visible + * before peer_tcp->tcp_detached is visible as clear, + * but we must also ensure that the load of conn_rq + * cannot be reordered to be before the tcp_detached + * check. */ membar_consumer(); - (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, + (void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp, NULL); } } @@ -928,11 +779,11 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf; mutex_enter(&peer_tcp->tcp_non_sq_lock); if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <= - peer_tcp->tcp_xmit_lowater)) { + peer_tcp->tcp_connp->conn_sndlowat)) { tcp_clrqfull(peer_tcp); TCP_STAT(tcps, tcp_fusion_backenabled); } @@ -964,8 +815,8 @@ tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd) * Record high water mark, this is used for flow-control * purposes in tcp_fuse_output(). 
*/ - tcp->tcp_recv_hiwater = rwnd; - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_connp->conn_rcvbuf = rwnd; + tcp->tcp_rwnd = rwnd; return (rwnd); } @@ -976,12 +827,13 @@ int tcp_fuse_maxpsz(tcp_t *tcp) { tcp_t *peer_tcp = tcp->tcp_loopback_peer; - uint_t sndbuf = tcp->tcp_xmit_hiwater; + conn_t *connp = tcp->tcp_connp; + uint_t sndbuf = connp->conn_sndbuf; uint_t maxpsz = sndbuf; ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL); - ASSERT(peer_tcp->tcp_recv_hiwater != 0); + ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0); /* * In the fused loopback case, we want the stream head to split * up larger writes into smaller chunks for a more accurate flow- @@ -990,8 +842,8 @@ tcp_fuse_maxpsz(tcp_t *tcp) * We round up the buffer to system page size due to the lack of * TCP MSS concept in Fusion. */ - if (maxpsz > peer_tcp->tcp_recv_hiwater) - maxpsz = peer_tcp->tcp_recv_hiwater; + if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf) + maxpsz = peer_tcp->tcp_connp->conn_rcvbuf; maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1; return (maxpsz); @@ -1013,12 +865,12 @@ tcp_fuse_backenable(tcp_t *tcp) peer_tcp->tcp_connp->conn_sqp); if (tcp->tcp_rcv_list != NULL) - (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, NULL); + (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL); mutex_enter(&peer_tcp->tcp_non_sq_lock); if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <= - peer_tcp->tcp_xmit_lowater)) { + peer_tcp->tcp_connp->conn_sndlowat)) { tcp_clrqfull(peer_tcp); } mutex_exit(&peer_tcp->tcp_non_sq_lock); diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 75fa36196a..5d9051aed1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -56,20 +56,21 @@ * For the Kernel SSL proxy * * Routines in this file are called on tcp's incoming path, - * tcp_rput_data() mainly, and right before the message is + * tcp_input_data() mainly, and right before the message is * to be 
putnext()'ed upstreams. */ static void tcp_kssl_input_callback(void *, mblk_t *, kssl_cmd_t); -static void tcp_kssl_input_asynch(void *, mblk_t *, void *); +static void tcp_kssl_input_asynch(void *, mblk_t *, void *, + ip_recv_attr_t *); -extern void tcp_output(void *, mblk_t *, void *); +extern void tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); extern int tcp_squeue_flag; /* - * tcp_rput_data() calls this routine for all packet destined to a + * tcp_input_data() calls this routine for all packet destined to a * connection to the SSL port, when the SSL kernel proxy is configured * to intercept and process those packets. * A packet may carry multiple SSL records, so the function @@ -84,7 +85,7 @@ extern int tcp_squeue_flag; * which could decrement the conn/tcp reference before we get to increment it. */ void -tcp_kssl_input(tcp_t *tcp, mblk_t *mp) +tcp_kssl_input(tcp_t *tcp, mblk_t *mp, cred_t *cr) { struct conn_s *connp = tcp->tcp_connp; tcp_t *listener; @@ -97,15 +98,26 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) boolean_t is_v4; void *addr; + if (is_system_labeled() && mp != NULL) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. 
+ */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + /* First time here, allocate the SSL context */ if (tcp->tcp_kssl_ctx == NULL) { ASSERT(tcp->tcp_kssl_pending); - is_v4 = (tcp->tcp_ipversion == IPV4_VERSION); + is_v4 = (connp->conn_ipversion == IPV4_VERSION); if (is_v4) { - addr = &tcp->tcp_ipha->ipha_dst; + addr = &connp->conn_faddr_v4; } else { - addr = &tcp->tcp_ip6h->ip6_dst; + addr = &connp->conn_faddr_v6; } if (kssl_init_context(tcp->tcp_kssl_ent, @@ -146,7 +158,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) mutex_enter(&tcp->tcp_non_sq_lock); tcp->tcp_squeue_bytes += msgdsize(outmp); mutex_exit(&tcp->tcp_non_sq_lock); - tcp_output(connp, outmp, NULL); + tcp_output(connp, outmp, NULL, NULL); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -194,7 +206,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) tci->PRIM_type = T_SSL_PROXY_CONN_IND; /* - * The code below is copied from tcp_rput_data() + * The code below is copied from tcp_input_data * delivering the T_CONN_IND on a TCPS_SYN_RCVD, * and all conn ref cnt comments apply. */ @@ -214,7 +226,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_FILL, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } } @@ -240,11 +252,12 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) if (tcp->tcp_listener != NULL) { DTRACE_PROBE1(kssl_mblk__input_rcv_enqueue, mblk_t *, outmp); - tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp)); + tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp), + NULL); } else { DTRACE_PROBE1(kssl_mblk__input_putnext, mblk_t *, outmp); - putnext(tcp->tcp_rq, outmp); + putnext(connp->conn_rq, outmp); } /* * We're at a phase where records are sent upstreams, @@ -283,7 +296,7 @@ no_can_do: tci->PRIM_type = T_SSL_PROXY_CONN_IND; /* - * The code below is copied from tcp_rput_data() + * The code below is copied from tcp_input_data * delivering the T_CONN_IND on a TCPS_SYN_RCVD, * and all conn ref cnt comments apply. 
*/ @@ -303,12 +316,12 @@ no_can_do: SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } } if (mp != NULL) - tcp_rcv_enqueue(tcp, mp, msgdsize(mp)); + tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL); break; } mp = NULL; @@ -351,7 +364,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) } CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - tcp_squeue_flag, SQTAG_TCP_OUTPUT); + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -363,9 +376,9 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) * Keep accumulating if not yet accepted. */ if (tcp->tcp_listener != NULL) { - tcp_rcv_enqueue(tcp, mp, msgdsize(mp)); + tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL); } else { - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); } break; @@ -383,7 +396,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) if ((sqmp = allocb(1, BPRI_MED)) != NULL) { CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, - connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT); + connp, NULL, SQ_FILL, SQTAG_TCP_KSSL_INPUT); } else { DTRACE_PROBE(kssl_err__allocb_failed); } @@ -396,7 +409,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) */ /* ARGSUSED */ void -tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2) +tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -409,6 +422,6 @@ tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2) * while we're away */ if (tcp->tcp_kssl_ctx != NULL) { - tcp_kssl_input(tcp, NULL); + tcp_kssl_input(tcp, NULL, NULL); } } diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index fa2529a5ac..d15ff4ffcd 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c 
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -39,12 +39,7 @@ #include <netinet/tcp.h> #include <inet/optcom.h> - -extern int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk); +#include <inet/tcp_impl.h> /* * Table of all known options handled on a TCP protocol stack. @@ -55,161 +50,165 @@ extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, */ opdes_t tcp_opt_arr[] = { -{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct linger), 0 }, -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, 
SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, -{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ 
TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (uint_t), +{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 536 }, { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT, +{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ TCP_CORK, 
IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ 
IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, /* Enable receipt of ancillary data */ -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, 
OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, }; @@ -247,7 +246,6 @@ optdb_obj_t tcp_opt_obj = { tcp_opt_default, /* TCP default value function pointer */ tcp_tpi_opt_get, /* TCP get function pointer */ tcp_tpi_opt_set, /* TCP set function pointer */ - B_TRUE, /* TCP is tpi provider */ TCP_OPT_ARR_CNT, /* TCP option database count of entries */ tcp_opt_arr, /* TCP option database */ TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index bec2b3256f..1b7c87736a 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -70,41 +70,6 @@ extern "C" { } /* - * Before caching the conn IRE, we need to make sure 
certain TCP - * states are in sync with the ire. The mismatch could occur if the - * TCP state has been set in tcp_adapt_ire() using a different IRE, - * e.g if an address was not present during an initial connect(), - * tcp_adapt_ire() will set the state using the interface route. - * Subsequently, if the address is added to the local machine, the - * retransmitted SYN will get the correct (loopback) IRE, but the TCP - * state (tcp_loopback and tcp_localnet) will remain out of sync. - * This is especially an issue with TCP fusion which relies on the - * TCP state to be accurate. - * - * This check/change should be made only if the TCP is not yet in - * the established state, else it would lead to inconsistencies. - */ -#define TCP_CHECK_IREINFO(tcp, ire) { \ - if ((tcp)->tcp_state < TCPS_ESTABLISHED) { \ - if (((ire)->ire_type & (IRE_LOOPBACK | \ - IRE_LOCAL)) && !(tcp)->tcp_loopback) { \ - (tcp)->tcp_loopback = B_TRUE; \ - } else if ((tcp)->tcp_loopback && \ - !((ire)->ire_type & (IRE_LOOPBACK | IRE_LOCAL))) { \ - (tcp)->tcp_loopback = B_FALSE; \ - } \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - (tcp)->tcp_localnet = \ - ((ire)->ire_gateway_addr == 0); \ - } else { \ - (tcp)->tcp_localnet = \ - IN6_IS_ADDR_UNSPECIFIED( \ - &(ire)->ire_gateway_addr_v6); \ - } \ - } \ -} - -/* * Write-side flow-control is implemented via the per instance STREAMS * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s) * and clearing QFULL and calling qbackenable() to restart the flow based @@ -205,18 +170,19 @@ typedef struct tcpparam_s { #define tcps_keepalive_abort_interval_high tcps_params[59].tcp_param_max #define tcps_keepalive_abort_interval tcps_params[59].tcp_param_val #define tcps_keepalive_abort_interval_low tcps_params[59].tcp_param_min +#define tcps_dev_flow_ctl tcps_params[60].tcp_param_val extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; extern int tcp_maxpsz_set(tcp_t *, boolean_t); extern void tcp_timers_stop(tcp_t 
*); -extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); +extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *); extern void tcp_push_timer(void *); extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); -extern void tcp_fuse(tcp_t *, uchar_t *, tcph_t *); +extern void tcp_fuse(tcp_t *, uchar_t *, tcpha_t *); extern void tcp_unfuse(tcp_t *); extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t); extern void tcp_fuse_output_urg(tcp_t *, mblk_t *); @@ -242,6 +208,11 @@ extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, extern sock_downcalls_t sock_tcp_downcalls; +extern int tcp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int tcp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 2c151894eb..a254da4b43 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -42,9 +42,6 @@ typedef struct tcp_stat { kstat_named_t tcp_time_wait; kstat_named_t tcp_time_wait_syn; kstat_named_t tcp_time_wait_syn_success; - kstat_named_t tcp_time_wait_syn_fail; - kstat_named_t tcp_reinput_syn; - kstat_named_t tcp_ip_output; kstat_named_t tcp_detach_non_time_wait; kstat_named_t tcp_detach_time_wait; kstat_named_t tcp_time_wait_reap; @@ -82,37 +79,14 @@ typedef struct tcp_stat { kstat_named_t tcp_timermp_freed; kstat_named_t tcp_push_timer_cnt; kstat_named_t tcp_ack_timer_cnt; - kstat_named_t tcp_ire_null1; - kstat_named_t tcp_ire_null; - kstat_named_t tcp_ip_send; - kstat_named_t tcp_ip_ire_send; kstat_named_t tcp_wsrv_called; kstat_named_t tcp_flwctl_on; kstat_named_t tcp_timer_fire_early; kstat_named_t tcp_timer_fire_miss; kstat_named_t tcp_rput_v6_error; - 
kstat_named_t tcp_out_sw_cksum; - kstat_named_t tcp_out_sw_cksum_bytes; kstat_named_t tcp_zcopy_on; kstat_named_t tcp_zcopy_off; kstat_named_t tcp_zcopy_backoff; - kstat_named_t tcp_zcopy_disable; - kstat_named_t tcp_mdt_pkt_out; - kstat_named_t tcp_mdt_pkt_out_v4; - kstat_named_t tcp_mdt_pkt_out_v6; - kstat_named_t tcp_mdt_discarded; - kstat_named_t tcp_mdt_conn_halted1; - kstat_named_t tcp_mdt_conn_halted2; - kstat_named_t tcp_mdt_conn_halted3; - kstat_named_t tcp_mdt_conn_resumed1; - kstat_named_t tcp_mdt_conn_resumed2; - kstat_named_t tcp_mdt_legacy_small; - kstat_named_t tcp_mdt_legacy_all; - kstat_named_t tcp_mdt_legacy_ret; - kstat_named_t tcp_mdt_allocfail; - kstat_named_t tcp_mdt_addpdescfail; - kstat_named_t tcp_mdt_allocd; - kstat_named_t tcp_mdt_linked; kstat_named_t tcp_fusion_flowctl; kstat_named_t tcp_fusion_backenabled; kstat_named_t tcp_fusion_urg; @@ -154,15 +128,6 @@ struct tcp_stack { mib2_tcp_t tcps_mib; - /* Protected by tcps_g_q_lock */ - queue_t *tcps_g_q; /* Default queue */ - uint_t tcps_refcnt; /* Total number of tcp_t's */ - kmutex_t tcps_g_q_lock; - kcondvar_t tcps_g_q_cv; - kthread_t *tcps_g_q_creator; - struct __ldi_handle *tcps_g_q_lh; - cred_t *tcps_g_q_cr; /* For _inactive close call */ - /* * Extra privileged ports. In host byte order. * Protected by tcp_epriv_port_lock. @@ -182,9 +147,6 @@ struct tcp_stack { caddr_t tcps_g_nd; struct tcpparam_s *tcps_params; /* ndd parameters */ struct tcpparam_s *tcps_wroff_xtra_param; - struct tcpparam_s *tcps_mdt_head_param; - struct tcpparam_s *tcps_mdt_tail_param; - struct tcpparam_s *tcps_mdt_max_pbufs_param; /* Hint not protected by any lock */ uint_t tcps_next_port_to_try; @@ -222,6 +184,11 @@ struct tcp_stack { /* The number of RST not sent because of the rate limit. 
*/ uint32_t tcps_rst_unsent; ldi_ident_t tcps_ldi_ident; + + /* Used to synchronize access when reclaiming memory */ + mblk_t *tcps_ixa_cleanup_mp; + kmutex_t tcps_ixa_cleanup_lock; + kcondvar_t tcps_ixa_cleanup_cv; }; typedef struct tcp_stack tcp_stack_t; diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index d0bab511b0..e18fc57f40 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -26,12 +26,9 @@ #include <sys/types.h> #include <sys/stream.h> -#include <sys/dlpi.h> -#include <sys/pattr.h> #include <sys/stropts.h> #include <sys/strlog.h> #include <sys/strsun.h> -#include <sys/time.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/timod.h> @@ -41,7 +38,9 @@ #include <sys/suntpi.h> #include <sys/xti_inet.h> #include <sys/kmem.h> +#include <sys/cred_impl.h> #include <sys/policy.h> +#include <sys/priv.h> #include <sys/ucred.h> #include <sys/zone.h> @@ -57,12 +56,11 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/udp.h> -#include <net/if.h> -#include <net/route.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip_impl.h> +#include <inet/ipsec_impl.h> #include <inet/ip6.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> @@ -74,34 +72,25 @@ #include <inet/optcom.h> #include <inet/snmpcom.h> #include <inet/kstatcom.h> -#include <inet/udp_impl.h> #include <inet/ipclassifier.h> -#include <inet/ipsec_impl.h> -#include <inet/ipp_common.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> #include <sys/ethernet.h> -/* - * The ipsec_info.h header file is here since it has the definition for the - * M_CTL message types used by IP to convey information to the ULP. The - * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence. 
- */ -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> - #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> #include <rpc/pmap_prot.h> +#include <inet/udp_impl.h> + /* * Synchronization notes: * * UDP is MT and uses the usual kernel synchronization primitives. There are 2 - * locks, the fanout lock (uf_lock) and the udp endpoint lock udp_rwlock. - * We also use conn_lock when updating things that affect the IP classifier - * lookup. - * The lock order is udp_rwlock -> uf_lock and is udp_rwlock -> conn_lock. + * locks, the fanout lock (uf_lock) and conn_lock. conn_lock + * protects the contents of the udp_t. uf_lock protects the address and the + * fanout information. + * The lock order is conn_lock -> uf_lock. * * The fanout lock uf_lock: * When a UDP endpoint is bound to a local port, it is inserted into @@ -114,11 +103,6 @@ * from the bind hash list only when it is being unbound or being closed. * The per bucket lock also protects a UDP endpoint's state changes. * - * The udp_rwlock: - * This protects most of the other fields in the udp_t. The exact list of - * fields which are protected by each of the above locks is documented in - * the udp_t structure definition. - * * Plumbing notes: * UDP is always a device driver. 
For compatibility with mibopen() code * it is possible to I_PUSH "udp", but that results in pushing a passthrough @@ -133,41 +117,32 @@ /* For /etc/system control */ uint_t udp_bind_fanout_size = UDP_BIND_FANOUT_SIZE; -/* Option processing attrs */ -typedef struct udpattrs_s { - union { - ip6_pkt_t *udpattr_ipp6; /* For V6 */ - ip4_pkt_t *udpattr_ipp4; /* For V4 */ - } udpattr_ippu; -#define udpattr_ipp6 udpattr_ippu.udpattr_ipp6 -#define udpattr_ipp4 udpattr_ippu.udpattr_ipp4 - mblk_t *udpattr_mb; - boolean_t udpattr_credset; -} udpattrs_t; - static void udp_addr_req(queue_t *q, mblk_t *mp); static void udp_tpi_bind(queue_t *q, mblk_t *mp); static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp); static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock); -static int udp_build_hdrs(udp_t *udp); +static int udp_build_hdr_template(conn_t *, const in6_addr_t *, + const in6_addr_t *, in_port_t, uint32_t); static void udp_capability_req(queue_t *q, mblk_t *mp); static int udp_tpi_close(queue_t *q, int flags); +static void udp_close_free(conn_t *); static void udp_tpi_connect(queue_t *q, mblk_t *mp); static void udp_tpi_disconnect(queue_t *q, mblk_t *mp); static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, - int sys_error); -static void udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, - t_scalar_t tlierr, int unixerr); + int sys_error); +static void udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, + t_scalar_t tlierr, int sys_error); static int udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void udp_icmp_error(conn_t *, mblk_t *); -static void udp_icmp_error_ipv6(conn_t *, mblk_t *); +static void udp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void 
udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); static void udp_info_req(queue_t *q, mblk_t *mp); -static void udp_input(void *, mblk_t *, void *); +static void udp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static void udp_lrput(queue_t *, mblk_t *); static void udp_lwput(queue_t *, mblk_t *); static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, @@ -176,24 +151,34 @@ static int udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); -static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp, - int *errorp, udpattrs_t *udpattrs); static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); +int udp_opt_set(conn_t *connp, uint_t optset_context, + int level, int name, uint_t inlen, + uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr); +int udp_opt_get(conn_t *connp, int level, int name, + uchar_t *ptr); +static int udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, + pid_t pid); +static int udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, + pid_t pid, ip_xmit_attr_t *ixa); +static int udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, + sin6_t *sin6, ushort_t ipversion, cred_t *cr, pid_t, + ip_xmit_attr_t *ixa); static int udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t udp_param_register(IDP *ndp, udpparam_t *udppa, int cnt); static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, - ipha_t *ipha); -static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, - t_scalar_t destlen, t_scalar_t err); +static mblk_t *udp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, in_port_t, uint32_t, mblk_t *, + int *); +static mblk_t *udp_prepend_header_template(conn_t *, 
ip_xmit_attr_t *, + mblk_t *, const in6_addr_t *, in_port_t, uint32_t, int *); +static void udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); +static void udp_ud_err_connected(conn_t *, t_scalar_t); static void udp_tpi_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); -static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, - int *, boolean_t, struct nmsghdr *, cred_t *, pid_t); -static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, - int *error, struct nmsghdr *msg, cred_t *cr, pid_t pid); static void udp_wput_other(queue_t *q, mblk_t *mp); static void udp_wput_iocdata(queue_t *q, mblk_t *mp); static void udp_wput_fallback(queue_t *q, mblk_t *mp); @@ -208,11 +193,9 @@ static void *udp_kstat2_init(netstackid_t, udp_stat_t *); static void udp_kstat2_fini(netstackid_t, kstat_t *); static int udp_kstat_update(kstat_t *kp, int rw); -static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t); -static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *, - cred_t *, pid_t); -static void udp_ulp_recv(conn_t *, mblk_t *); +/* Common routines for TPI and socket module */ +static void udp_ulp_recv(conn_t *, mblk_t *, uint_t, ip_recv_attr_t *); /* Common routine for TPI and socket module */ static conn_t *udp_do_open(cred_t *, boolean_t, int); @@ -220,30 +203,20 @@ static void udp_do_close(conn_t *); static int udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, boolean_t); static int udp_do_unbind(conn_t *); -static int udp_do_getsockname(udp_t *, struct sockaddr *, uint_t *); -static int udp_do_getpeername(udp_t *, struct sockaddr *, uint_t *); int udp_getsockname(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); int udp_getpeername(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); static int udp_do_connect(conn_t *, const struct sockaddr *, socklen_t, - cred_t *cr); -static int udp_post_ip_bind_connect(udp_t 
*, mblk_t *, int); + cred_t *, pid_t); #define UDP_RECV_HIWATER (56 * 1024) #define UDP_RECV_LOWATER 128 #define UDP_XMIT_HIWATER (56 * 1024) #define UDP_XMIT_LOWATER 1024 -/* - * The following is defined in tcp.c - */ -extern int (*cl_inet_connect2)(netstackid_t stack_id, - uint8_t protocol, boolean_t is_outgoing, - sa_family_t addr_family, - uint8_t *laddrp, in_port_t lport, - uint8_t *faddrp, in_port_t fport, void *args); +#pragma inline(udp_output_connected, udp_output_newdst, udp_output_lastdst) /* * Checks if the given destination addr/port is allowed out. @@ -251,7 +224,7 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id, * Called for each connect() and for sendto()/sendmsg() to a different * destination. * For connect(), called in udp_connect(). - * For sendto()/sendmsg(), called in udp_output_v{4,6}(). + * For sendto()/sendmsg(), called in udp_output_newdst(). * * This macro assumes that the cl_inet_connect2 hook is not NULL. * Please check this before calling this macro. 
@@ -260,25 +233,26 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id, * CL_INET_UDP_CONNECT(conn_t cp, udp_t *udp, boolean_t is_outgoing, * in6_addr_t *faddrp, in_port_t (or uint16_t) fport, int err); */ -#define CL_INET_UDP_CONNECT(cp, udp, is_outgoing, faddrp, fport, err) { \ +#define CL_INET_UDP_CONNECT(cp, is_outgoing, faddrp, fport, err) { \ (err) = 0; \ /* \ * Running in cluster mode - check and register active \ * "connection" information \ */ \ - if ((udp)->udp_ipversion == IPV4_VERSION) \ + if ((cp)->conn_ipversion == IPV4_VERSION) \ (err) = (*cl_inet_connect2)( \ (cp)->conn_netstack->netstack_stackid, \ IPPROTO_UDP, is_outgoing, AF_INET, \ - (uint8_t *)&((udp)->udp_v6src._S6_un._S6_u32[3]), \ - (udp)->udp_port, \ - (uint8_t *)&((faddrp)->_S6_un._S6_u32[3]), \ + (uint8_t *)&((cp)->conn_laddr_v4), \ + (cp)->conn_lport, \ + (uint8_t *)&(V4_PART_OF_V6(*faddrp)), \ (in_port_t)(fport), NULL); \ else \ (err) = (*cl_inet_connect2)( \ (cp)->conn_netstack->netstack_stackid, \ IPPROTO_UDP, is_outgoing, AF_INET6, \ - (uint8_t *)&((udp)->udp_v6src), (udp)->udp_port, \ + (uint8_t *)&((cp)->conn_laddr_v6), \ + (cp)->conn_lport, \ (uint8_t *)(faddrp), (in_port_t)(fport), NULL); \ } @@ -387,6 +361,8 @@ udpparam_t udp_param_arr[] = { { 0, (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"}, { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER, "udp_recv_hiwat"}, { 65536, (1<<30), 2*1024*1024, "udp_max_buf"}, + { 0, 1, 0, "udp_pmtu_discovery" }, + { 0, 1, 0, "udp_sendto_ignerr" }, }; /* END CSTYLED */ @@ -451,9 +427,10 @@ retry: static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) { - udp_t *udpnext; - kmutex_t *lockp; - udp_stack_t *us = udp->udp_us; + udp_t *udpnext; + kmutex_t *lockp; + udp_stack_t *us = udp->udp_us; + conn_t *connp = udp->udp_connp; if (udp->udp_ptpbhn == NULL) return; @@ -462,9 +439,9 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) * Extract the lock pointer in case there are concurrent * hash_remove's for this instance. 
*/ - ASSERT(udp->udp_port != 0); + ASSERT(connp->conn_lport != 0); if (!caller_holds_lock) { - lockp = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + lockp = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)].uf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -486,8 +463,10 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) { + conn_t *connp = udp->udp_connp; udp_t **udpp; udp_t *udpnext; + conn_t *connext; ASSERT(MUTEX_HELD(&uf->uf_lock)); ASSERT(udp->udp_ptpbhn == NULL); @@ -503,11 +482,11 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) * specific address get preference over those binding to * INADDR_ANY. */ - if (V6_OR_V4_INADDR_ANY(udp->udp_bound_v6src) && - !V6_OR_V4_INADDR_ANY(udpnext->udp_bound_v6src)) { + connext = udpnext->udp_connp; + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { while ((udpnext = udpp[0]) != NULL && - !V6_OR_V4_INADDR_ANY( - udpnext->udp_bound_v6src)) { + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { udpp = &(udpnext->udp_bind_hash); } if (udpnext != NULL) @@ -525,10 +504,9 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message * passed to udp_wput. * It associates a port number and local address with the stream. - * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the UDP - * protocol type (IPPROTO_UDP) placed in the message following the address. - * A T_BIND_ACK message is passed upstream when ip acknowledges the request. - * (Called as writer.) + * It calls IP to verify the local IP address, and calls IP to insert + * the conn_t in the fanout table. + * If everything is ok it then sends the T_BIND_ACK back up. * * Note that UDP over IPv4 and IPv6 sockets can use the same port number * without setting SO_REUSEADDR. 
This is needed so that they @@ -580,10 +558,10 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) } /* * Reallocate the message to make sure we have enough room for an - * address and the protocol type. + * address. */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); - if (!mp1) { + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); + if (mp1 == NULL) { udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } @@ -597,7 +575,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) switch (tbr->ADDR_length) { case 0: /* Request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -605,7 +583,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) mp->b_wptr = (uchar_t *)&sin[1]; sa = (struct sockaddr *)sin; } else { - ASSERT(udp->udp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -622,7 +600,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - if (udp->udp_family != AF_INET || + if (connp->conn_family != AF_INET || sa->sa_family != AF_INET) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; @@ -636,7 +614,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - if (udp->udp_family != AF_INET6 || + if (connp->conn_family != AF_INET6 || sa->sa_family != AF_INET6) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; @@ -669,29 +647,21 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) * This routine handles each T_CONN_REQ message passed to udp. It * associates a default destination address with the stream. 
* - * This routine sends down a T_BIND_REQ to IP with the following mblks: - * T_BIND_REQ - specifying local and remote address/port - * IRE_DB_REQ_TYPE - to get an IRE back containing ire_type and src - * T_OK_ACK - for the T_CONN_REQ - * T_CONN_CON - to keep the TPI user happy - * - * The connect completes in udp_do_connect. - * When a T_BIND_ACK is received information is extracted from the IRE - * and the two appended messages are sent to the TPI user. - * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will - * convert it to an error ack for the appropriate primitive. + * After various error checks are completed, udp_connect() lays + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we send up the T_OK_ACK reply message. */ static void udp_tpi_connect(queue_t *q, mblk_t *mp) { - udp_t *udp; conn_t *connp = Q_TO_CONN(q); int error; socklen_t len; struct sockaddr *sa; struct T_conn_req *tcr; cred_t *cr; - + pid_t pid; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. @@ -699,14 +669,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * like a TPI message sent by some other kernel * component, we check and return an error. */ - cr = msg_getcred(mp, NULL); + cr = msg_getcred(mp, &pid); ASSERT(cr != NULL); if (cr == NULL) { udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - udp = connp->conn_udp; tcr = (struct T_conn_req *)mp->b_rptr; /* A bit of sanity checking */ @@ -724,7 +693,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. 
*/ len = tcr->DEST_length; switch (tcr->DEST_length) { @@ -743,13 +712,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) break; } - error = proto_verify_ip_addr(udp->udp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { udp_err_ack(q, mp, TSYSERR, error); return; } - error = udp_do_connect(connp, sa, len, cr); + error = udp_do_connect(connp, sa, len, cr, pid); if (error != 0) { if (error < 0) udp_err_ack(q, mp, -error, 0); @@ -761,7 +730,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * We have to send a connection confirmation to * keep TLI happy. */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin_t), NULL, 0); } else { @@ -810,72 +779,14 @@ done: return (0); } -/* - * Called in the close path to quiesce the conn - */ -void -udp_quiesce_conn(conn_t *connp) -{ - udp_t *udp = connp->conn_udp; - - if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) { - /* - * Running in cluster mode - register unbind information - */ - if (udp->udp_ipversion == IPV4_VERSION) { - (*cl_inet_unbind)( - connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET, - (uint8_t *)(&(V4_PART_OF_V6(udp->udp_v6src))), - (in_port_t)udp->udp_port, NULL); - } else { - (*cl_inet_unbind)( - connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET6, - (uint8_t *)(&(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); - } - } - - udp_bind_hash_remove(udp, B_FALSE); - -} - -void +static void udp_close_free(conn_t *connp) { udp_t *udp = connp->conn_udp; /* If there are any options associated with the stream, free them. 
*/ - if (udp->udp_ip_snd_options != NULL) { - mi_free((char *)udp->udp_ip_snd_options); - udp->udp_ip_snd_options = NULL; - udp->udp_ip_snd_options_len = 0; - } - - if (udp->udp_ip_rcv_options != NULL) { - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options = NULL; - udp->udp_ip_rcv_options_len = 0; - } - - /* Free memory associated with sticky options */ - if (udp->udp_sticky_hdrs_len != 0) { - kmem_free(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len); - udp->udp_sticky_hdrs = NULL; - udp->udp_sticky_hdrs_len = 0; - } - if (udp->udp_last_cred != NULL) { - crfree(udp->udp_last_cred); - udp->udp_last_cred = NULL; - } - if (udp->udp_effective_cred != NULL) { - crfree(udp->udp_effective_cred); - udp->udp_effective_cred = NULL; - } - - ip6_pkt_free(&udp->udp_sticky_ipp); + if (udp->udp_recv_ipp.ipp_fields != 0) + ip_pkt_free(&udp->udp_recv_ipp); /* * Clear any fields which the kmem_cache constructor clears. @@ -892,59 +803,48 @@ static int udp_do_disconnect(conn_t *connp) { udp_t *udp; - mblk_t *ire_mp; udp_fanout_t *udpf; udp_stack_t *us; int error; udp = connp->conn_udp; us = udp->udp_us; - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - udp->udp_pending_op = T_DISCON_REQ; - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; mutex_enter(&udpf->uf_lock); - udp->udp_v6src = udp->udp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; udp->udp_state = TS_IDLE; mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) { - /* Rebuild 
the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (error); - } - } + /* Remove any remnants of mapped address binding */ + if (connp->conn_family == AF_INET6) + connp->conn_ipversion = IPV6_VERSION; - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - mutex_enter(&udpf->uf_lock); - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - - rw_exit(&udp->udp_rwlock); - - if (udp->udp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, IPPROTO_UDP, - &udp->udp_bound_v6src, udp->udp_port, B_TRUE); - } else { - error = ip_proto_bind_laddr_v4(connp, &ire_mp, IPPROTO_UDP, - V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, B_TRUE); - } + connp->conn_v6lastdst = ipv6_all_zeros; + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - return (udp_post_ip_bind_connect(udp, ire_mp, error)); + /* + * Tell IP to remove the full binding and revert + * to the local address binding. 
+ */ + return (ip_laddr_fanout_insert(connp)); } - static void udp_tpi_disconnect(queue_t *q, mblk_t *mp) { @@ -981,12 +881,9 @@ int udp_disconnect(conn_t *connp) { int error; - udp_t *udp = connp->conn_udp; - - udp->udp_dgram_errind = B_FALSE; + connp->conn_dgram_errind = B_FALSE; error = udp_do_disconnect(connp); - if (error < 0) error = proto_tlitosyserr(-error); @@ -1003,8 +900,8 @@ udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) /* Shorthand to generate and send TPI error acks to our client */ static void -udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, - int sys_error) +udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, + t_scalar_t t_error, int sys_error) { struct T_error_ack *teackp; @@ -1018,7 +915,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, } } -/*ARGSUSED*/ +/*ARGSUSED2*/ static int udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { @@ -1033,7 +930,7 @@ udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) @@ -1072,7 +969,7 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) @@ -1109,39 +1006,41 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, #define ICMP_MIN_UDP_HDR 4 /* - * udp_icmp_error is called by udp_input to process ICMP msgs. passed up by IP. + * udp_icmp_input is called as conn_recvicmp to process ICMP messages. * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ +/* ARGSUSED2 */ static void -udp_icmp_error(conn_t *connp, mblk_t *mp) +udp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - udpha_t *udpha; - sin_t sin; - sin6_t sin6; - mblk_t *mp1; - int error = 0; - udp_t *udp = connp->conn_udp; + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + udpha_t *udpha; + sin_t sin; + sin6_t sin6; + mblk_t *mp1; + int error = 0; + udp_t *udp = connp->conn_udp; - mp1 = NULL; ipha = (ipha_t *)mp->b_rptr; ASSERT(OK_32PTR(mp->b_rptr)); if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - udp_icmp_error_ipv6(connp, mp); + udp_icmp_error_ipv6(connp, mp, ira); return; } ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); + ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); + iph_hdr_length = ira->ira_ip_hdr_length; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; + ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); @@ -1150,11 +1049,41 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { - case ICMP_FRAGMENTATION_NEEDED: + case ICMP_FRAGMENTATION_NEEDED: { + ipha_t *ipha; + ip_xmit_attr_t *ixa; /* * IP has already adjusted the path MTU. + * But we need to adjust DF for IPv4. */ + if (connp->conn_ipversion != IPV4_VERSION) + break; + + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL || ixa->ixa_ire == NULL) { + /* + * Some other thread holds conn_ixa. We will + * redo this on the next ICMP too big. 
+ */ + if (ixa != NULL) + ixa_refrele(ixa); + break; + } + (void) ip_get_pmtu(ixa); + + mutex_enter(&connp->conn_lock); + ipha = (ipha_t *)connp->conn_ht_iphc; + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); break; + } case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: error = ECONNREFUSED; @@ -1177,25 +1106,24 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. */ - if (!udp->udp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } - - switch (udp->udp_family) { + switch (connp->conn_family) { case AF_INET: sin = sin_null; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ipha->ipha_dst; sin.sin_port = udpha->uha_dst_port; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin.sin_port == udp->udp_dstport && + if (sin.sin_port == connp->conn_fport && sin.sin_addr.s_addr == - V4_PART_OF_V6(udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + connp->conn_faddr_v4) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1204,10 +1132,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin_t *)&udp->udp_delayed_addr) = sin; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } break; case AF_INET6: @@ -1216,12 +1146,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr); sin6.sin6_port = udpha->uha_dst_port; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, 
RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin6.sin6_port == udp->udp_dstport && + if (sin6.sin6_port == connp->conn_fport && IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1230,17 +1160,16 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin6_t *)&udp->udp_delayed_addr) = sin6; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } break; } - if (mp1 != NULL) - putnext(connp->conn_rq, mp1); done: - ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -1251,7 +1180,7 @@ done: * ICMPv6 header. */ static void -udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) +udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1265,12 +1194,19 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) udp_stack_t *us = udp->udp_us; outer_ip6h = (ip6_t *)mp->b_rptr; +#ifdef DEBUG if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); else iph_hdr_length = IPV6_HDR_LEN; + ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); +#endif + /* Skip past the outer IP and ICMP headers */ + iph_hdr_length = ira->ira_ip_hdr_length; icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; - ip6h = (ip6_t *)&icmp6[1]; + + /* Skip past the inner IP and find the ULP header */ + ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { freemsg(mp); return; @@ -1308,7 +1244,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * information, send up an empty message containing an * IPV6_PATHMTU ancillary data item. 
*/ - if (!udp->udp_ipv6_recvpathmtu) + if (!connp->conn_ipv6_recvpathmtu) break; udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + @@ -1334,7 +1270,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6 = (sin6_t *)&tudi[1]; bzero(sin6, sizeof (sin6_t)); sin6->sin6_family = AF_INET6; - sin6->sin6_addr = udp->udp_v6dst; + sin6->sin6_addr = connp->conn_faddr_v6; toh = (struct T_opthdr *)&sin6[1]; toh->level = IPPROTO_IPV6; @@ -1352,8 +1288,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - udp_ulp_recv(connp, newmp); - + udp_ulp_recv(connp, newmp, msgdsize(newmp), ira); return; } case ICMP6_TIME_EXCEEDED: @@ -1378,7 +1313,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. */ - if (!udp->udp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1390,12 +1325,12 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin6.sin6_port == udp->udp_dstport && + if (sin6.sin6_port == connp->conn_fport && IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1404,7 +1339,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin6_t *)&udp->udp_delayed_addr) = sin6; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); @@ -1412,7 +1347,6 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) putnext(connp->conn_rq, mp1); } done: - 
ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -1426,11 +1360,12 @@ done: static void udp_addr_req(queue_t *q, mblk_t *mp) { - sin_t *sin; - sin6_t *sin6; + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; udp_t *udp = Q_TO_UDP(q); + conn_t *connp = udp->udp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -1446,7 +1381,13 @@ udp_addr_req(queue_t *q, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; - rw_enter(&udp->udp_rwlock, RW_READER); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + mutex_enter(&connp->conn_lock); /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. @@ -1456,91 +1397,23 @@ udp_addr_req(queue_t *q, mblk_t *mp) * Fill in local address first */ taa->LOCADDR_offset = sizeof (*taa); - if (udp->udp_family == AF_INET) { - taa->LOCADDR_length = sizeof (sin_t); - sin = (sin_t *)&taa[1]; - /* Fill zeroes and then initialize non-zero fields */ - *sin = sin_null; - sin->sin_family = AF_INET; - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src, - sin->sin_addr.s_addr); - } else { - /* - * INADDR_ANY - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - IN6_V4MAPPED_TO_IPADDR(&udp->udp_bound_v6src, - sin->sin_addr.s_addr); - } - sin->sin_port = udp->udp_port; - ackmp->b_wptr = (uchar_t *)&sin[1]; - if (udp->udp_state == TS_DATA_XFER) { - /* - * connected, fill remote address too - */ - taa->REMADDR_length = sizeof (sin_t); - /* assumed 32-bit alignment */ - taa->REMADDR_offset = taa->LOCADDR_offset + - taa->LOCADDR_length; - - sin = (sin_t *)(ackmp->b_rptr + - taa->REMADDR_offset); - /* initialize */ - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_v6dst); - sin->sin_port = udp->udp_dstport; - ackmp->b_wptr = (uchar_t *)&sin[1]; - } - } else { - taa->LOCADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&taa[1]; - /* Fill zeroes and then initialize non-zero fields */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin6->sin6_addr = udp->udp_v6src; - } else { - /* - * UNSPECIFIED - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = - udp->udp_bound_v6src; - } - sin6->sin6_port = udp->udp_port; - ackmp->b_wptr = (uchar_t *)&sin6[1]; - if (udp->udp_state == TS_DATA_XFER) { - /* - * connected, fill remote address too - */ - taa->REMADDR_length = sizeof (sin6_t); - /* assumed 32-bit alignment */ - taa->REMADDR_offset = taa->LOCADDR_offset + - taa->LOCADDR_length; - - sin6 = (sin6_t *)(ackmp->b_rptr + - taa->REMADDR_offset); - /* initialize */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_port = udp->udp_dstport; - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - rw_exit(&udp->udp_rwlock); + if (udp->udp_state == TS_DATA_XFER) { + /* + * connected, fill remote address too + */ + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + mutex_exit(&connp->conn_lock); ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); qreply(q, ackmp); } @@ -1548,7 +1421,9 @@ udp_addr_req(queue_t *q, mblk_t *mp) static void udp_copy_info(struct T_info_ack *tap, udp_t *udp) { - if (udp->udp_family == AF_INET) { + conn_t *connp = udp->udp_connp; + + if (connp->conn_family == AF_INET) { *tap = udp_g_t_info_ack_ipv4; } else { *tap = udp_g_t_info_ack_ipv6; @@ -1632,20 +1507,15 @@ udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * This is the open routine for udp. It allocates a udp_t structure for * the stream and, on the first open of the module, creates an ND table. 
*/ -/*ARGSUSED2*/ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6) { - int error; udp_t *udp; conn_t *connp; dev_t conn_dev; - udp_stack_t *us; vmem_t *minor_arena; - TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q); - /* If the stream is already open, return immediately. */ if (q->q_ptr != NULL) return (0); @@ -1685,7 +1555,6 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (ENOMEM); } udp = connp->conn_udp; - us = udp->udp_us; *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); connp->conn_dev = conn_dev; @@ -1699,39 +1568,27 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_rq = q; connp->conn_wq = WR(q); - rw_enter(&udp->udp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_UDP); + /* + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + ASSERT(connp->conn_proto == IPPROTO_UDP); ASSERT(connp->conn_udp == udp); ASSERT(udp->udp_connp == connp); if (flag & SO_SOCKSTR) { - connp->conn_flags |= IPCL_SOCKET; udp->udp_issocket = B_TRUE; } - q->q_hiwat = us->us_recv_hiwat; - WR(q)->q_hiwat = us->us_xmit_hiwat; - WR(q)->q_lowat = us->us_xmit_lowat; + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; qprocson(q); - if (udp->udp_family == AF_INET6) { - /* Build initial header template for transmit */ - if ((error = udp_build_hdrs(udp)) != 0) { - rw_exit(&udp->udp_rwlock); - qprocsoff(q); - inet_minor_free(minor_arena, conn_dev); - ipcl_conn_destroy(connp); - return (error); - } - } - rw_exit(&udp->udp_rwlock); - /* Set the Stream head write offset and high watermark. */ - (void) proto_set_tx_wroff(q, connp, - udp->udp_max_hdr_len + us->us_wroff_extra); - /* XXX udp_set_rcv_hiwat() doesn't hold the lock, is it a bug??? 
*/ - (void) proto_set_rx_hiwat(q, connp, udp_set_rcv_hiwat(udp, q->q_hiwat)); + (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); + (void) proto_set_rx_hiwat(q, connp, + udp_set_rcv_hiwat(udp, connp->conn_rcvbuf)); mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -1753,7 +1610,6 @@ udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) * This routine gets default values of certain options whose default * values are maintained by protcol specific code */ -/* ARGSUSED */ int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { @@ -1791,456 +1647,127 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ -static int -udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) +int +udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, + uchar_t *ptr) { - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; int *i1 = (int *)ptr; - ip6_pkt_t *ipp = &udp->udp_sticky_ipp; + udp_t *udp = connp->conn_udp; int len; + conn_opt_arg_t coas; + int retval; - ASSERT(RW_READ_HELD(&udp->udp_rwlock)); - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = udp->udp_debug; - break; /* goto sizeof (int) option return */ - case SO_REUSEADDR: - *i1 = udp->udp_reuseaddr; - break; /* goto sizeof (int) option return */ - case SO_TYPE: - *i1 = SOCK_DGRAM; - break; /* goto sizeof (int) option return */ + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + /* + * We assume that the optcom framework has checked for the set + * of levels and names that are supported, hence we don't worry + * about rejecting based on that. 
+ * First check for UDP specific handling, then pass to common routine. + */ + switch (level) { + case IPPROTO_IP: /* - * The following three items are available here, - * but are only meaningful to IP. + * Only allow IPv4 option processing on IPv4 sockets. */ - case SO_DONTROUTE: - *i1 = udp->udp_dontroute; - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - *i1 = udp->udp_useloopback; - break; /* goto sizeof (int) option return */ - case SO_BROADCAST: - *i1 = udp->udp_broadcast; - break; /* goto sizeof (int) option return */ - - case SO_SNDBUF: - *i1 = udp->udp_xmit_hiwat; - break; /* goto sizeof (int) option return */ - case SO_RCVBUF: - *i1 = udp->udp_rcv_disply_hiwat; - break; /* goto sizeof (int) option return */ - case SO_DGRAM_ERRIND: - *i1 = udp->udp_dgram_errind; - break; /* goto sizeof (int) option return */ - case SO_RECVUCRED: - *i1 = udp->udp_recvucred; - break; /* goto sizeof (int) option return */ - case SO_TIMESTAMP: - *i1 = udp->udp_timestamp; - break; /* goto sizeof (int) option return */ - case SO_ANON_MLP: - *i1 = connp->conn_anon_mlp; - break; /* goto sizeof (int) option return */ - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones; - break; /* goto sizeof (int) option return */ - case SO_EXCLBIND: - *i1 = udp->udp_exclbind ? 
SO_EXCLBIND : 0; - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_UDP; - break; - case SO_DOMAIN: - *i1 = udp->udp_family; - break; - default: - return (-1); - } - break; - case IPPROTO_IP: - if (udp->udp_family != AF_INET) + if (connp->conn_family != AF_INET) return (-1); + switch (name) { case IP_OPTIONS: case T_IP_OPTIONS: - len = udp->udp_ip_rcv_options_len - udp->udp_label_len; - if (len > 0) { - bcopy(udp->udp_ip_rcv_options + - udp->udp_label_len, ptr, len); - } - return (len); - case IP_TOS: - case T_IP_TOS: - *i1 = (int)udp->udp_type_of_service; - break; /* goto sizeof (int) option return */ - case IP_TTL: - *i1 = (int)udp->udp_ttl; - break; /* goto sizeof (int) option return */ - case IP_DHCPINIT_IF: - return (-EINVAL); - case IP_NEXTHOP: - case IP_RECVPKTINFO: - /* - * This also handles IP_PKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - * This option is handled in IP which will return an - * error for IP_PKTINFO as it's not supported as a - * sticky option. 
- */ - return (-EINVAL); - case IP_MULTICAST_IF: - /* 0 address if not set */ - *(ipaddr_t *)ptr = udp->udp_multicast_if_addr; - return (sizeof (ipaddr_t)); - case IP_MULTICAST_TTL: - *(uchar_t *)ptr = udp->udp_multicast_ttl; - return (sizeof (uchar_t)); - case IP_MULTICAST_LOOP: - *ptr = connp->conn_multicast_loop; - return (sizeof (uint8_t)); - case IP_RECVOPTS: - *i1 = udp->udp_recvopts; - break; /* goto sizeof (int) option return */ - case IP_RECVDSTADDR: - *i1 = udp->udp_recvdstaddr; - break; /* goto sizeof (int) option return */ - case IP_RECVIF: - *i1 = udp->udp_recvif; - break; /* goto sizeof (int) option return */ - case IP_RECVSLLA: - *i1 = udp->udp_recvslla; - break; /* goto sizeof (int) option return */ - case IP_RECVTTL: - *i1 = udp->udp_recvttl; - break; /* goto sizeof (int) option return */ - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - return (-1); - case IP_BOUND_IF: - /* Zero if not set */ - *i1 = udp->udp_bound_if; - break; /* goto sizeof (int) option return */ - case IP_UNSPEC_SRC: - *i1 = udp->udp_unspec_source; - break; /* goto sizeof (int) option return */ - case IP_BROADCAST_TTL: - *(uchar_t *)ptr = connp->conn_broadcast_ttl; - return (sizeof (uchar_t)); - default: - return (-1); - } - break; - case IPPROTO_IPV6: - if (udp->udp_family != AF_INET6) - return (-1); - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int)udp->udp_ttl; - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_IF: - /* 0 index if not set */ - *i1 = udp->udp_multicast_if_index; - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_HOPS: - *i1 = udp->udp_multicast_ttl; - break; /* goto sizeof (int) 
option return */ - case IPV6_MULTICAST_LOOP: - *i1 = connp->conn_multicast_loop; - break; /* goto sizeof (int) option return */ - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - return (-1); - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = udp->udp_bound_if; - break; /* goto sizeof (int) option return */ - case IPV6_UNSPEC_SRC: - *i1 = udp->udp_unspec_source; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - *i1 = udp->udp_ip_recvpktinfo; - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - *i1 = udp->udp_ipv6_recvtclass; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPATHMTU: - *i1 = udp->udp_ipv6_recvpathmtu; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - *i1 = udp->udp_ipv6_recvhoplimit; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - *i1 = udp->udp_ipv6_recvhopopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - *i1 = udp->udp_ipv6_recvdstopts; - break; /* goto sizeof (int) option return */ - case _OLD_IPV6_RECVDSTOPTS: - *i1 = udp->udp_old_ipv6_recvdstopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - *i1 = udp->udp_ipv6_recvrthdrdstopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - *i1 = udp->udp_ipv6_recvrthdr; - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); - } - case IPV6_TCLASS: - if (ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; /* goto sizeof (int) option return */ - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= udp->udp_label_len_v6) + mutex_enter(&connp->conn_lock); + if (!(udp->udp_recv_ipp.ipp_fields & + IPPF_IPV4_OPTIONS)) { + mutex_exit(&connp->conn_lock); return (0); - /* - * The cipso/label option is added by kernel. - * User is not usually aware of this option. - * We copy out the hbh opt after the label option. 
- */ - bcopy((char *)ipp->ipp_hopopts + udp->udp_label_len_v6, - ptr, ipp->ipp_hopoptslen - udp->udp_label_len_v6); - if (udp->udp_label_len_v6 > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - udp->udp_label_len_v6 + 7) / 8 - 1; } - return (ipp->ipp_hopoptslen - udp->udp_label_len_v6); - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); - case IPV6_PATHMTU: - return (ip_fill_mtuinfo(&udp->udp_v6dst, - udp->udp_dstport, (struct ip6_mtuinfo *)ptr, - us->us_netstack)); - default: - return (-1); + + len = udp->udp_recv_ipp.ipp_ipv4_options_len; + ASSERT(len != 0); + bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len); + mutex_exit(&connp->conn_lock); + return (len); } break; case IPPROTO_UDP: switch (name) { - case UDP_ANONPRIVBIND: - *i1 = udp->udp_anon_priv_bind; - break; - case UDP_EXCLBIND: - *i1 = udp->udp_exclbind ? UDP_EXCLBIND : 0; - break; - case UDP_RCVHDR: - *i1 = udp->udp_rcvhdr ? 1 : 0; - break; case UDP_NAT_T_ENDPOINT: + mutex_enter(&connp->conn_lock); *i1 = udp->udp_nat_t_endpoint; - break; - default: - return (-1); + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_RCVHDR: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_rcvhdr ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } - break; - default: - return (-1); } - return (sizeof (int)); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } +/* + * This routine retrieves the current status of socket options. 
+ * It returns the size of the option retrieved, or -1. + */ int udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { - udp_t *udp; - int err; - - udp = Q_TO_UDP(q); + conn_t *connp = Q_TO_CONN(q); + int err; - rw_enter(&udp->udp_rwlock, RW_READER); - err = udp_opt_get(Q_TO_CONN(q), level, name, ptr); - rw_exit(&udp->udp_rwlock); + err = udp_opt_get(connp, level, name, ptr); return (err); } /* * This routine sets socket options. */ -/* ARGSUSED */ -static int -udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, - uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, - void *thisdg_attrs, boolean_t checkonly) +int +udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, + uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) { - udpattrs_t *attrs = thisdg_attrs; - int *i1 = (int *)invalp; - boolean_t onoff = (*i1 == 0) ? 0 : 1; - udp_t *udp = connp->conn_udp; + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; - int error; - uint_t newlen; - size_t sth_wroff; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; - ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); /* - * For fixed length options, no sanity check - * of passed in length is done. It is assumed *_optcom_req() - * routines do the right thing. + * First do UDP specific sanity checks and handle UDP specific + * options. Note that some IPPROTO_UDP options are handled + * by conn_opt_set. */ switch (level) { case SOL_SOCKET: switch (name) { - case SO_REUSEADDR: - if (!checkonly) { - udp->udp_reuseaddr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_DEBUG: - if (!checkonly) - udp->udp_debug = onoff; - break; - /* - * The following three items are available here, - * but are only meaningful to IP. 
- */ - case SO_DONTROUTE: - if (!checkonly) { - udp->udp_dontroute = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_USELOOPBACK: - if (!checkonly) { - udp->udp_useloopback = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_BROADCAST: - if (!checkonly) { - udp->udp_broadcast = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_SNDBUF: if (*i1 > us->us_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - udp->udp_xmit_hiwat = *i1; - connp->conn_wq->q_hiwat = *i1; - } break; case SO_RCVBUF: if (*i1 > us->us_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - int size; - - udp->udp_rcv_disply_hiwat = *i1; - size = udp_set_rcv_hiwat(udp, *i1); - rw_exit(&udp->udp_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - size); - rw_enter(&udp->udp_rwlock, RW_WRITER); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - udp->udp_dgram_errind = onoff; - break; - case SO_RECVUCRED: - if (!checkonly) - udp->udp_recvucred = onoff; - break; - case SO_ALLZONES: - /* - * "soft" error (negative) - * option not handled at this level - * Do not modify *outlenp. - */ - return (-EINVAL); - case SO_TIMESTAMP: - if (!checkonly) - udp->udp_timestamp = onoff; - break; - case SO_ANON_MLP: - case SO_MAC_EXEMPT: - case SO_MAC_IMPLICIT: - PASS_OPT_TO_IP(connp); break; + case SCM_UCRED: { struct ucred_s *ucr; - cred_t *cr, *newcr; + cred_t *newcr; ts_label_t *tsl; /* @@ -2250,20 +1777,18 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, */ if (connp->conn_mlp_type == mlptSingle) break; + ucr = (struct ucred_s *)invalp; if (inlen != ucredsize || ucr->uc_labeloff < sizeof (*ucr) || ucr->uc_labeloff + sizeof (bslabel_t) > inlen) return (EINVAL); if (!checkonly) { - mblk_t *mb; - pid_t cpid; - - if (attrs == NULL || - (mb = attrs->udpattr_mb) == NULL) - return (EINVAL); - if ((cr = msg_getcred(mb, &cpid)) == NULL) - cr = udp->udp_connp->conn_cred; + /* + * Set ixa_tsl to the new label. 
+ * We assume that crgetzoneid doesn't change + * as part of the SCM_UCRED. + */ ASSERT(cr != NULL); if ((tsl = crgetlabel(cr)) == NULL) return (EINVAL); @@ -2271,778 +1796,75 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, tsl->tsl_doi, KM_NOSLEEP); if (newcr == NULL) return (ENOSR); - mblk_setcred(mb, newcr, cpid); - attrs->udpattr_credset = B_TRUE; - crfree(newcr); - } - break; - } - case SO_EXCLBIND: - if (!checkonly) - udp->udp_exclbind = onoff; - break; - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. - */ - return (0); - default: - *outlenp = 0; - return (EINVAL); - } - break; - case IPPROTO_IP: - if (udp->udp_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - /* Save options for use by IP. */ - newlen = inlen + udp->udp_label_len; - if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) { - *outlenp = 0; - return (EINVAL); - } - if (checkonly) - break; - - /* - * Update the stored options taking into account - * any CIPSO option which we should not overwrite. - */ - if (!tsol_option_set(&udp->udp_ip_snd_options, - &udp->udp_ip_snd_options_len, - udp->udp_label_len, invalp, inlen)) { - *outlenp = 0; - return (ENOMEM); - } - - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; - rw_exit(&udp->udp_rwlock); - (void) proto_set_tx_wroff(connp->conn_rq, connp, - sth_wroff); - rw_enter(&udp->udp_rwlock, RW_WRITER); - break; - - case IP_TTL: - if (!checkonly) { - udp->udp_ttl = (uchar_t)*i1; - } - break; - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - udp->udp_type_of_service = (uchar_t)*i1; - } - break; - case IP_MULTICAST_IF: { - /* - * TODO should check OPTMGMT reply and undo this if - * there is an error. 
- */ - struct in_addr *inap = (struct in_addr *)invalp; - if (!checkonly) { - udp->udp_multicast_if_addr = - inap->s_addr; - PASS_OPT_TO_IP(connp); - } - break; - } - case IP_MULTICAST_TTL: - if (!checkonly) - udp->udp_multicast_ttl = *invalp; - break; - case IP_MULTICAST_LOOP: - if (!checkonly) { - connp->conn_multicast_loop = *invalp; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVOPTS: - if (!checkonly) - udp->udp_recvopts = onoff; - break; - case IP_RECVDSTADDR: - if (!checkonly) - udp->udp_recvdstaddr = onoff; - break; - case IP_RECVIF: - if (!checkonly) { - udp->udp_recvif = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVSLLA: - if (!checkonly) { - udp->udp_recvslla = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVTTL: - if (!checkonly) - udp->udp_recvttl = onoff; - break; - case IP_PKTINFO: { - /* - * This also handles IP_RECVPKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have same value. - * Differentiation is based on the size of the - * argument passed in. - */ - struct in_pktinfo *pktinfop; - ip4_pkt_t *attr_pktinfop; - - if (checkonly) - break; - - if (inlen == sizeof (int)) { - /* - * This is IP_RECVPKTINFO option. - * Keep a local copy of whether this option is - * set or not and pass it down to IP for - * processing. - */ - - udp->udp_ip_recvpktinfo = onoff; - return (-EINVAL); - } - - if (attrs == NULL || - (attr_pktinfop = attrs->udpattr_ipp4) == NULL) { + ASSERT(newcr->cr_label != NULL); /* - * sticky option or no buffer to return - * the results. + * Move the hold on the cr_label to ixa_tsl by + * setting cr_label to NULL. Then release newcr. 
*/ - return (EINVAL); - } - - if (inlen != sizeof (struct in_pktinfo)) - return (EINVAL); - - pktinfop = (struct in_pktinfo *)invalp; - - /* - * At least one of the values should be specified - */ - if (pktinfop->ipi_ifindex == 0 && - pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) { - return (EINVAL); - } - - attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr; - attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex; - - break; - } - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case IP_SEC_OPT: - case IP_NEXTHOP: - case IP_DHCPINIT_IF: - /* - * "soft" error (negative) - * option not handled at this level - * Do not modify *outlenp. - */ - return (-EINVAL); - case IP_BOUND_IF: - if (!checkonly) { - udp->udp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_UNSPEC_SRC: - if (!checkonly) { - udp->udp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BROADCAST_TTL: - if (!checkonly) - connp->conn_broadcast_ttl = *invalp; - break; - default: - *outlenp = 0; - return (EINVAL); - } - break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - boolean_t sticky; - - if (udp->udp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - /* - * Deal with both sticky options and ancillary data - */ - sticky = B_FALSE; - if (attrs == NULL || (ipp = attrs->udpattr_ipp6) == - NULL) { - /* sticky options, or none */ - ipp = &udp->udp_sticky_ipp; - sticky = B_TRUE; - } - - switch (name) { - case IPV6_MULTICAST_IF: - if (!checkonly) { - udp->udp_multicast_if_index = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if 
(!checkonly) { - if (*i1 == -1) { - udp->udp_ttl = ipp->ipp_unicast_hops = - us->us_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = udp->udp_ttl; - } else { - udp->udp_ttl = ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - *outlenp = 0; - return (error); - } - } - break; - case IPV6_MULTICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - udp->udp_multicast_ttl = - ipp->ipp_multicast_hops = - IP_DEFAULT_MULTICAST_TTL; - ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = udp->udp_multicast_ttl; - } else { - udp->udp_multicast_ttl = - ipp->ipp_multicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_MULTICAST_HOPS; - } - } - break; - case IPV6_MULTICAST_LOOP: - if (*i1 != 0 && *i1 != 1) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - connp->conn_multicast_loop = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case IPV6_BOUND_IF: - if (!checkonly) { - udp->udp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNSPEC_SRC: - if (!checkonly) { - udp->udp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - udp->udp_ip_recvpktinfo = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVTCLASS: - if (!checkonly) { - udp->udp_ipv6_recvtclass = onoff; - 
PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - udp->udp_ipv6_recvpathmtu = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - udp->udp_ipv6_recvhoplimit = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - udp->udp_ipv6_recvhopopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - udp->udp_ipv6_recvdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) - udp->udp_old_ipv6_recvdstopts = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - udp->udp_ipv6_recvrthdrdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - udp->udp_ipv6_recvrthdr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set sticky options or ancillary data. - * If sticky options, (re)build any extension headers - * that might be needed as a result. - */ - case IPV6_PKTINFO: - /* - * The source address and ifindex are verified - * in ip_opt_set(). For ancillary data the - * source address is checked in ip_wput_v6. 
- */ - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - ipp->ipp_sticky_ignored |= - (IPPF_IFINDEX|IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPLIMIT: - if (sticky) - return (EINVAL); - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_HOPLIMIT; - ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_hoplimit = - us->us_ipv6_hoplimit; - else - ipp->ipp_hoplimit = *i1; - ipp->ipp_fields |= IPPF_HOPLIMIT; - } - break; - case IPV6_TCLASS: - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - ipp->ipp_sticky_ignored |= IPPF_TCLASS; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_tclass = 0; - else - ipp->ipp_tclass = *i1; - ipp->ipp_fields |= IPPF_TCLASS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. 
- */ - if (inlen != 0 && inlen != sizeof (sin6_t)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - ipp->ipp_sticky_ignored |= IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) { - return (EAFNOSUPPORT); - } - if (IN6_IS_ADDR_V4MAPPED( - &sin6->sin6_addr)) - return (EADDRNOTAVAIL); - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, - sticky ? udp->udp_label_len_v6 : 0); - if (error != 0) - return (error); - if (ipp->ipp_hopoptslen == 0) { - ipp->ipp_fields &= ~IPPF_HOPOPTS; - ipp->ipp_sticky_ignored |= IPPF_HOPOPTS; - } else { - ipp->ipp_fields |= IPPF_HOPOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) { - kmem_free(ipp->ipp_rtdstopts, - ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; - } - - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTDSTOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_DSTOPTS) != 0) { - kmem_free(ipp->ipp_dstopts, - ipp->ipp_dstoptslen); - ipp->ipp_dstopts = NULL; - ipp->ipp_dstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_DSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_DSTOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTHDR) != 0) { - kmem_free(ipp->ipp_rthdr, - ipp->ipp_rthdrlen); - ipp->ipp_rthdr = NULL; - ipp->ipp_rthdrlen = 0; - } - ipp->ipp_fields &= ~IPPF_RTHDR; - ipp->ipp_sticky_ignored |= IPPF_RTHDR; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTHDR; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); + ip_xmit_attr_replace_tsl(ixa, newcr->cr_label); + ixa->ixa_flags |= IXAF_UCRED_TSL; + newcr->cr_label = NULL; + crfree(newcr); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; } - break; + /* Fully handled this option. */ + return (0); } - - case IPV6_DONTFRAG: - if (checkonly) - break; - - if (onoff) { - ipp->ipp_fields |= IPPF_DONTFRAG; - } else { - ipp->ipp_fields &= ~IPPF_DONTFRAG; - } - break; - - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; - - case IPV6_SEC_OPT: - case IPV6_SRC_PREFERENCES: - case IPV6_V6ONLY: - /* Handled at the IP level */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); } break; - } /* end IPPROTO_IPV6 */ case IPPROTO_UDP: switch (name) { - case UDP_ANONPRIVBIND: - if ((error = secpolicy_net_privaddr(cr, 0, - IPPROTO_UDP)) != 0) { - *outlenp = 0; - return (error); - } - if (!checkonly) { - udp->udp_anon_priv_bind = onoff; - } - break; - case UDP_EXCLBIND: - if (!checkonly) - udp->udp_exclbind = onoff; - break; - case UDP_RCVHDR: - if (!checkonly) - udp->udp_rcvhdr = onoff; - break; case UDP_NAT_T_ENDPOINT: if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { - *outlenp = 0; 
return (error); } /* - * Use udp_family instead so we can avoid ambiguitites + * Use conn_family instead so we can avoid ambiguitites * with AF_INET6 sockets that may switch from IPv4 * to IPv6. */ - if (udp->udp_family != AF_INET) { - *outlenp = 0; + if (connp->conn_family != AF_INET) { return (EAFNOSUPPORT); } if (!checkonly) { - int size; - + mutex_enter(&connp->conn_lock); udp->udp_nat_t_endpoint = onoff; - - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; - - /* Also, adjust wroff */ - if (onoff) { - udp->udp_max_hdr_len += - sizeof (uint32_t); - } - size = udp->udp_max_hdr_len + - us->us_wroff_extra; - (void) proto_set_tx_wroff(connp->conn_rq, connp, - size); + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; } - break; - default: - *outlenp = 0; - return (EINVAL); + /* Fully handled this option. */ + return (0); + case UDP_RCVHDR: + mutex_enter(&connp->conn_lock); + udp->udp_rcvhdr = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; - default: - *outlenp = 0; - return (EINVAL); - } - /* - * Common case of OK return with outval same as inval. - */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - (void) bcopy(invalp, outvalp, inlen); } - *outlenp = inlen; - return (0); + error = conn_opt_set(coa, level, name, inlen, invalp, + checkonly, cr); + return (error); } +/* + * This routine sets socket options. 
+ */ int -udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr) +udp_opt_set(conn_t *connp, uint_t optset_context, int level, + int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { - int error; + udp_t *udp = connp->conn_udp; + int err; + conn_opt_arg_t coas, *coa; boolean_t checkonly; + udp_stack_t *us = udp->udp_us; - error = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: checkonly = B_TRUE; @@ -3056,7 +1878,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (inlen == 0) { *outlenp = 0; - goto done; + return (0); } break; case SETFN_OPTCOM_NEGOTIATE: @@ -3074,8 +1896,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!udp_opt_allow_udr_set(level, name)) { *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } break; default: @@ -3083,99 +1904,326 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * We should never get here */ *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - error = udp_do_opt_set(connp, level, name, inlen, invalp, outlenp, - outvalp, cr, thisdg_attrs, checkonly); -done: - return (error); + if (thisdg_attrs != NULL) { + /* Options from T_UNITDATA_REQ */ + coa = (conn_opt_arg_t *)thisdg_attrs; + ASSERT(coa->coa_connp == connp); + ASSERT(coa->coa_ixa != NULL); + ASSERT(coa->coa_ipp != NULL); + ASSERT(coa->coa_ancillary); + } else { + coa = &coas; + coas.coa_connp = connp; + /* Get a reference on conn_ixa to prevent concurrent mods */ + coas.coa_ixa = conn_get_ixa(connp, B_TRUE); + if (coas.coa_ixa == NULL) { + *outlenp = 0; + return (ENOMEM); + } + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; 
+ } + + err = udp_do_opt_set(coa, level, name, inlen, invalp, + cr, checkonly); + if (err != 0) { +errout: + if (!coa->coa_ancillary) + ixa_refrele(coa->coa_ixa); + *outlenp = 0; + return (err); + } + /* Handle DHCPINIT here outside of lock */ + if (level == IPPROTO_IP && name == IP_DHCPINIT_IF) { + uint_t ifindex; + ill_t *ill; + + ifindex = *(uint_t *)invalp; + if (ifindex == 0) { + ill = NULL; + } else { + ill = ill_lookup_on_ifindex(ifindex, B_FALSE, + coa->coa_ixa->ixa_ipst); + if (ill == NULL) { + err = ENXIO; + goto errout; + } + + mutex_enter(&ill->ill_lock); + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + err = ENXIO; + goto errout; + } + if (IS_VNI(ill)) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + err = EINVAL; + goto errout; + } + } + mutex_enter(&connp->conn_lock); + + if (connp->conn_dhcpinit_ill != NULL) { + /* + * We've locked the conn so conn_cleanup_ill() + * cannot clear conn_dhcpinit_ill -- so it's + * safe to access the ill. + */ + ill_t *oill = connp->conn_dhcpinit_ill; + + ASSERT(oill->ill_dhcpinit != 0); + atomic_dec_32(&oill->ill_dhcpinit); + ill_set_inputfn(connp->conn_dhcpinit_ill); + connp->conn_dhcpinit_ill = NULL; + } + + if (ill != NULL) { + connp->conn_dhcpinit_ill = ill; + atomic_inc_32(&ill->ill_dhcpinit); + ill_set_inputfn(ill); + mutex_exit(&connp->conn_lock); + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + } else { + mutex_exit(&connp->conn_lock); + } + } + + /* + * Common case of OK return with outval same as inval. + */ + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } + *outlenp = inlen; + + /* + * If this was not ancillary data, then we rebuild the headers, + * update the IRE/NCE, and IPsec as needed. + * Since the label depends on the destination we go through + * ip_set_destination first. 
+ */ + if (coa->coa_ancillary) { + return (0); + } + + if (coa->coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t saddr, faddr, nexthop; + in_port_t fport; + + /* + * We clear lastdst to make sure we pick up the change + * next time sending. + * If we are connected we re-cache the information. + * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + mutex_enter(&connp->conn_lock); + connp->conn_v6lastdst = ipv6_all_zeros; + + ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + mutex_exit(&connp->conn_lock); + + if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && + !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { + (void) ip_attr_connect(connp, coa->coa_ixa, + &saddr, &faddr, &nexthop, fport, NULL, NULL, + IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); + } + } + + ixa_refrele(coa->coa_ixa); + + if (coa->coa_changed & COA_HEADER_CHANGED) { + /* + * Rebuild the header template if we are connected. + * Otherwise clear conn_v6lastdst so we rebuild the header + * in the data path. 
+ */ + mutex_enter(&connp->conn_lock); + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + err = udp_build_hdr_template(connp, + &connp->conn_saddr_v6, &connp->conn_faddr_v6, + connp->conn_fport, connp->conn_flowinfo); + if (err != 0) { + mutex_exit(&connp->conn_lock); + return (err); + } + } else { + connp->conn_v6lastdst = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + } + if (coa->coa_changed & COA_RCVBUF_CHANGED) { + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + connp->conn_rcvbuf); + } + if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coa->coa_changed & COA_WROFF_CHANGED) { + /* Increase wroff if needed */ + uint_t wroff; + + mutex_enter(&connp->conn_lock); + wroff = connp->conn_ht_iphc_allocated + us->us_wroff_extra; + if (udp->udp_nat_t_endpoint) + wroff += sizeof (uint32_t); + if (wroff > connp->conn_wroff) { + connp->conn_wroff = wroff; + mutex_exit(&connp->conn_lock); + (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); + } else { + mutex_exit(&connp->conn_lock); + } + } + return (err); } -/* ARGSUSED */ +/* This routine sets socket options. */ int udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { - conn_t *connp = Q_TO_CONN(q); + conn_t *connp = Q_TO_CONN(q); int error; - udp_t *udp = connp->conn_udp; - rw_enter(&udp->udp_rwlock, RW_WRITER); error = udp_opt_set(connp, optset_context, level, name, inlen, invalp, outlenp, outvalp, thisdg_attrs, cr); - rw_exit(&udp->udp_rwlock); return (error); } /* - * Update udp_sticky_hdrs based on udp_sticky_ipp, udp_v6src, and udp_ttl. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension - * headers, and the udp header. - * Returns failure if can't allocate memory. 
+ * Setup IP and UDP headers. + * Returns NULL on allocation failure, in which case data_mp is freed. */ -static int -udp_build_hdrs(udp_t *udp) +mblk_t * +udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport, + uint32_t flowinfo, mblk_t *data_mp, int *errorp) { - udp_stack_t *us = udp->udp_us; - uchar_t *hdrs; - uint_t hdrs_len; - ip6_t *ip6h; - ip6i_t *ip6i; - udpha_t *udpha; - ip6_pkt_t *ipp = &udp->udp_sticky_ipp; - size_t sth_wroff; - conn_t *connp = udp->udp_connp; - - ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); - ASSERT(connp != NULL); + mblk_t *mp; + udpha_t *udpha; + udp_stack_t *us = connp->conn_netstack->netstack_udp; + uint_t data_len; + uint32_t cksum; + udp_t *udp = connp->conn_udp; + boolean_t insert_spi = udp->udp_nat_t_endpoint; + uint_t ulp_hdr_len; - hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE; - ASSERT(hdrs_len != 0); - if (hdrs_len != udp->udp_sticky_hdrs_len) { - /* Need to reallocate */ - hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); + data_len = msgdsize(data_mp); + ulp_hdr_len = UDPH_SIZE; + if (insert_spi) + ulp_hdr_len += sizeof (uint32_t); - if (udp->udp_sticky_hdrs_len != 0) { - kmem_free(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len); - } - udp->udp_sticky_hdrs = hdrs; - udp->udp_sticky_hdrs_len = hdrs_len; + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, + ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); + if (mp == NULL) { + ASSERT(*errorp != 0); + return (NULL); } - ip_build_hdrs_v6(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len - UDPH_SIZE, ipp, IPPROTO_UDP); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)udp->udp_sticky_hdrs; - ip6h = (ip6_t *)&ip6i[1]; + data_len += ulp_hdr_len; + ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; + + udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); + 
udpha->uha_src_port = connp->conn_lport; + udpha->uha_dst_port = dstport; + udpha->uha_checksum = 0; + udpha->uha_length = htons(data_len); + + /* + * If there was a routing option/header then conn_prepend_hdr + * has massaged it and placed the pseudo-header checksum difference + * in the cksum argument. + * + * Setup header length and prepare for ULP checksum done in IP. + * + * We make it easy for IP to include our pseudo header + * by putting our length in uha_checksum. + * The IP source, destination, and length have already been set by + * conn_prepend_hdr. + */ + cksum += data_len; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); + + /* IP does the checksum if uha_checksum is non-zero */ + if (us->us_do_checksum) { + if (cksum == 0) + udpha->uha_checksum = 0xffff; + else + udpha->uha_checksum = htons(cksum); + } else { + udpha->uha_checksum = 0; + } } else { - ip6h = (ip6_t *)udp->udp_sticky_hdrs; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); + if (cksum == 0) + udpha->uha_checksum = 0xffff; + else + udpha->uha_checksum = htons(cksum); } - if (!(ipp->ipp_fields & IPPF_ADDR)) - ip6h->ip6_src = udp->udp_v6src; + /* Insert all-0s SPI now. 
*/ + if (insert_spi) + *((uint32_t *)(udpha + 1)) = 0; - udpha = (udpha_t *)(udp->udp_sticky_hdrs + hdrs_len - UDPH_SIZE); - udpha->uha_src_port = udp->udp_port; + return (mp); +} - /* Try to get everything in a single mblk */ - if (hdrs_len > udp->udp_max_hdr_len) { - udp->udp_max_hdr_len = hdrs_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; - rw_exit(&udp->udp_rwlock); - (void) proto_set_tx_wroff(udp->udp_connp->conn_rq, - udp->udp_connp, sth_wroff); - rw_enter(&udp->udp_rwlock, RW_WRITER); - } +static int +udp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, in_port_t dstport, uint32_t flowinfo) +{ + udpha_t *udpha; + int error; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + /* + * We clear lastdst to make sure we don't use the lastdst path + * next time sending since we might not have set v6dst yet. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + + error = conn_build_hdr_template(connp, UDPH_SIZE, 0, v6src, v6dst, + flowinfo); + if (error != 0) + return (error); + + /* + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum. + */ + udpha = (udpha_t *)connp->conn_ht_ulp; + udpha->uha_src_port = connp->conn_lport; + udpha->uha_dst_port = dstport; + udpha->uha_checksum = 0; + udpha->uha_length = htons(UDPH_SIZE); /* Filled in later */ return (0); } @@ -3252,189 +2300,6 @@ udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) return (0); } -/* - * Copy hop-by-hop option from ipp->ipp_hopopts to the buffer provided (with - * T_opthdr) and return the number of bytes copied. 'dbuf' may be NULL to - * just count the length needed for allocation. If 'dbuf' is non-NULL, - * then it's assumed to be allocated to be large enough. - * - * Returns zero if trimming of the security option causes all options to go - * away. 
- */ -static size_t -copy_hop_opts(const ip6_pkt_t *ipp, uchar_t *dbuf) -{ - struct T_opthdr *toh; - size_t hol = ipp->ipp_hopoptslen; - ip6_hbh_t *dstopt = NULL; - const ip6_hbh_t *srcopt = ipp->ipp_hopopts; - size_t tlen, olen, plen; - boolean_t deleting; - const struct ip6_opt *sopt, *lastpad; - struct ip6_opt *dopt; - - if ((toh = (struct T_opthdr *)dbuf) != NULL) { - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPOPTS; - toh->status = 0; - dstopt = (ip6_hbh_t *)(toh + 1); - } - - /* - * If labeling is enabled, then skip the label option - * but get other options if there are any. - */ - if (is_system_labeled()) { - dopt = NULL; - if (dstopt != NULL) { - /* will fill in ip6h_len later */ - dstopt->ip6h_nxt = srcopt->ip6h_nxt; - dopt = (struct ip6_opt *)(dstopt + 1); - } - sopt = (const struct ip6_opt *)(srcopt + 1); - hol -= sizeof (*srcopt); - tlen = sizeof (*dstopt); - lastpad = NULL; - deleting = B_FALSE; - /* - * This loop finds the first (lastpad pointer) of any number of - * pads that preceeds the security option, then treats the - * security option as though it were a pad, and then finds the - * next non-pad option (or end of list). - * - * It then treats the entire block as one big pad. To preserve - * alignment of any options that follow, or just the end of the - * list, it computes a minimal new padding size that keeps the - * same alignment for the next option. - * - * If it encounters just a sequence of pads with no security - * option, those are copied as-is rather than collapsed. - * - * Note that to handle the end of list case, the code makes one - * loop with 'hol' set to zero. 
- */ - for (;;) { - if (hol > 0) { - if (sopt->ip6o_type == IP6OPT_PAD1) { - if (lastpad == NULL) - lastpad = sopt; - sopt = (const struct ip6_opt *) - &sopt->ip6o_len; - hol--; - continue; - } - olen = sopt->ip6o_len + sizeof (*sopt); - if (olen > hol) - olen = hol; - if (sopt->ip6o_type == IP6OPT_PADN || - sopt->ip6o_type == ip6opt_ls) { - if (sopt->ip6o_type == ip6opt_ls) - deleting = B_TRUE; - if (lastpad == NULL) - lastpad = sopt; - sopt = (const struct ip6_opt *) - ((const char *)sopt + olen); - hol -= olen; - continue; - } - } else { - /* if nothing was copied at all, then delete */ - if (tlen == sizeof (*dstopt)) - return (0); - /* last pass; pick up any trailing padding */ - olen = 0; - } - if (deleting) { - /* - * compute aligning effect of deleted material - * to reproduce with pad. - */ - plen = ((const char *)sopt - - (const char *)lastpad) & 7; - tlen += plen; - if (dopt != NULL) { - if (plen == 1) { - dopt->ip6o_type = IP6OPT_PAD1; - } else if (plen > 1) { - plen -= sizeof (*dopt); - dopt->ip6o_type = IP6OPT_PADN; - dopt->ip6o_len = plen; - if (plen > 0) - bzero(dopt + 1, plen); - } - dopt = (struct ip6_opt *) - ((char *)dopt + plen); - } - deleting = B_FALSE; - lastpad = NULL; - } - /* if there's uncopied padding, then copy that now */ - if (lastpad != NULL) { - olen += (const char *)sopt - - (const char *)lastpad; - sopt = lastpad; - lastpad = NULL; - } - if (dopt != NULL && olen > 0) { - bcopy(sopt, dopt, olen); - dopt = (struct ip6_opt *)((char *)dopt + olen); - } - if (hol == 0) - break; - tlen += olen; - sopt = (const struct ip6_opt *) - ((const char *)sopt + olen); - hol -= olen; - } - /* go back and patch up the length value, rounded upward */ - if (dstopt != NULL) - dstopt->ip6h_len = (tlen - 1) >> 3; - } else { - tlen = hol; - if (dstopt != NULL) - bcopy(srcopt, dstopt, hol); - } - - tlen += sizeof (*toh); - if (toh != NULL) - toh->len = tlen; - - return (tlen); -} - -/* - * Update udp_rcv_opt_len from the packet. 
- * Called when options received, and when no options received but - * udp_ip_recv_opt_len has previously recorded options. - */ -static void -udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len) -{ - /* Save the options if any */ - if (opt_len > 0) { - if (opt_len > udp->udp_ip_rcv_options_len) { - /* Need to allocate larger buffer */ - if (udp->udp_ip_rcv_options_len != 0) - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options_len = 0; - udp->udp_ip_rcv_options = - (uchar_t *)mi_alloc(opt_len, BPRI_HI); - if (udp->udp_ip_rcv_options != NULL) - udp->udp_ip_rcv_options_len = opt_len; - } - if (udp->udp_ip_rcv_options_len != 0) { - bcopy(opt, udp->udp_ip_rcv_options, opt_len); - /* Adjust length if we are resusing the space */ - udp->udp_ip_rcv_options_len = opt_len; - } - } else if (udp->udp_ip_rcv_options_len != 0) { - /* Clear out previously recorded options */ - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options = NULL; - udp->udp_ip_rcv_options_len = 0; - } -} - static mblk_t * udp_queue_fallback(udp_t *udp, mblk_t *mp) { @@ -3466,15 +2331,15 @@ udp_queue_fallback(udp_t *udp, mblk_t *mp) * TPI, then we'll queue the mp for later processing. */ static void -udp_ulp_recv(conn_t *connp, mblk_t *mp) +udp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len, ip_recv_attr_t *ira) { if (IPCL_IS_NONSTR(connp)) { udp_t *udp = connp->conn_udp; int error; + ASSERT(len == msgdsize(mp)); if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { + (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { mutex_enter(&udp->udp_recv_lock); if (error == ENOSPC) { /* @@ -3500,282 +2365,170 @@ udp_ulp_recv(conn_t *connp, mblk_t *mp) } ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock)); } else { + if (is_system_labeled()) { + ASSERT(ira->ira_cred != NULL); + /* + * Provide for protocols above UDP such as RPC + * NOPID leaves db_cpid unchanged. 
+ */ + mblk_setcred(mp, ira->ira_cred, NOPID); + } + putnext(connp->conn_rq, mp); } } +/* + * This is the inbound data path. + * IP has already pulled up the IP plus UDP headers and verified alignment + * etc. + */ /* ARGSUSED2 */ static void -udp_input(void *arg1, mblk_t *mp, void *arg2) +udp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - conn_t *connp = (conn_t *)arg1; + conn_t *connp = (conn_t *)arg1; struct T_unitdata_ind *tudi; uchar_t *rptr; /* Pointer to IP header */ int hdr_length; /* Length of IP+UDP headers */ - int opt_len; int udi_size; /* Size of T_unitdata_ind */ - int mp_len; + int pkt_len; udp_t *udp; udpha_t *udpha; - int ipversion; - ip6_pkt_t ipp; + ip_pkt_t ipps; ip6_t *ip6h; - ip6i_t *ip6i; mblk_t *mp1; - mblk_t *options_mp = NULL; - ip_pktinfo_t *pinfo = NULL; - cred_t *cr = NULL; - pid_t cpid; - uint32_t udp_ip_rcv_options_len; - udp_bits_t udp_bits; - cred_t *rcr = connp->conn_cred; - udp_stack_t *us; + uint32_t udp_ipv4_options_len; + crb_t recv_ancillary; + udp_stack_t *us; ASSERT(connp->conn_flags & IPCL_UDPCONN); udp = connp->conn_udp; us = udp->udp_us; rptr = mp->b_rptr; - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); + + ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(OK_32PTR(rptr)); + ASSERT(ira->ira_pktlen == msgdsize(mp)); + pkt_len = ira->ira_pktlen; /* - * IP should have prepended the options data in an M_CTL - * Check M_CTL "type" to make sure are not here bcos of - * a valid ICMP message + * Get a snapshot of these and allow other threads to change + * them after that. We need the same recv_ancillary when determining + * the size as when adding the ancillary data items. */ - if (DB_TYPE(mp) == M_CTL) { - if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - /* - * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information - * has been prepended to the packet by IP. 
We need to - * extract the mblk and adjust the rptr - */ - pinfo = (ip_pktinfo_t *)mp->b_rptr; - options_mp = mp; - mp = mp->b_cont; - rptr = mp->b_rptr; - UDP_STAT(us, udp_in_pktinfo); - } else { - /* - * ICMP messages. - */ - udp_icmp_error(connp, mp); - return; - } - } + mutex_enter(&connp->conn_lock); + udp_ipv4_options_len = udp->udp_recv_ipp.ipp_ipv4_options_len; + recv_ancillary = connp->conn_recv_ancillary; + mutex_exit(&connp->conn_lock); + + hdr_length = ira->ira_ip_hdr_length; - mp_len = msgdsize(mp); /* - * This is the inbound data path. - * First, we check to make sure the IP version number is correct, - * and then pull the IP and UDP headers into the first mblk. + * IP inspected the UDP header thus all of it must be in the mblk. + * UDP length check is performed for IPv6 packets and IPv4 packets + * to check if the size of the packet as specified + * by the UDP header is the same as the length derived from the IP + * header. */ + udpha = (udpha_t *)(rptr + hdr_length); + if (pkt_len != ntohs(udpha->uha_length) + hdr_length) + goto tossit; - /* Initialize regardless if ipversion is IPv4 or IPv6 */ - ipp.ipp_fields = 0; + hdr_length += UDPH_SIZE; + ASSERT(MBLKL(mp) >= hdr_length); /* IP did a pullup */ - ipversion = IPH_HDR_VERSION(rptr); + /* Initialize regardless of IP version */ + ipps.ipp_fields = 0; - rw_enter(&udp->udp_rwlock, RW_READER); - udp_ip_rcv_options_len = udp->udp_ip_rcv_options_len; - udp_bits = udp->udp_bits; - rw_exit(&udp->udp_rwlock); + if (((ira->ira_flags & IRAF_IPV4_OPTIONS) || + udp_ipv4_options_len > 0) && + connp->conn_family == AF_INET) { + int err; - switch (ipversion) { - case IPV4_VERSION: - ASSERT(MBLKL(mp) >= sizeof (ipha_t)); - ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); - hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE; - opt_len = hdr_length - (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE); - if ((opt_len > 0 || udp_ip_rcv_options_len > 0) && - udp->udp_family == AF_INET) { - /* - * Record/update udp_ip_rcv_options with 
the lock - * held. Not needed for AF_INET6 sockets - * since they don't support a getsockopt of IP_OPTIONS. - */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp_save_ip_rcv_opt(udp, rptr + IP_SIMPLE_HDR_LENGTH, - opt_len); - rw_exit(&udp->udp_rwlock); - } - /* Handle IPV6_RECVPKTINFO even for IPv4 packet. */ - if ((udp->udp_family == AF_INET6) && (pinfo != NULL) && - udp->udp_ip_recvpktinfo) { - if (pinfo->ip_pkt_flags & IPF_RECVIF) { - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; - } - } - break; - case IPV6_VERSION: /* - * IPv6 packets can only be received by applications - * that are prepared to receive IPv6 addresses. - * The IP fanout must ensure this. + * Record/update udp_recv_ipp with the lock + * held. Not needed for AF_INET6 sockets + * since they don't support a getsockopt of IP_OPTIONS. */ - ASSERT(udp->udp_family == AF_INET6); + mutex_enter(&connp->conn_lock); + err = ip_find_hdr_v4((ipha_t *)rptr, &udp->udp_recv_ipp, + B_TRUE); + if (err != 0) { + /* Allocation failed. 
Drop packet */ + mutex_exit(&connp->conn_lock); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpInErrors); + return; + } + mutex_exit(&connp->conn_lock); + } - ip6h = (ip6_t *)rptr; - ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); + if (recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); + ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); + ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); - if (ip6h->ip6_nxt != IPPROTO_UDP) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipps, B_FALSE); + } else { uint8_t nexthdrp; - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i = (ip6i_t *)ip6h; - if ((uchar_t *)&ip6i[1] > mp->b_wptr) - goto tossit; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE)) - goto tossit; - ip6h = (ip6_t *)rptr; - mp_len = msgdsize(mp); - } + + ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. */ - hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) + - UDPH_SIZE; - ASSERT(nexthdrp == IPPROTO_UDP); - } else { - hdr_length = IPV6_HDR_LEN + UDPH_SIZE; - ip6i = NULL; - } - break; - default: - ASSERT(0); - } + ASSERT(connp->conn_family == AF_INET6); - /* - * IP inspected the UDP header thus all of it must be in the mblk. 
- * UDP length check is performed for IPv6 packets and IPv4 packets - * to check if the size of the packet as specified - * by the header is the same as the physical size of the packet. - * FIXME? Didn't IP already check this? - */ - udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); - if ((MBLKL(mp) < hdr_length) || - (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) { - goto tossit; - } + ip6h = (ip6_t *)rptr; - - /* Walk past the headers unless UDP_RCVHDR was set. */ - if (!udp_bits.udpb_rcvhdr) { - mp->b_rptr = rptr + hdr_length; - mp_len -= hdr_length; + /* We don't care about the length, but need the ipp */ + hdr_length = ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, + &nexthdrp); + ASSERT(hdr_length == ira->ira_ip_hdr_length); + /* Restore */ + hdr_length = ira->ira_ip_hdr_length + UDPH_SIZE; + ASSERT(nexthdrp == IPPROTO_UDP); + } } /* * This is the inbound data path. Packets are passed upstream as - * T_UNITDATA_IND messages with full IP headers still attached. + * T_UNITDATA_IND messages. */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin_t *sin; ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION); /* * Normally only send up the source address. - * If IP_RECVDSTADDR is set we include the destination IP - * address as an option. With IP_RECVOPTS we include all - * the IP options. + * If any ancillary data items are wanted we add those. 
*/ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (udp_bits.udpb_recvdstaddr) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_addr); - UDP_STAT(us, udp_in_recvdstaddr); - } - - if (udp_bits.udpb_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_pktinfo); - UDP_STAT(us, udp_ip_rcvpktinfo); - } - - if ((udp_bits.udpb_recvopts) && opt_len > 0) { - udi_size += sizeof (struct T_opthdr) + opt_len; - UDP_STAT(us, udp_in_recvopts); - } - - /* - * If the IP_RECVSLLA or the IP_RECVIF is set then allocate - * space accordingly - */ - if ((udp_bits.udpb_recvif) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); - UDP_STAT(us, udp_in_recvif); - } - - if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - UDP_STAT(us, udp_in_recvslla); - } - - if ((udp_bits.udpb_recvucred) && - (cr = msg_getcred(mp, &cpid)) != NULL) { - udi_size += sizeof (struct T_opthdr) + ucredsize; - UDP_STAT(us, udp_in_recvucred); - } - - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (udp_bits.udpb_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - UDP_STAT(us, udp_in_timestamp); - } - - /* - * If IP_RECVTTL is set allocate the appropriate sized buffer - */ - if (udp_bits.udpb_recvttl) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); - UDP_STAT(us, udp_in_recvttl); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3786,7 +2539,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) sin = (sin_t *)&tudi[1]; sin->sin_addr.s_addr = ((ipha_t *)rptr)->ipha_src; sin->sin_port = udpha->uha_src_port; - sin->sin_family = udp->udp_family; + sin->sin_family = connp->conn_family; *(uint32_t *)&sin->sin_zero[0] = 0; *(uint32_t *)&sin->sin_zero[4] = 0; @@ -3795,166 +2548,8 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) * IP_RECVTTL has been set. */ if (udi_size != 0) { - /* - * Copy in destination address before options to avoid - * any padding issues. 
- */ - char *dstopt; - - dstopt = (char *)&sin[1]; - if (udp_bits.udpb_recvdstaddr) { - struct T_opthdr *toh; - ipaddr_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVDSTADDR; - toh->len = sizeof (struct T_opthdr) + - sizeof (ipaddr_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (ipaddr_t *)dstopt; - *dstptr = ((ipha_t *)rptr)->ipha_dst; - dstopt += sizeof (ipaddr_t); - udi_size -= toh->len; - } - - if (udp_bits.udpb_recvopts && opt_len > 0) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVOPTS; - toh->len = sizeof (struct T_opthdr) + opt_len; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(rptr + IP_SIMPLE_HDR_LENGTH, dstopt, - opt_len); - dstopt += opt_len; - udi_size -= toh->len; - } - - if ((udp_bits.udpb_ip_recvpktinfo) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - struct T_opthdr *toh; - struct in_pktinfo *pktinfop; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pktinfop); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pktinfop = (struct in_pktinfo *)dstopt; - pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex; - pktinfop->ipi_spec_dst = - pinfo->ip_pkt_match_addr; - pktinfop->ipi_addr.s_addr = - ((ipha_t *)rptr)->ipha_dst; - - dstopt += sizeof (struct in_pktinfo); - udi_size -= toh->len; - } - - if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - - struct T_opthdr *toh; - struct sockaddr_dl *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVSLLA; - toh->len = sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (struct sockaddr_dl *)dstopt; - bcopy(&pinfo->ip_pkt_slla, dstptr, - sizeof (struct sockaddr_dl)); - dstopt += sizeof (struct 
sockaddr_dl); - udi_size -= toh->len; - } - - if ((udp_bits.udpb_recvif) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - - struct T_opthdr *toh; - uint_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVIF; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint_t *)dstopt; - *dstptr = pinfo->ip_pkt_ifindex; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - - if (cr != NULL) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_UCRED; - toh->len = sizeof (struct T_opthdr) + ucredsize; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - (void) cred2ucred(cr, cpid, dstopt, rcr); - dstopt += ucredsize; - udi_size -= toh->len; - } - - if (udp_bits.udpb_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (char *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (char *)toh + toh->len; - udi_size -= toh->len; - } - - /* - * CAUTION: - * Due to aligment issues - * Processing of IP_RECVTTL option - * should always be the last. Adding - * any option processing after this will - * cause alignment panic. 
- */ - if (udp_bits.udpb_recvttl) { - struct T_opthdr *toh; - uint8_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVTTL; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint8_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint8_t *)dstopt; - *dstptr = ((ipha_t *)rptr)->ipha_ttl; - dstopt += sizeof (uint8_t); - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin[1], udi_size); } } else { sin6_t *sin6; @@ -3968,89 +2563,21 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) */ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS| - IPPF_RTHDR|IPPF_IFINDEX)) { - if ((udp_bits.udpb_ipv6_recvhopopts) && - (ipp.ipp_fields & IPPF_HOPOPTS)) { - size_t hlen; - - UDP_STAT(us, udp_in_recvhopopts); - hlen = copy_hop_opts(&ipp, NULL); - if (hlen == 0) - ipp.ipp_fields &= ~IPPF_HOPOPTS; - udi_size += hlen; - } - if (((udp_bits.udpb_ipv6_recvdstopts) || - udp_bits.udpb_old_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - UDP_STAT(us, udp_in_recvdstopts); - } - if ((((udp_bits.udpb_ipv6_recvdstopts) && - udp_bits.udpb_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) || - (udp_bits.udpb_ipv6_recvrthdrdstopts)) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - UDP_STAT(us, udp_in_recvrtdstopts); - } - if ((udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - UDP_STAT(us, udp_in_recvrthdr); - } - if ((udp_bits.udpb_ip_recvpktinfo) && - (ipp.ipp_fields & IPPF_IFINDEX)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in6_pktinfo); - UDP_STAT(us, udp_in_recvpktinfo); - } - - } - if 
((udp_bits.udpb_recvucred) && - (cr = msg_getcred(mp, &cpid)) != NULL) { - udi_size += sizeof (struct T_opthdr) + ucredsize; - UDP_STAT(us, udp_in_recvucred); - } - - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (udp_bits.udpb_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - UDP_STAT(us, udp_in_timestamp); - } - - if (udp_bits.udpb_ipv6_recvhoplimit) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - UDP_STAT(us, udp_in_recvhoplimit); - } - - if (udp_bits.udpb_ipv6_recvtclass) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - UDP_STAT(us, udp_in_recvtclass); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin6_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -4059,7 +2586,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); tudi->OPT_length = udi_size; sin6 = (sin6_t *)&tudi[1]; - if (ipversion == IPV4_VERSION) { + if (ira->ira_flags & IRAF_IS_IPV4) { in6_addr_t v6dst; IN6_IPADDR_TO_V4MAPPED(((ipha_t *)rptr)->ipha_src, @@ -4069,196 +2596,43 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; sin6->__sin6_src_id = 
ip_srcid_find_addr(&v6dst, - connp->conn_zoneid, us->us_netstack); + IPCL_ZONEID(connp), us->us_netstack); } else { + ip6h = (ip6_t *)rptr; + sin6->sin6_addr = ip6h->ip6_src; /* No sin6_flowinfo per API */ sin6->sin6_flowinfo = 0; - /* For link-scope source pass up scope id */ - if ((ipp.ipp_fields & IPPF_IFINDEX) && - IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - sin6->sin6_scope_id = ipp.ipp_ifindex; + /* For link-scope pass up scope id */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) + sin6->sin6_scope_id = ira->ira_ruifindex; else sin6->sin6_scope_id = 0; sin6->__sin6_src_id = ip_srcid_find_addr( - &ip6h->ip6_dst, connp->conn_zoneid, + &ip6h->ip6_dst, IPCL_ZONEID(connp), us->us_netstack); } sin6->sin6_port = udpha->uha_src_port; - sin6->sin6_family = udp->udp_family; + sin6->sin6_family = connp->conn_family; if (udi_size != 0) { - uchar_t *dstopt; - - dstopt = (uchar_t *)&sin6[1]; - if ((udp_bits.udpb_ip_recvpktinfo) && - (ipp.ipp_fields & IPPF_IFINDEX)) { - struct T_opthdr *toh; - struct in6_pktinfo *pkti; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pkti); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pkti = (struct in6_pktinfo *)dstopt; - if (ipversion == IPV6_VERSION) - pkti->ipi6_addr = ip6h->ip6_dst; - else - IN6_IPADDR_TO_V4MAPPED( - ((ipha_t *)rptr)->ipha_dst, - &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp.ipp_ifindex; - dstopt += sizeof (*pkti); - udi_size -= toh->len; - } - if (udp_bits.udpb_ipv6_recvhoplimit) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPLIMIT; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - if (ipversion == IPV6_VERSION) - *(uint_t *)dstopt = ip6h->ip6_hops; - else - *(uint_t *)dstopt = - ((ipha_t *)rptr)->ipha_ttl; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if 
(udp_bits.udpb_ipv6_recvtclass) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_TCLASS; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - if (ipversion == IPV6_VERSION) { - *(uint_t *)dstopt = - IPV6_FLOW_TCLASS(ip6h->ip6_flow); - } else { - ipha_t *ipha = (ipha_t *)rptr; - *(uint_t *)dstopt = - ipha->ipha_type_of_service; - } - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvhopopts) && - (ipp.ipp_fields & IPPF_HOPOPTS)) { - size_t hlen; - - hlen = copy_hop_opts(&ipp, dstopt); - dstopt += hlen; - udi_size -= hlen; - } - if ((udp_bits.udpb_ipv6_recvdstopts) && - (udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rtdstopts, dstopt, - ipp.ipp_rtdstoptslen); - dstopt += ipp.ipp_rtdstoptslen; - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDR; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen); - dstopt += ipp.ipp_rthdrlen; - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_dstopts, dstopt, - ipp.ipp_dstoptslen); - 
dstopt += ipp.ipp_dstoptslen; - udi_size -= toh->len; - } - if (cr != NULL) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_UCRED; - toh->len = sizeof (struct T_opthdr) + ucredsize; - toh->status = 0; - (void) cred2ucred(cr, cpid, &toh[1], rcr); - dstopt += toh->len; - udi_size -= toh->len; - } - if (udp_bits.udpb_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (uchar_t *)toh + toh->len; - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin6[1], udi_size); } -#undef sin6 - /* No IP_RECVDSTADDR for IPv6. */ } - BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams); - if (options_mp != NULL) - freeb(options_mp); - - udp_ulp_recv(connp, mp); + /* Walk past the headers unless IP_RECVHDR was set. 
*/ + if (!udp->udp_rcvhdr) { + mp->b_rptr = rptr + hdr_length; + pkt_len -= hdr_length; + } + BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams); + udp_ulp_recv(connp, mp1, pkt_len, ira); return; tossit: freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); } @@ -4386,23 +2760,34 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) needattr = B_TRUE; break; } + mutex_enter(&connp->conn_lock); + if (udp->udp_state == TS_DATA_XFER && + connp->conn_ixa->ixa_tsl != NULL) { + ts_label_t *tsl; + + tsl = connp->conn_ixa->ixa_tsl; + mlp.tme_flags |= MIB2_TMEF_IS_LABELED; + mlp.tme_doi = label2doi(tsl); + mlp.tme_label = *label2bslabel(tsl); + needattr = B_TRUE; + } + mutex_exit(&connp->conn_lock); /* * Create an IPv4 table entry for IPv4 entries and also * any IPv6 entries which are bound to in6addr_any * (i.e. anything a IPv4 peer could connect/send to). */ - if (udp->udp_ipversion == IPV4_VERSION || + if (connp->conn_ipversion == IPV4_VERSION || (udp->udp_state <= TS_IDLE && - IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) { + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { ude.udpEntryInfo.ue_state = state; /* * If in6addr_any this will set it to * INADDR_ANY */ - ude.udpLocalAddress = - V4_PART_OF_V6(udp->udp_v6src); - ude.udpLocalPort = ntohs(udp->udp_port); + ude.udpLocalAddress = connp->conn_laddr_v4; + ude.udpLocalPort = ntohs(connp->conn_lport); if (udp->udp_state == TS_DATA_XFER) { /* * Can potentially get here for @@ -4414,9 +2799,9 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) * this part of the code. */ ude.udpEntryInfo.ue_RemoteAddress = - V4_PART_OF_V6(udp->udp_v6dst); + connp->conn_faddr_v4; ude.udpEntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); + ntohs(connp->conn_fport); } else { ude.udpEntryInfo.ue_RemoteAddress = 0; ude.udpEntryInfo.ue_RemotePort = 0; @@ -4429,10 +2814,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) */ ude.udpInstance = (uint32_t)(uintptr_t)udp; ude.udpCreationProcess = - (udp->udp_open_pid < 0) ? 
+ (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - udp->udp_open_pid; - ude.udpCreationTime = udp->udp_open_time; + connp->conn_cpid; + ude.udpCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp_conn_ctl->b_cont, &mp_conn_tail, (char *)&ude, sizeof (ude)); @@ -4442,16 +2827,24 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) mp_attr_ctl->b_cont, &mp_attr_tail, (char *)&mlp, sizeof (mlp)); } - if (udp->udp_ipversion == IPV6_VERSION) { + if (connp->conn_ipversion == IPV6_VERSION) { ude6.udp6EntryInfo.ue_state = state; - ude6.udp6LocalAddress = udp->udp_v6src; - ude6.udp6LocalPort = ntohs(udp->udp_port); - ude6.udp6IfIndex = udp->udp_bound_if; + ude6.udp6LocalAddress = connp->conn_laddr_v6; + ude6.udp6LocalPort = ntohs(connp->conn_lport); + mutex_enter(&connp->conn_lock); + if (connp->conn_ixa->ixa_flags & + IXAF_SCOPEID_SET) { + ude6.udp6IfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + ude6.udp6IfIndex = connp->conn_bound_if; + } + mutex_exit(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { ude6.udp6EntryInfo.ue_RemoteAddress = - udp->udp_v6dst; + connp->conn_faddr_v6; ude6.udp6EntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); + ntohs(connp->conn_fport); } else { ude6.udp6EntryInfo.ue_RemoteAddress = sin6_null.sin6_addr; @@ -4464,10 +2857,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) */ ude6.udp6Instance = (uint32_t)(uintptr_t)udp; ude6.udp6CreationProcess = - (udp->udp_open_pid < 0) ? + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - udp->udp_open_pid; - ude6.udp6CreationTime = udp->udp_open_time; + connp->conn_cpid; + ude6.udp6CreationTime = connp->conn_open_time; (void) snmp_append_data2(mp6_conn_ctl->b_cont, &mp6_conn_tail, (char *)&ude6, @@ -4548,39 +2941,34 @@ udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, * passed in mp. This message is freed. 
*/ static void -udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, - t_scalar_t err) +udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) { struct T_unitdata_req *tudr; mblk_t *mp1; + uchar_t *destaddr; + t_scalar_t destlen; uchar_t *optaddr; t_scalar_t optlen; - if (DB_TYPE(mp) == M_DATA) { - ASSERT(destaddr != NULL && destlen != 0); - optaddr = NULL; - optlen = 0; - } else { - if ((mp->b_wptr < mp->b_rptr) || - (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { - goto done; - } - tudr = (struct T_unitdata_req *)mp->b_rptr; - destaddr = mp->b_rptr + tudr->DEST_offset; - if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || - destaddr + tudr->DEST_length < mp->b_rptr || - destaddr + tudr->DEST_length > mp->b_wptr) { - goto done; - } - optaddr = mp->b_rptr + tudr->OPT_offset; - if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || - optaddr + tudr->OPT_length < mp->b_rptr || - optaddr + tudr->OPT_length > mp->b_wptr) { - goto done; - } - destlen = tudr->DEST_length; - optlen = tudr->OPT_length; + if ((mp->b_wptr < mp->b_rptr) || + (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { + goto done; } + tudr = (struct T_unitdata_req *)mp->b_rptr; + destaddr = mp->b_rptr + tudr->DEST_offset; + if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || + destaddr + tudr->DEST_length < mp->b_rptr || + destaddr + tudr->DEST_length > mp->b_wptr) { + goto done; + } + optaddr = mp->b_rptr + tudr->OPT_offset; + if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || + optaddr + tudr->OPT_length < mp->b_rptr || + optaddr + tudr->OPT_length > mp->b_wptr) { + goto done; + } + destlen = tudr->DEST_length; + optlen = tudr->OPT_length; mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, (char *)optaddr, optlen, err); @@ -4685,1093 +3073,721 @@ retry: return (port); } +/* + * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 + * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from + * the TPI options, otherwise we take them from msg_control. 
+ * If both sin and sin6 is set it is a connected socket and we use conn_faddr. + * Always consumes mp; never consumes tudr_mp. + */ static int -udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst) +udp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, + mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) { - int err; - cred_t *cred; - cred_t *orig_cred = NULL; - cred_t *effective_cred = NULL; - uchar_t opt_storage[IP_MAX_OPT_LENGTH]; - udp_t *udp = Q_TO_UDP(wq); + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; + int error; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + uint32_t flowinfo; + uint_t srcid; + int is_absreq_failure = 0; + conn_opt_arg_t coas, *coa; - /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. - */ - cred = orig_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + ASSERT(tudr_mp != NULL || msg != NULL); /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. tsol_check_dest() - * may create a new effective cred for this message with a - * modified label or label flags. Note that we use the cred/label - * from the message to handle MLP + * Get ixa before checking state to handle a disconnect race. + * + * We need an exclusive copy of conn_ixa since the ancillary data + * options might modify it. That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. 
*/ - if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION, - udp->udp_connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); + } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label(cred, dst, opt_storage, - us->us_netstack->netstack_ip)) != 0) - goto done; + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_options(&udp->udp_ip_snd_options, - &udp->udp_ip_snd_options_len, &udp->udp_label_len, - opt_storage)) != 0) + /* Get a copy of conn_xmit_ipp since the options might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); + } + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); goto done; + } /* - * Save the destination address and creds we used to - * generate the security label text. + * Parse the options and update ixa and ipp as a result. + * Note that ixa_tsl can be updated if SCM_UCRED. + * ixa_refrele/ixa_inactivate will release any reference on ixa_tsl. 
*/ - if (cred != udp->udp_effective_cred) { - if (udp->udp_effective_cred != NULL) - crfree(udp->udp_effective_cred); - crhold(cred); - udp->udp_effective_cred = cred; - } - if (orig_cred != udp->udp_last_cred) { - if (udp->udp_last_cred != NULL) - crfree(udp->udp_last_cred); - crhold(orig_cred); - udp->udp_last_cred = orig_cred; - } -done: - if (effective_cred != NULL) - crfree(effective_cred); - if (err != 0) { - DTRACE_PROBE4( - tx__ip__log__info__updatelabel__udp, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, wq, char *, opt_storage, mblk_t *, mp); - } - return (err); -} + coa = &coas; + coa->coa_connp = connp; + coa->coa_ixa = ixa; + coa->coa_ipp = ipp; + coa->coa_ancillary = B_TRUE; + coa->coa_changed = 0; -static mblk_t * -udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, - uint_t srcid, int *error, boolean_t insert_spi, struct nmsghdr *msg, - cred_t *cr, pid_t pid) -{ - udp_t *udp = connp->conn_udp; - mblk_t *mp1 = mp; - mblk_t *mp2; - ipha_t *ipha; - int ip_hdr_length; - uint32_t ip_len; - udpha_t *udpha; - boolean_t lock_held = B_FALSE; - in_port_t uha_src_port; - udpattrs_t attrs; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ip_opt_info_t optinfo; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - udp_stack_t *us = udp->udp_us; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - queue_t *q = connp->conn_wq; - ire_t *ire; - in6_addr_t v6dst; - boolean_t update_lastdst = B_FALSE; - - *error = 0; - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + if (msg != NULL) { + error = process_auxiliary_options(connp, msg->msg_control, + msg->msg_controllen, coa, &udp_opt_obj, udp_opt_set, cr); + } else { + struct T_unitdata_req *tudr; - if (v4dst == INADDR_ANY) - v4dst = htonl(INADDR_LOOPBACK); + tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 
+ ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); + error = tpi_optcom_buf(connp->conn_wq, tudr_mp, + &tudr->OPT_length, tudr->OPT_offset, cr, &udp_opt_obj, + coa, &is_absreq_failure); + } + if (error != 0) { + /* + * Note: No special action needed in this + * module for "is_absreq_failure" + */ + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; + } + ASSERT(is_absreq_failure == 0); + mutex_enter(&connp->conn_lock); /* - * If options passed in, feed it for verification and handling + * If laddr is unspecified then we look at sin6_src_id. + * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - attrs.udpattr_credset = B_FALSE; - if (IPCL_IS_NONSTR(connp)) { - if (msg->msg_controllen != 0) { - attrs.udpattr_ipp4 = pktinfop; - attrs.udpattr_mb = mp; - - rw_enter(&udp->udp_rwlock, RW_WRITER); - *error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - &attrs, &udp_opt_obj, udp_opt_set, cr); - rw_exit(&udp->udp_rwlock); - if (*error) - goto done; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else if (sin6 != NULL) { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - } else { - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req *) - mp->b_rptr)->OPT_length != 0) { - 
attrs.udpattr_ipp4 = pktinfop; - attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, - &attrs) < 0) - goto done; - /* - * Note: success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - ASSERT(*error == 0); - } + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + } else { + /* Connected case */ + v6dst = connp->conn_faddr_v6; + dstport = connp->conn_fport; + flowinfo = connp->conn_flowinfo; } + mutex_exit(&connp->conn_lock); - /* mp1 points to the M_DATA mblk carrying the packet */ - ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); - - /* - * Determine whether we need to mark the mblk with the user's - * credentials. - * If labeled then sockfs would have already done this. - */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - - ire = connp->conn_ire_cache; - if (CLASSD(v4dst) || (ire == NULL) || (ire->ire_addr != v4dst) || - (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { - if (cr != NULL && msg_getcred(mp, NULL) == NULL) - mblk_setcred(mp, cr, pid); + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (ipp->ipp_fields & IPPF_ADDR)) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } } - rw_enter(&udp->udp_rwlock, RW_READER); - lock_held = B_TRUE; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC); - /* - * Cluster and TSOL note: - * udp.udp_v6lastdst is shared by Cluster and TSOL - * udp.udp_lastdstport is used by Cluster - * - * Both Cluster and TSOL need to update the dest addr and/or port. - * Updating is done after both Cluster and TSOL checks, protected - * by conn_lock. - */ - mutex_enter(&connp->conn_lock); - - if (cl_inet_connect2 != NULL && - (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) || - V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst || - udp->udp_lastdstport != port)) { - mutex_exit(&connp->conn_lock); - *error = 0; - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, port, *error); - if (*error != 0) { - *error = EHOSTUNREACH; - goto done; + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; } - update_lastdst = B_TRUE; - mutex_enter(&connp->conn_lock); + /* FALLTHRU */ + default: + failed: + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, UDP packets - * to different destination may require different labels, - * or worse, UDP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ if (is_system_labeled()) { - cred_t *credp; - pid_t cpid; - /* Using UDP MLP requires SCM_UCRED from user */ if (connp->conn_mlp_type != mlptSingle && - !attrs.udpattr_credset) { - mutex_exit(&connp->conn_lock); - DTRACE_PROBE4( - tx__ip__log__info__output__udp, - char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)", - mblk_t *, mp, udpattrs_t *, &attrs, queue_t *, q); - *error = EINVAL; + !((ixa->ixa_flags & IXAF_UCRED_TSL))) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + error = ECONNREFUSED; + freemsg(mp); goto done; } /* - * Update label option for this UDP socket if - * - the destination has changed, - * - the UDP socket is MLP, or - * - the cred attached to the mblk changed. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. 
+ * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. */ - credp = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) || - V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst || - connp->conn_mlp_type != mlptSingle || - credp != udp->udp_last_cred) { - if ((*error = udp_update_label(q, mp, v4dst)) != 0) { - mutex_exit(&connp->conn_lock); - goto done; - } - update_lastdst = B_TRUE; + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } - - /* - * Attach the effective cred to the mblk to ensure future - * routing decisions will be based on it's label. - */ - mblk_setcred(mp, udp->udp_effective_cred, cpid); } - if (update_lastdst) { - IN6_IPADDR_TO_V4MAPPED(v4dst, &udp->udp_v6lastdst); - udp->udp_lastdstport = port; + mp = udp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, dstport, + flowinfo, mp, &error); + if (mp == NULL) { + ASSERT(error != 0); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } - if (udp->udp_ip_snd_options_len > 0) { - ip_snd_opt_len = udp->udp_ip_snd_options_len; - bcopy(udp->udp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + goto done; } - mutex_exit(&connp->conn_lock); + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - /* Add an IP header */ - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + ip_snd_opt_len + - (insert_spi ? 
sizeof (uint32_t) : 0); - ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length]; - if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) || - !OK_32PTR(ipha)) { - mp2 = allocb(ip_hdr_length + us->us_wroff_extra, BPRI_LO); - if (mp2 == NULL) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "allocbfail2"); - *error = ENOMEM; - goto done; - } - mp2->b_wptr = DB_LIM(mp2); - mp2->b_cont = mp1; - mp1 = mp2; - if (DB_TYPE(mp) != M_DATA) - mp->b_cont = mp1; - else - mp = mp1; - ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length); - } - ip_hdr_length -= (UDPH_SIZE + (insert_spi ? sizeof (uint32_t) : 0)); -#ifdef _BIG_ENDIAN - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) | - udp->udp_type_of_service); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (udp->udp_ttl << 8) | IPPROTO_UDP; -#else - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((udp->udp_type_of_service << 8) | - ((IP_VERSION << 4) | (ip_hdr_length>>2))); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (IPPROTO_UDP << 8) | udp->udp_ttl; -#endif - if (pktinfop->ip4_addr != INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } else { + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * Copy our address into the packet. If this is zero, - * first look at __sin6_src_id for a hint. If we leave the - * source as INADDR_ANY then ip will fill in the real source - * address. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src, ipha->ipha_src); - if (srcid != 0 && ipha->ipha_src == INADDR_ANY) { - in6_addr_t v6src; - - ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid, - us->us_netstack); - IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); - } - } - uha_src_port = udp->udp_port; - if (ip_hdr_length == IP_SIMPLE_HDR_LENGTH) { - rw_exit(&udp->udp_rwlock); - lock_held = B_FALSE; - } - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } +done: + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); +} - ipha->ipha_fragment_offset_and_flags = 0; - ipha->ipha_ident = 0; - - mp1->b_rptr = (uchar_t *)ipha; - - ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <= - (uintptr_t)UINT_MAX); +/* + * Handle sending an M_DATA for a connected socket. + * Handles both IPv4 and IPv6. + */ +static int +udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) +{ + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int error; + ip_xmit_attr_t *ixa; - /* Determine length of packet */ - ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha); - if ((mp2 = mp1->b_cont) != NULL) { - do { - ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); - ip_len += (uint32_t)MBLKL(mp2); - } while ((mp2 = mp2->b_cont) != NULL); - } /* - * If the size of the packet is greater than the maximum allowed by - * ip, return an error. 
Passing this down could cause panics because - * the size will have wrapped and be inconsistent with the msg size. + * If no other thread is using conn_ixa this just gets a reference to + * conn_ixa. Otherwise we get a safe copy of conn_ixa. */ - if (ip_len > IP_MAXPACKET) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "IP length exceeded"); - *error = EMSGSIZE; - goto done; + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); } - ipha->ipha_length = htons((uint16_t)ip_len); - ip_len -= ip_hdr_length; - ip_len = htons((uint16_t)ip_len); - udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length); - - /* Insert all-0s SPI now. */ - if (insert_spi) - *((uint32_t *)(udpha + 1)) = 0; - /* - * Copy in the destination address - */ - ipha->ipha_dst = v4dst; - - /* - * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic. - */ - if (CLASSD(v4dst)) - ipha->ipha_ttl = udp->udp_multicast_ttl; - - udpha->uha_dst_port = port; - udpha->uha_src_port = uha_src_port; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - if (ip_snd_opt_len > 0) { - uint32_t cksum; + mutex_enter(&connp->conn_lock); + mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_saddr_v6, + connp->conn_fport, connp->conn_flowinfo, &error); - bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len); - lock_held = B_FALSE; - rw_exit(&udp->udp_rwlock); - /* - * Massage source route putting first source route in ipha_dst. - * Ignore the destination in T_unitdata_req. - * Create a checksum adjustment for a source route, if any. 
- */ - cksum = ip_massage_options(ipha, us->us_netstack); - cksum = (cksum & 0xFFFF) + (cksum >> 16); - cksum -= ((ipha->ipha_dst >> 16) & 0xFFFF) + - (ipha->ipha_dst & 0xFFFF); - if ((int)cksum < 0) - cksum--; - cksum = (cksum & 0xFFFF) + (cksum >> 16); - /* - * IP does the checksum if uha_checksum is non-zero, - * We make it easy for IP to include our pseudo header - * by putting our length in uha_checksum. - */ - cksum += ip_len; - cksum = (cksum & 0xFFFF) + (cksum >> 16); - /* There might be a carry. */ - cksum = (cksum & 0xFFFF) + (cksum >> 16); -#ifdef _LITTLE_ENDIAN - if (us->us_do_checksum) - ip_len = (cksum << 16) | ip_len; -#else - if (us->us_do_checksum) - ip_len = (ip_len << 16) | cksum; - else - ip_len <<= 16; -#endif - } else { - /* - * IP does the checksum if uha_checksum is non-zero, - * We make it easy for IP to include our pseudo header - * by putting our length in uha_checksum. - */ - if (us->us_do_checksum) - ip_len |= (ip_len << 16); -#ifndef _LITTLE_ENDIAN - else - ip_len <<= 16; -#endif + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (error); } - ASSERT(!lock_held); - /* Set UDP length and checksum */ - *((uint32_t *)&udpha->uha_length) = ip_len; - if (DB_TYPE(mp) != M_DATA) { - cred_t *cr; - pid_t cpid; + /* + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. 
+ */ + if (ixa->ixa_ire == NULL) { + in6_addr_t faddr, saddr; + in6_addr_t nexthop; + in_port_t fport; + + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); + mutex_exit(&connp->conn_lock); - /* Move any cred from the T_UNITDATA_REQ to the packet */ - cr = msg_extractcred(mp, &cpid); - if (cr != NULL) { - if (mp1->b_datap->db_credp != NULL) - crfree(mp1->b_datap->db_credp); - mp1->b_datap->db_credp = cr; - mp1->b_datap->db_cpid = cpid; + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, + fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); } - ASSERT(mp != mp1); - freeb(mp); + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); } - - /* mp has been consumed and we'll return success */ - ASSERT(*error == 0); - mp = NULL; + ASSERT(ixa->ixa_ire != NULL); /* We're done. Pass the packet to ip. 
*/ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "end"); - - if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || - CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) || - connp->conn_dontroute || - connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 || - optinfo.ip_opt_ill_index != 0 || - ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT, ipst) || - ipst->ips_ip_g_mrouter != NULL) { - UDP_STAT(us, udp_ip_send); - ip_output_options(connp, mp1, connp->conn_wq, IP_WPUT, - &optinfo); - } else { - udp_send_data(udp, connp->conn_wq, mp1, ipha); - } -done: - if (lock_held) - rw_exit(&udp->udp_rwlock); - if (*error != 0) { - ASSERT(mp != NULL); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + break; } - return (mp); + ixa_refrele(ixa); + return (error); } -static void -udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) +/* + * Handle sending an M_DATA to the last destination. + * Handles both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, + ip_xmit_attr_t *ixa) { - conn_t *connp = udp->udp_connp; - ipaddr_t src, dst; - ire_t *ire; - ipif_t *ipif = NULL; - mblk_t *ire_fp_mp; - boolean_t retry_caching; - udp_stack_t *us = udp->udp_us; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - dst = ipha->ipha_dst; - src = ipha->ipha_src; - ASSERT(ipha->ipha_ident == 0); - - if (CLASSD(dst)) { - int err; - - ipif = conn_get_held_ipif(connp, - &connp->conn_multicast_ipif, &err); - - if (ipif == NULL || ipif->ipif_isv6 || - (ipif->ipif_ill->ill_phyint->phyint_flags & - PHYI_LOOPBACK)) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ip_send); - ip_output(connp, mp, q, IP_WPUT); - return; - } - } + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int error; - retry_caching = B_FALSE; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); - if (ire == NULL || ire->ire_addr != dst || - (ire->ire_marks & IRE_MARK_CONDEMNED)) { - retry_caching = B_TRUE; - } else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) { - ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - ASSERT(ipif != NULL); - if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill)) - retry_caching = B_TRUE; - } + mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_v6lastsrc, + connp->conn_lastdstport, connp->conn_lastflowinfo, &error); - if (!retry_caching) { - ASSERT(ire != NULL); - IRE_REFHOLD(ire); + if (mp == NULL) { + ASSERT(error != 0); mutex_exit(&connp->conn_lock); - } else { - boolean_t cached = B_FALSE; + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (error); + } - connp->conn_ire_cache = NULL; + /* + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we 
need to fill in any pointers in it. + */ + if (ixa->ixa_ire == NULL) { + in6_addr_t lastdst, lastsrc; + in6_addr_t nexthop; + in_port_t lastport; + + lastsrc = connp->conn_v6lastsrc; + lastdst = connp->conn_v6lastdst; + lastport = connp->conn_lastdstport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL) { - IRE_REFRELE_NOTR(ire); - ire = NULL; - } - - if (CLASSD(dst)) { - ASSERT(ipif != NULL); - ire = ire_ctable_lookup(dst, 0, 0, ipif, - connp->conn_zoneid, msg_getlabel(mp), - MATCH_IRE_ILL, ipst); - } else { - ASSERT(ipif == NULL); - ire = ire_cache_lookup(dst, connp->conn_zoneid, - msg_getlabel(mp), ipst); - } - - if (ire == NULL) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ire_null); - ip_output(connp, mp, q, IP_WPUT); - return; - } - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - irb_t *irb = ire->ire_bucket; - + error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, + &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | + IPDF_VERIFY_DST | IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * IRE's created for non-connection oriented transports - * are normally initialized with IRE_MARK_TEMPORARY set - * in the ire_marks. These IRE's are preferentially - * reaped when the hash chain length in the cache - * bucket exceeds the maximum value specified in - * ip[6]_ire_max_bucket_cnt. This can severely affect - * UDP performance if IRE cache entries that we need - * to reuse are continually removed. To remedy this, - * when we cache the IRE in the conn_t, we remove the - * IRE_MARK_TEMPORARY bit from the ire_marks if it was - * set. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - rw_enter(&irb->irb_lock, RW_WRITER); - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } - connp->conn_ire_cache = ire; - cached = B_TRUE; + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); } + } else { + /* Done with conn_t */ mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it was not - * cached, we should drop the extra reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); } - ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION); - ASSERT(!CLASSD(dst) || ipif != NULL); - /* - * Check if we can take the fast-path. - * Note that "incomplete" ire's (where the link-layer for next hop - * is not resolved, or where the fast-path header in nce_fp_mp is not - * available yet) are sent down the legacy (slow) path - */ - if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) || - (ire->ire_flags & RTF_MULTIRT) || (ire->ire_stq == NULL) || - (ire->ire_max_frag < ntohs(ipha->ipha_length)) || - ((ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL)) || - connp->conn_nexthop_set || (MBLKL(ire_fp_mp) > MBLKHEAD(mp))) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ip_ire_send); - IRE_REFRELE(ire); - ip_output(connp, mp, q, IP_WPUT); - return; - } + /* We're done. Pass the packet to ip. 
 */ + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - if (src == INADDR_ANY && !connp->conn_unspec_src) { - if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC)) - ipha->ipha_src = ipif->ipif_src_addr; + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; else - ipha->ipha_src = ire->ire_src_addr; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } - - if (ipif != NULL) - ipif_refrele(ipif); - - udp_xmit(connp->conn_wq, mp, ire, connp, connp->conn_zoneid); + ixa_refrele(ixa); + return (error); } -static void -udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) + +/* + * Prepend the header template and then fill in the source and + * flowinfo. The caller needs to handle the destination address since + * its setting is different if rthdr or source route. + * + * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET. + * When it returns NULL it sets errorp. 
+ */ +static mblk_t * +udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, + const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp) { - ipaddr_t src, dst; - ill_t *ill; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint16_t *up; - uint32_t cksum, hcksum_txflags; - queue_t *dev_q; - udp_t *udp = connp->conn_udp; - ipha_t *ipha = (ipha_t *)mp->b_rptr; + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - boolean_t ll_multicast = B_FALSE; - boolean_t direct_send; - - dev_q = ire->ire_stq->q_next; - ASSERT(dev_q != NULL); + boolean_t insert_spi = udp->udp_nat_t_endpoint; + uint_t pktlen; + uint_t alloclen; + uint_t copylen; + uint8_t *iph; + uint_t ip_hdr_length; + udpha_t *udpha; + uint32_t cksum; + ip_pkt_t *ipp; - ill = ire_to_ill(ire); - ASSERT(ill != NULL); + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * For the direct send case, if resetting of conn_direct_blocked - * was missed, it is still ok because the putq() would enable - * the queue and write service will drain it out. + * Copy the header template and leave space for an SPI */ - direct_send = ILL_DIRECT_CAPABLE(ill); - - /* is queue flow controlled? 
*/ - if ((!direct_send) && (q->q_first != NULL || connp->conn_draining || - DEV_Q_FLOW_BLOCKED(dev_q))) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (ipst->ips_ip_output_queue) { - DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp); - (void) putq(connp->conn_wq, mp); - } else { - freemsg(mp); - } - ire_refrele(ire); - return; - } - - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - ASSERT(MBLKHEAD(mp) >= ire_fp_mp_len); - - dst = ipha->ipha_dst; - src = ipha->ipha_src; - - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } else { - hcksum_txflags = 0; - } - - /* pseudo-header checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); - up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - if (*up != 0) { - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, - mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, - ntohs(ipha->ipha_length), cksum); - - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - UDP_STAT(us, udp_out_sw_cksum); - UDP_STAT_UPDATE(us, udp_out_sw_cksum_bytes, - ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); - } - } - - if (!CLASSD(dst)) { - ipha->ipha_fragment_offset_and_flags |= - (uint32_t)htons(ire->ire_frag_flag); - } - - /* Calculate IP header checksum if hardware isn't capable */ - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); + copylen = connp->conn_ht_iphc_len; + alloclen = copylen + (insert_spi ? 
sizeof (uint32_t) : 0); + pktlen = alloclen + msgdsize(mp); + if (pktlen > IP_MAXPACKET) { + freemsg(mp); + *errorp = EMSGSIZE; + return (NULL); } + ixa->ixa_pktlen = pktlen; - if (CLASSD(dst)) { - if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) { - ip_multicast_loopback(q, ill, mp, - connp->conn_multicast_loop ? 0 : - IP_FF_NO_MCAST_LOOP, zoneid); - } + /* check/fix buffer config, setup pointers into it */ + iph = mp->b_rptr - alloclen; + if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { + mblk_t *mp1; - /* If multicast TTL is 0 then we are done */ - if (ipha->ipha_ttl == 0) { + mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED); + if (mp1 == NULL) { freemsg(mp); - ire_refrele(ire); - return; + *errorp = ENOMEM; + return (NULL); } - ll_multicast = B_TRUE; + mp1->b_wptr = DB_LIM(mp1); + mp1->b_cont = mp; + mp = mp1; + iph = (mp->b_wptr - alloclen); } + mp->b_rptr = iph; + bcopy(connp->conn_ht_iphc, iph, copylen); + ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); + ixa->ixa_ip_hdr_length = ip_hdr_length; + udpha = (udpha_t *)(iph + ip_hdr_length); - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp, - ll_multicast, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (ipst->ips_ip4_observe.he_interested && mp != NULL) { - zoneid_t szone; - - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. 
- */ - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= ire_fp_mp_len; - } + /* + * Setup header length and prepare for ULP checksum done in IP. + * udp_build_hdr_template has already massaged any routing header + * and placed the result in conn_sum. + * + * We make it easy for IP to include our pseudo header + * by putting our length in uha_checksum. + */ + cksum = pktlen - ip_hdr_length; + udpha->uha_length = htons(cksum); - if (mp == NULL) - goto bail; + cksum += connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + ipp = &connp->conn_xmit_ipp; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)iph; - if (direct_send) { - uintptr_t cookie; - ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + ipha->ipha_length = htons((uint16_t)pktlen); - cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, - (uintptr_t)connp, 0); - if (cookie != NULL) { - idl_tx_list_t *idl_txl; + /* IP does the checksum if uha_checksum is non-zero */ + if (us->us_do_checksum) + udpha->uha_checksum = htons(cksum); - /* - * Flow controlled. - */ - DTRACE_PROBE2(non__null__cookie, uintptr_t, - cookie, conn_t *, connp); - idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; - mutex_enter(&idl_txl->txl_lock); - /* - * Check again after holding txl_lock to see if Tx - * ring is still blocked and only then insert the - * connp into the drain list. 
- */ - if (connp->conn_direct_blocked || - (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, - cookie) == 0)) { - mutex_exit(&idl_txl->txl_lock); - goto bail; - } - if (idl_txl->txl_cookie != NULL && - idl_txl->txl_cookie != cookie) { - DTRACE_PROBE2(udp__xmit__collision, - uintptr_t, cookie, - uintptr_t, idl_txl->txl_cookie); - UDP_STAT(us, udp_cookie_coll); - } else { - connp->conn_direct_blocked = B_TRUE; - idl_txl->txl_cookie = cookie; - conn_drain_insert(connp, idl_txl); - DTRACE_PROBE1(udp__xmit__insert, - conn_t *, connp); - } - mutex_exit(&idl_txl->txl_lock); + /* if IP_PKTINFO specified an address it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); + ipha->ipha_src = ipp->ipp_addr_v4; + } else { + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + } } else { - DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp); - putnext(ire->ire_stq, mp); - } -bail: - IRE_REFRELE(ire); -} + ip6_t *ip6h = (ip6_t *)iph; -static boolean_t -udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst) -{ - udp_t *udp = Q_TO_UDP(wq); - int err; - cred_t *cred; - cred_t *orig_cred; - cred_t *effective_cred = NULL; - uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; - udp_stack_t *us = udp->udp_us; - - /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. - */ - cred = orig_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); - - /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. tsol_check_dest() - * may create a new effective cred for this message with a - * modified label or label flags. Note that we use the - * cred/label from the message to handle MLP. 
- */ - if ((err = tsol_check_dest(cred, dst, IPV6_VERSION, - udp->udp_connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; - - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label_v6(cred, dst, opt_storage, - us->us_netstack->netstack_ip)) != 0) - goto done; - - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_sticky(&udp->udp_sticky_ipp, - &udp->udp_label_len_v6, opt_storage)) != 0) - goto done; + ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); + udpha->uha_checksum = htons(cksum); - /* - * Save the destination address and cred we used to - * generate the security label text. - */ - if (cred != udp->udp_effective_cred) { - if (udp->udp_effective_cred != NULL) - crfree(udp->udp_effective_cred); - crhold(cred); - udp->udp_effective_cred = cred; - } - if (orig_cred != udp->udp_last_cred) { - if (udp->udp_last_cred != NULL) - crfree(udp->udp_last_cred); - crhold(orig_cred); - udp->udp_last_cred = orig_cred; + /* if IP_PKTINFO specified an address it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); + ip6h->ip6_src = ipp->ipp_addr; + } else { + ip6h->ip6_src = *v6src; + } + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); + } } -done: - if (effective_cred != NULL) - crfree(effective_cred); 
*/ + if (insert_spi) + *((uint32_t *)(udpha + 1)) = 0; - if (err != 0) { - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__udp6, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, wq, char *, opt_storage, mblk_t *, mp); - } - return (err); + udpha->uha_dst_port = dstport; + return (mp); } -static int -udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr, - pid_t pid) +/* + * Send a T_UDERR_IND in response to an M_DATA + */ +static void +udp_ud_err_connected(conn_t *connp, t_scalar_t error) { - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; - ipaddr_t v4dst; - in_port_t dstport; - boolean_t mapped_addr; struct sockaddr_storage ss; sin_t *sin; sin6_t *sin6; struct sockaddr *addr; socklen_t addrlen; - int error; - boolean_t insert_spi = udp->udp_nat_t_endpoint; - - /* M_DATA for connected socket */ - - ASSERT(udp->udp_issocket); - UDP_DBGSTAT(us, udp_data_conn); + mblk_t *mp1; mutex_enter(&connp->conn_lock); - if (udp->udp_state != TS_DATA_XFER) { - mutex_exit(&connp->conn_lock); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); - UDP_STAT(us, udp_out_err_notconn); - freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: connp %p (%S)", connp, - "not-connected; address required"); - return (EDESTADDRREQ); - } - - mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst); - if (mapped_addr) - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst); - /* Initialize addr and addrlen as if they're passed in */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = (sin_t *)&ss; + *sin = sin_null; sin->sin_family = AF_INET; - dstport = sin->sin_port = udp->udp_dstport; - ASSERT(mapped_addr); - sin->sin_addr.s_addr = v4dst; + sin->sin_port = connp->conn_fport; + sin->sin_addr.s_addr = connp->conn_faddr_v4; addr = (struct sockaddr *)sin; addrlen = sizeof (*sin); } else { sin6 = (sin6_t *)&ss; + *sin6 = sin6_null; sin6->sin6_family = AF_INET6; - dstport = sin6->sin6_port = udp->udp_dstport; - 
sin6->sin6_flowinfo = udp->udp_flowinfo; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_scope_id = 0; + sin6->sin6_port = connp->conn_fport; + sin6->sin6_flowinfo = connp->conn_flowinfo; + sin6->sin6_addr = connp->conn_faddr_v6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } sin6->__sin6_src_id = 0; addr = (struct sockaddr *)sin6; addrlen = sizeof (*sin6); } mutex_exit(&connp->conn_lock); - if (mapped_addr) { - /* - * Handle both AF_INET and AF_INET6; the latter - * for IPV4 mapped destination addresses. Note - * here that both addr and addrlen point to the - * corresponding struct depending on the address - * family of the socket. - */ - mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error, - insert_spi, msg, cr, pid); - } else { - mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, pid); - } - if (error == 0) { - ASSERT(mp == NULL); - return (0); - } - - UDP_STAT(us, udp_out_err_output); - ASSERT(mp != NULL); - if (IPCL_IS_NONSTR(connp)) { - freemsg(mp); - return (error); - } else { - /* mp is freed by the following routine */ - udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, - (t_scalar_t)addrlen, (t_scalar_t)error); - return (0); - } -} - -/* ARGSUSED */ -static int -udp_send_not_connected(conn_t *connp, mblk_t *mp, struct sockaddr *addr, - socklen_t addrlen, struct nmsghdr *msg, cred_t *cr, pid_t pid) -{ - - udp_t *udp = connp->conn_udp; - boolean_t insert_spi = udp->udp_nat_t_endpoint; - int error = 0; - sin6_t *sin6; - sin_t *sin; - uint_t srcid; - uint16_t port; - ipaddr_t v4dst; - - - ASSERT(addr != NULL); - - switch (udp->udp_family) { - case AF_INET6: - sin6 = (sin6_t *)addr; - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - /* - * Destination is a non-IPv4-compatible IPv6 address. - * Send out an IPv6 format packet. 
- */ - mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, - pid); - if (error != 0) - goto ud_error; - - return (0); - } - /* - * If the local address is not zero or a mapped address - * return an error. It would be possible to send an IPv4 - * packet but the response would never make it back to the - * application since it is bound to a non-mapped address. - */ - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - error = EADDRNOTAVAIL; - goto ud_error; - } - /* Send IPv4 packet without modifying udp_ipversion */ - /* Extract port and ipaddr */ - port = sin6->sin6_port; - IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); - srcid = sin6->__sin6_src_id; - break; - - case AF_INET: - sin = (sin_t *)addr; - /* Extract port and ipaddr */ - port = sin->sin_port; - v4dst = sin->sin_addr.s_addr; - srcid = 0; - break; - } - - mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi, - msg, cr, pid); - - if (error == 0) { - ASSERT(mp == NULL); - return (0); - } - -ud_error: - ASSERT(mp != NULL); - - return (error); + mp1 = mi_tpi_uderror_ind((char *)addr, addrlen, NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } /* @@ -5788,15 +3804,20 @@ ud_error: void udp_wput(queue_t *q, mblk_t *mp) { + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; int error = 0; - struct sockaddr *addr; + struct sockaddr *addr = NULL; socklen_t addrlen; udp_stack_t *us = udp->udp_us; - - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, - "udp_wput_start: queue %p mp %p", q, mp); + struct T_unitdata_req *tudr; + mblk_t *data_mp; + ushort_t ipversion; + cred_t *cr; + pid_t pid; /* * We directly handle several cases here: T_UNITDATA_REQ message @@ -5805,910 +3826,612 @@ udp_wput(queue_t *q, mblk_t *mp) */ switch (DB_TYPE(mp)) { case M_DATA: - /* - * Quick check for error cases. 
Checks will be done again - * under the lock later on - */ if (!udp->udp_issocket || udp->udp_state != TS_DATA_XFER) { /* Not connected; address is required */ BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_DBGSTAT(us, udp_data_notconn); UDP_STAT(us, udp_out_err_notconn); freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: connp %p (%S)", connp, - "not-connected; address required"); return; } - (void) udp_send_connected(connp, mp, NULL, NULL, -1); + /* + * All Solaris components should pass a db_credp + * for this message, hence we ASSERT. + * On production kernels we return an error to be robust against + * random streams modules sitting on top of us. + */ + cr = msg_getcred(mp, &pid); + ASSERT(cr != NULL); + if (cr == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return; + } + ASSERT(udp->udp_issocket); + UDP_DBGSTAT(us, udp_data_conn); + error = udp_output_connected(connp, mp, cr, pid); + if (error != 0) { + UDP_STAT(us, udp_out_err_output); + if (connp->conn_rq != NULL) + udp_ud_err_connected(connp, (t_scalar_t)error); +#ifdef DEBUG + printf("udp_output_connected returned %d\n", error); +#endif + } return; case M_PROTO: - case M_PCPROTO: { - struct T_unitdata_req *tudr; - - ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX); + case M_PCPROTO: tudr = (struct T_unitdata_req *)mp->b_rptr; - - /* Handle valid T_UNITDATA_REQ here */ - if (MBLKL(mp) >= sizeof (*tudr) && - ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) { - if (mp->b_cont == NULL) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EPROTO; - goto ud_error; - } - - if (!MBLKIN(mp, 0, tudr->DEST_offset + - tudr->DEST_length)) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EADDRNOTAVAIL; - goto ud_error; - } - /* - * If a port has not been bound to the stream, fail. 
- * This is not a problem when sockfs is directly - * above us, because it will ensure that the socket - * is first bound before allowing data to be sent. - */ - if (udp->udp_state == TS_UNBND) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "outstate"); - error = EPROTO; - goto ud_error; - } - addr = (struct sockaddr *) - &mp->b_rptr[tudr->DEST_offset]; - addrlen = tudr->DEST_length; - if (tudr->OPT_length != 0) - UDP_STAT(us, udp_out_opt); - break; + if (MBLKL(mp) < sizeof (*tudr) || + ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { + udp_wput_other(q, mp); + return; } - /* FALLTHRU */ - } + break; + default: udp_wput_other(q, mp); return; } - ASSERT(addr != NULL); - error = udp_send_not_connected(connp, mp, addr, addrlen, NULL, NULL, - -1); - if (error != 0) { -ud_error: - UDP_STAT(us, udp_out_err_output); - ASSERT(mp != NULL); - /* mp is freed by the following routine */ - udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen, - (t_scalar_t)error); + /* Handle valid T_UNITDATA_REQ here */ + data_mp = mp->b_cont; + if (data_mp == NULL) { + error = EPROTO; + goto ud_error2; } -} + mp->b_cont = NULL; -/* ARGSUSED */ -static void -udp_wput_fallback(queue_t *wq, mblk_t *mp) -{ -#ifdef DEBUG - cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n"); -#endif - freemsg(mp); -} - - -/* - * udp_output_v6(): - * Assumes that udp_wput did some sanity checking on the destination - * address. 
- */ -static mblk_t * -udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error, - struct nmsghdr *msg, cred_t *cr, pid_t pid) -{ - ip6_t *ip6h; - ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ - mblk_t *mp1 = mp; - mblk_t *mp2; - int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - size_t ip_len; - udpha_t *udph; - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; - queue_t *q = connp->conn_wq; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - ip6_pkt_t *tipp; /* temporary ipp */ - uint32_t csum = 0; - uint_t ignore = 0; - uint_t option_exists = 0, is_sticky = 0; - uint8_t *cp; - uint8_t *nxthdr_ptr; - in6_addr_t ip6_dst; - in_port_t port; - udpattrs_t attrs; - boolean_t opt_present; - ip6_hbh_t *hopoptsptr = NULL; - uint_t hopoptslen = 0; - boolean_t is_ancillary = B_FALSE; - size_t sth_wroff = 0; - ire_t *ire; - boolean_t update_lastdst = B_FALSE; - - *error = 0; - - /* - * If the local address is a mapped address return - * an error. - * It would be possible to send an IPv6 packet but the - * response would never make it back to the application - * since it is bound to a mapped address. - */ - if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) { - *error = EADDRNOTAVAIL; - goto done; + if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; - /* - * If TPI options passed in, feed it for verification and handling + * All Solaris components should pass a db_credp + * for this TPI message, hence we should ASSERT. + * However, RPC (svc_clts_ksend) does this odd thing where it + * passes the options from a T_UNITDATA_IND unchanged in a + * T_UNITDATA_REQ. While that is the right thing to do for + * some options, SCM_UCRED being the key one, this also makes it + * pass down IP_RECVDSTADDR. Hence we can't ASSERT here. 
*/ - attrs.udpattr_credset = B_FALSE; - opt_present = B_FALSE; - if (IPCL_IS_NONSTR(connp)) { - if (msg->msg_controllen != 0) { - attrs.udpattr_ipp6 = ipp; - attrs.udpattr_mb = mp; - - rw_enter(&udp->udp_rwlock, RW_WRITER); - *error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - &attrs, &udp_opt_obj, udp_opt_set, cr); - rw_exit(&udp->udp_rwlock); - if (*error) - goto done; - ASSERT(*error == 0); - opt_present = B_TRUE; - } - } else { - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req *) - mp->b_rptr)->OPT_length != 0) { - attrs.udpattr_ipp6 = ipp; - attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, - &attrs) < 0) { - goto done; - } - ASSERT(*error == 0); - opt_present = B_TRUE; - } - } + cr = msg_getcred(mp, &pid); + if (cr == NULL) { + cr = connp->conn_cred; + pid = connp->conn_cpid; } /* - * Determine whether we need to mark the mblk with the user's - * credentials. - * If labeled then sockfs would have already done this. + * If a port has not been bound to the stream, fail. + * This is not a problem when sockfs is directly + * above us, because it will ensure that the socket + * is first bound before allowing data to be sent. */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - ire = connp->conn_ire_cache; - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || (ire == NULL) || - (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &sin6->sin6_addr)) || - (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) { - if (cr != NULL && msg_getcred(mp, NULL) == NULL) - mblk_setcred(mp, cr, pid); - } - - rw_enter(&udp->udp_rwlock, RW_READER); - ignore = ipp->ipp_sticky_ignored; - - /* mp1 points to the M_DATA mblk carrying the packet */ - ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); - - if (sin6->sin6_scope_id != 0 && - IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { - /* - * IPPF_SCOPE_ID is special. It's neither a sticky - * option nor ancillary data. It needs to be - * explicitly set in options_exists. 
- */ - option_exists |= IPPF_SCOPE_ID; + if (udp->udp_state == TS_UNBND) { + error = EPROTO; + goto ud_error2; } + addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; + addrlen = tudr->DEST_length; - /* - * Compute the destination address - */ - ip6_dst = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - ip6_dst = ipv6_loopback; - - port = sin6->sin6_port; - - /* - * Cluster and TSOL notes, Cluster check: - * see comments in udp_output_v4(). - */ - mutex_enter(&connp->conn_lock); - - if (cl_inet_connect2 != NULL && - (!IN6_ARE_ADDR_EQUAL(&ip6_dst, &udp->udp_v6lastdst) || - port != udp->udp_lastdstport)) { - mutex_exit(&connp->conn_lock); - *error = 0; - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &ip6_dst, port, *error); - if (*error != 0) { - *error = EHOSTUNREACH; - rw_exit(&udp->udp_rwlock); - goto done; + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || + (sin6->sin6_family != AF_INET6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - update_lastdst = B_TRUE; - mutex_enter(&connp->conn_lock); - } - /* - * If we're not going to the same destination as last time, then - * recompute the label required. This is done in a separate routine to - * avoid blowing up our stack here. - * - * TSOL Note: Since we are not in WRITER mode, UDP packets - * to different destination may require different labels, - * or worse, UDP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, sticky ipp_hopopts, - * and sticky ipp_hopoptslen are consistent for the current - * destination and are updated atomically. - */ - if (is_system_labeled()) { - cred_t *credp; - pid_t cpid; + srcid = sin6->__sin6_src_id; + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. 
+ */ - /* Using UDP MLP requires SCM_UCRED from user */ - if (connp->conn_mlp_type != mlptSingle && - !attrs.udpattr_credset) { - DTRACE_PROBE4( - tx__ip__log__info__output__udp6, - char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)", - mblk_t *, mp1, udpattrs_t *, &attrs, queue_t *, q); - *error = EINVAL; - rw_exit(&udp->udp_rwlock); - mutex_exit(&connp->conn_lock); - goto done; - } - /* - * update label option for this UDP socket if - * - the destination has changed, - * - the UDP socket is MLP, or - * - the cred attached to the mblk changed. - */ - credp = msg_getcred(mp, &cpid); - if (opt_present || - !IN6_ARE_ADDR_EQUAL(&udp->udp_v6lastdst, &ip6_dst) || - connp->conn_mlp_type != mlptSingle || - credp != udp->udp_last_cred) { - if ((*error = udp_update_label_v6(q, mp, &ip6_dst)) - != 0) { - rw_exit(&udp->udp_rwlock); - mutex_exit(&connp->conn_lock); - goto done; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. + */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - update_lastdst = B_TRUE; - } - /* - * Attach the effective cred to the mblk to ensure future - * routing decisions will be based on it's label. - */ - mblk_setcred(mp, udp->udp_effective_cred, cpid); - } - if (update_lastdst) { - udp->udp_v6lastdst = ip6_dst; - udp->udp_lastdstport = port; - } + UDP_DBGSTAT(us, udp_out_ipv6); - /* - * If there's a security label here, then we ignore any options the - * user may try to set. We keep the peer's label as a hidden sticky - * option. We make a private copy of this label before releasing the - * lock so that label is kept consistent with the destination addr. 
- */ - if (udp->udp_label_len_v6 > 0) { - ignore &= ~IPPF_HOPOPTS; - ipp->ipp_fields &= ~IPPF_HOPOPTS; - } + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + ipversion = IPV6_VERSION; + } else { + if (connp->conn_ipv6_v6only) { + error = EADDRNOTAVAIL; + goto ud_error2; + } - if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) { - /* No sticky options nor ancillary data. */ - mutex_exit(&connp->conn_lock); - goto no_options; - } + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an + * IPv4 packet but the response would never make it + * back to the application since it is bound to a + * non-mapped address. + */ + if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; + } + UDP_DBGSTAT(us, udp_out_mapped); - /* - * Go through the options figuring out where each is going to - * come from and build two masks. The first mask indicates if - * the option exists at all. The second mask indicates if the - * option is sticky or ancillary. 
- */ - if (!(ignore & IPPF_HOPOPTS)) { - if (ipp->ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - udp_ip_hdr_len += ipp->ipp_hopoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - is_sticky |= IPPF_HOPOPTS; - ASSERT(udp->udp_sticky_ipp.ipp_hopoptslen != 0); - hopoptsptr = kmem_alloc( - udp->udp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP); - if (hopoptsptr == NULL) { - *error = ENOMEM; - mutex_exit(&connp->conn_lock); - goto done; + if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) { + V4_PART_OF_V6(sin6->sin6_addr) = + htonl(INADDR_LOOPBACK); } - hopoptslen = udp->udp_sticky_ipp.ipp_hopoptslen; - bcopy(udp->udp_sticky_ipp.ipp_hopopts, hopoptsptr, - hopoptslen); - udp_ip_hdr_len += hopoptslen; + ipversion = IPV4_VERSION; } - } - mutex_exit(&connp->conn_lock); - if (!(ignore & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - udp_ip_hdr_len += ipp->ipp_rthdrlen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - is_sticky |= IPPF_RTHDR; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rthdrlen; - } - } + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. 
+ */ + if (udp->udp_state == TS_DATA_XFER && + !conn_same_as_last_v6(connp, sin6)) { + error = EISCONN; + goto ud_error2; + } + UDP_STAT(us, udp_out_opt); + error = udp_output_ancillary(connp, NULL, sin6, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - udp_ip_hdr_len += ipp->ipp_rtdstoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - is_sticky |= IPPF_RTDSTOPTS; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rtdstoptslen; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v6 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + UDP_DBGSTAT(us, udp_out_lastdst); + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + UDP_DBGSTAT(us, udp_out_diffdst); + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, data_mp, NULL, + sin6, ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } - } - - if (!(ignore & IPPF_DSTOPTS)) { - if (ipp->ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - udp_ip_hdr_len += ipp->ipp_dstoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - is_sticky |= IPPF_DSTOPTS; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_dstoptslen; + if (error == 0) { + freeb(mp); + return; } - } + break; - if (!(ignore & IPPF_IFINDEX)) { - if (ipp->ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - } else if 
(udp->udp_sticky_ipp.ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - is_sticky |= IPPF_IFINDEX; + case AF_INET: + sin = (sin_t *)addr; + if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || + (sin->sin_family != AF_INET)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - } + UDP_DBGSTAT(us, udp_out_ipv4); + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ipversion = IPV4_VERSION; - if (!(ignore & IPPF_ADDR)) { - if (ipp->ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - is_sticky |= IPPF_ADDR; - } - } + srcid = 0; + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. + */ + if (udp->udp_state == TS_DATA_XFER && + !conn_same_as_last_v4(connp, sin)) { + error = EISCONN; + goto ud_error2; + } + UDP_STAT(us, udp_out_opt); + error = udp_output_ancillary(connp, sin, NULL, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (!(ignore & IPPF_DONTFRAG)) { - if (ipp->ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - is_sticky |= IPPF_DONTFRAG; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v4 to handle concurrent + * send* calls on a socket. 
+ */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + UDP_DBGSTAT(us, udp_out_lastdst); + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + UDP_DBGSTAT(us, udp_out_diffdst); + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, data_mp, sin, + NULL, ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } - } - - if (!(ignore & IPPF_USE_MIN_MTU)) { - if (ipp->ipp_fields & IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - } else if (udp->udp_sticky_ipp.ipp_fields & - IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - is_sticky |= IPPF_USE_MIN_MTU; + if (error == 0) { + freeb(mp); + return; } + break; } + UDP_STAT(us, udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(q, mp, (t_scalar_t)error); + return; - if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT)) - option_exists |= IPPF_HOPLIMIT; - /* IPV6_HOPLIMIT can never be sticky */ - ASSERT(!(udp->udp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT)); +ud_error2: + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(data_mp); + UDP_STAT(us, udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(q, mp, (t_scalar_t)error); +} - if (!(ignore & IPPF_UNICAST_HOPS) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) { - option_exists |= IPPF_UNICAST_HOPS; - is_sticky |= IPPF_UNICAST_HOPS; - } +/* + * Handle the case of the IP address, port, flow label being different + * for both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, + ushort_t ipversion, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) +{ + uint_t srcid; + uint32_t flowinfo; + udp_t *udp = connp->conn_udp; + int error = 0; + ip_xmit_attr_t *oldixa; + udp_stack_t *us = udp->udp_us; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; - if (!(ignore & IPPF_MULTICAST_HOPS) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) { - option_exists |= IPPF_MULTICAST_HOPS; - is_sticky |= IPPF_MULTICAST_HOPS; - } + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); + /* + * We hold conn_lock across all the use and modifications of + * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they + * stay consistent. + */ - if (!(ignore & IPPF_TCLASS)) { - if (ipp->ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - is_sticky |= IPPF_TCLASS; - } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - if (!(ignore & IPPF_NEXTHOP) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_NEXTHOP)) { - option_exists |= IPPF_NEXTHOP; - is_sticky |= IPPF_NEXTHOP; + /* + * If we are connected then the destination needs to be the + * same as the connected one, which is not the case here since we + * checked for that above. + */ + if (udp->udp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; } -no_options: + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); /* - * If any options carried in the ip6i_t were specified, we - * need to account for the ip6i_t in the data we'll be sending - * down. + * If laddr is unspecified then we look at sin6_src_id. 
+ * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (option_exists & IPPF_HAS_IP6I) - udp_ip_hdr_len += sizeof (ip6i_t); - - /* check/fix buffer config, setup pointers into it */ - ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len]; - if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) || - !OK_32PTR(ip6h)) { - - /* Try to get everything in a single mblk next time */ - if (udp_ip_hdr_len > udp->udp_max_hdr_len) { - udp->udp_max_hdr_len = udp_ip_hdr_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + srcid = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); } - - mp2 = allocb(udp_ip_hdr_len + us->us_wroff_extra, BPRI_LO); - if (mp2 == NULL) { - *error = ENOMEM; - rw_exit(&udp->udp_rwlock); - goto done; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - mp2->b_wptr = DB_LIM(mp2); - mp2->b_cont = mp1; - mp1 = mp2; - if (DB_TYPE(mp) != M_DATA) - mp->b_cont = mp1; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; 
else - mp = mp1; - - ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len); + ixa->ixa_flags &= ~IXAF_IS_IPV4; } - mp1->b_rptr = (unsigned char *)ip6h; - ip6i = (ip6i_t *)ip6h; - -#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &udp->udp_sticky_ipp : ipp) - if (option_exists & IPPF_HAS_IP6I) { - ip6h = (ip6_t *)&ip6i[1]; - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - - /* sin6_scope_id takes precendence over IPPF_IFINDEX */ - if (option_exists & IPPF_SCOPE_ID) { - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = sin6->sin6_scope_id; - } else if (option_exists & IPPF_IFINDEX) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX); - ASSERT(tipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = tipp->ipp_ifindex; - } - - if (option_exists & IPPF_ADDR) { - /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. - */ - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } - - if (option_exists & IPPF_DONTFRAG) { - ip6i->ip6i_flags |= IP6I_DONTFRAG; - } + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { + ip_pkt_t *ipp = &connp->conn_xmit_ipp; - if (option_exists & IPPF_USE_MIN_MTU) { - ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU( - ip6i->ip6i_flags, ipp->ipp_use_min_mtu); + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; } + } - if (option_exists & IPPF_NEXTHOP) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = tipp->ipp_nexthop; - } + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); + mutex_exit(&connp->conn_lock); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * tell IP this is an ip6i_t private header + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } - - /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src)); - - /* Set the hoplimit of the outgoing packet. */ - if (option_exists & IPPF_HOPLIMIT) { - /* IPV6_HOPLIMIT ancillary data overrides all other settings. 
*/ - ip6h->ip6_hops = ipp->ipp_hoplimit; - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { - ip6h->ip6_hops = udp->udp_multicast_ttl; - if (option_exists & IPPF_MULTICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else { - ip6h->ip6_hops = udp->udp_ttl; - if (option_exists & IPPF_UNICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } - - if (option_exists & IPPF_ADDR) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr)); - ip6h->ip6_src = tipp->ipp_addr; - } else { + error = ENETUNREACH; + goto failed; + case ENETDOWN: /* - * The source address was not set using IPV6_PKTINFO. - * First look at the bound source. - * If unspecified fallback to __sin6_src_id. + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. */ - ip6h->ip6_src = udp->udp_v6src; - if (sin6->__sin6_src_id != 0 && - IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ip_srcid_find_id(sin6->__sin6_src_id, - &ip6h->ip6_src, connp->conn_zoneid, - us->us_netstack); + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } + /* FALLTHRU */ + failed: + default: + goto ud_error; } - nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; - cp = (uint8_t *)&ip6h[1]; /* - * Here's where we have to start stringing together - * any extension headers in the right order: - * Hop-by-hop, destination, routing, and final destination opts. + * Cluster note: we let the cluster hook know that we are sending to a + * new address and/or port. 
*/ - if (option_exists & IPPF_HOPOPTS) { - /* Hop-by-hop options */ - ip6_hbh_t *hbh = (ip6_hbh_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS); - if (hopoptslen == 0) { - hopoptsptr = tipp->ipp_hopopts; - hopoptslen = tipp->ipp_hopoptslen; - is_ancillary = B_TRUE; - } - - *nxthdr_ptr = IPPROTO_HOPOPTS; - nxthdr_ptr = &hbh->ip6h_nxt; - - bcopy(hopoptsptr, cp, hopoptslen); - cp += hopoptslen; - - if (hopoptsptr != NULL && !is_ancillary) { - kmem_free(hopoptsptr, hopoptslen); - hopoptsptr = NULL; - hopoptslen = 0; + if (cl_inet_connect2 != NULL) { + CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error); + if (error != 0) { + error = EHOSTUNREACH; + goto ud_error; } } - /* - * En-route destination options - * Only do them if there's a routing header as well - */ - if (option_exists & IPPF_RTDSTOPTS) { - ip6_dest_t *dst = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS); - - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dst->ip6d_nxt; - bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen); - cp += tipp->ipp_rtdstoptslen; - } - /* - * Routing header next - */ - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rt = (ip6_rthdr_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR); - - *nxthdr_ptr = IPPROTO_ROUTING; - nxthdr_ptr = &rt->ip6r_nxt; - - bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen); - cp += tipp->ipp_rthdrlen; - } + mutex_enter(&connp->conn_lock); /* - * Do ultimate destination options + * While we dropped the lock some other thread might have connected + * this socket. If so we bail out with EISCONN to ensure that the + * connecting thread is the one that updates conn_ixa, conn_ht_* + * and conn_*last*. 
*/ - if (option_exists & IPPF_DSTOPTS) { - ip6_dest_t *dest = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS); - - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dest->ip6d_nxt; - - bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen); - cp += tipp->ipp_dstoptslen; + if (udp->udp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; } - /* - * Now set the last header pointer to the proto passed in - */ - ASSERT((int)(cp - (uint8_t *)ip6i) == (udp_ip_hdr_len - UDPH_SIZE)); - *nxthdr_ptr = IPPROTO_UDP; - - /* Update UDP header */ - udph = (udpha_t *)((uchar_t *)ip6i + udp_ip_hdr_len - UDPH_SIZE); - udph->uha_dst_port = sin6->sin6_port; - udph->uha_src_port = udp->udp_port; /* - * Copy in the destination address + * We need to rebuild the headers if + * - we are labeling packets (could be different for different + * destinations) + * - we have a source route (or routing header) since we need to + * massage that to get the pseudo-header checksum + * - the IP version is different than the last time + * - a socket option with COA_HEADER_CHANGED has been set which + * set conn_v6lastdst to zero. + * + * Otherwise the prepend function will just update the src, dst, + * dstport, and flow label. */ - ip6h->ip6_dst = ip6_dst; - - ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - if (option_exists & IPPF_TCLASS) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS); - ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, - tipp->ipp_tclass); - } - rw_exit(&udp->udp_rwlock); - - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rth; - + if (is_system_labeled()) { + /* TX MLP requires SCM_UCRED and don't have that here */ + if (connp->conn_mlp_type != mlptSingle) { + mutex_exit(&connp->conn_lock); + error = ECONNREFUSED; + goto ud_error; + } /* - * Perform any processing needed for source routing. 
- * We know that all extension headers will be in the same mblk - * as the IPv6 header. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: Since we hold conn_lock we know no other + * thread manipulates conn_xmit_ipp. */ - rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr); - if (rth != NULL && rth->ip6r_segleft != 0) { - if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) { - /* - * Drop packet - only support Type 0 routing. - * Notify the application as well. - */ - *error = EPROTO; - goto done; - } - - /* - * rth->ip6r_len is twice the number of - * addresses in the header. Thus it must be even. - */ - if (rth->ip6r_len & 0x1) { - *error = EPROTO; - goto done; - } - /* - * Shuffle the routing header and ip6_dst - * addresses, and get the checksum difference - * between the first hop (in ip6_dst) and - * the destination (in the last routing hdr entry). - */ - csum = ip_massage_options_v6(ip6h, rth, - us->us_netstack); - /* - * Verify that the first hop isn't a mapped address. - * Routers along the path need to do this verification - * for subsequent hops. 
- */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - *error = EADDRNOTAVAIL; - goto done; + error = conn_update_label(connp, ixa, &v6dst, + &connp->conn_xmit_ipp); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + /* Rebuild the header template */ + error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else if ((connp->conn_xmit_ipp.ipp_fields & + (IPPF_IPV4_OPTIONS|IPPF_RTHDR)) || + ipversion != connp->conn_lastipversion || + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { + /* Rebuild the header template */ + error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else { + /* Simply update the destination address if no source route */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; + + IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; } - - cp += (rth->ip6r_len + 1)*8; + } else { + ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; + ip6h->ip6_dst = v6dst; } } - /* count up length of UDP packet */ - ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN; - if ((mp2 = mp1->b_cont) != NULL) { - do { - ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); - ip_len += (uint32_t)MBLKL(mp2); - } while ((mp2 = mp2->b_cont) != NULL); - } - /* - * If the size of the packet is greater than the maximum allowed by - * ip, return an error. Passing this down could cause panics because - * the size will have wrapped and be inconsistent with the msg size. - */ - if (ip_len > IP_MAXPACKET) { - *error = EMSGSIZE; - goto done; - } - - /* Store the UDP length. 
Subtract length of extension hdrs */ - udph->uha_length = htons(ip_len + IPV6_HDR_LEN - - (int)((uchar_t *)udph - (uchar_t *)ip6h)); - - /* - * We make it easy for IP to include our pseudo header - * by putting our length in uh_checksum, modified (if - * we have a routing header) by the checksum difference - * between the ultimate destination and first hop addresses. - * Note: UDP over IPv6 must always checksum the packet. + * Remember the dst/dstport etc which corresponds to the built header + * template and conn_ixa. */ - csum += udph->uha_length; - csum = (csum & 0xFFFF) + (csum >> 16); - udph->uha_checksum = (uint16_t)csum; - -#ifdef _LITTLE_ENDIAN - ip_len = htons(ip_len); -#endif - ip6h->ip6_plen = ip_len; - - if (DB_TYPE(mp) != M_DATA) { - cred_t *cr; - pid_t cpid; - - /* Move any cred from the T_UNITDATA_REQ to the packet */ - cr = msg_extractcred(mp, &cpid); - if (cr != NULL) { - if (mp1->b_datap->db_credp != NULL) - crfree(mp1->b_datap->db_credp); - mp1->b_datap->db_credp = cr; - mp1->b_datap->db_cpid = cpid; - } + oldixa = conn_replace_ixa(connp, ixa); + connp->conn_v6lastdst = v6dst; + connp->conn_lastipversion = ipversion; + connp->conn_lastdstport = dstport; + connp->conn_lastflowinfo = flowinfo; + connp->conn_lastscopeid = ixa->ixa_scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + + data_mp = udp_prepend_header_template(connp, ixa, data_mp, &v6src, + dstport, flowinfo, &error); + + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + ixa_refrele(oldixa); - ASSERT(mp != mp1); - freeb(mp); + if (data_mp == NULL) { + ASSERT(error != 0); + goto ud_error; } - /* mp has been consumed and we'll return success */ - ASSERT(*error == 0); - mp = NULL; - - /* We're done. Pass the packet to IP */ + /* We're done. Pass the packet to ip. 
*/ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - ip_output_v6(connp, mp1, q, IP_WPUT); -done: - if (sth_wroff != 0) { - (void) proto_set_tx_wroff(RD(q), connp, - udp->udp_max_hdr_len + us->us_wroff_extra); - } - if (hopoptsptr != NULL && !is_ancillary) { - kmem_free(hopoptsptr, hopoptslen); - hopoptsptr = NULL; - } - if (*error != 0) { - ASSERT(mp != NULL); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); - } - return (mp); -} - - -static int -i_udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - if (udp->udp_state != TS_DATA_XFER) - return (ENOTCONN); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_dstport; - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst); + error = conn_ip_output(data_mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_dstport; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_flowinfo = udp->udp_flowinfo; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; break; - } - - return (0); -} - -static int -udp_getmyname(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_port; - + case EADDRNOTAVAIL: /* - * If 
udp_v6src is unspecified, we might be bound to broadcast - * / multicast. Use udp_bound_v6src as local address instead - * (that could also still be unspecified). + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src); - } else { - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_bound_v6src); - } - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_port; - sin6->sin6_flowinfo = udp->udp_flowinfo; - + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); /* - * If udp_v6src is unspecified, we might be bound to broadcast - * / multicast. Use udp_bound_v6src as local address instead - * (that could also still be unspecified). + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. */ - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) - sin6->sin6_addr = udp->udp_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; else - sin6->sin6_addr = udp->udp_bound_v6src; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); break; } + ixa_refrele(ixa); + return (error); - return (0); +ud_error: + if (ixa != NULL) + ixa_refrele(ixa); + + freemsg(data_mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_STAT(us, udp_out_err_output); + return (error); +} + +/* ARGSUSED */ +static void +udp_wput_fallback(queue_t *wq, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n"); +#endif + freemsg(mp); } + /* * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
*/ @@ -6717,7 +4440,8 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) { void *data; mblk_t *datamp = mp->b_cont; - udp_t *udp = Q_TO_UDP(q); + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { @@ -6727,19 +4451,23 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) } data = datamp->b_rptr; - rw_enter(&udp->udp_rwlock, RW_READER); + mutex_enter(&connp->conn_lock); switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = i_udp_getpeername(udp, data, &cmdp->cb_len); + if (udp->udp_state != TS_DATA_XFER) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); break; case TI_GETMYNAME: - cmdp->cb_error = udp_getmyname(udp, data, &cmdp->cb_len); + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); break; default: cmdp->cb_error = EINVAL; break; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); qreply(q, mp); } @@ -6747,10 +4475,11 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) static void udp_use_pure_tpi(udp_t *udp) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_issocket = B_FALSE; - rw_exit(&udp->udp_rwlock); + conn_t *connp = udp->udp_connp; + mutex_enter(&connp->conn_lock); + udp->udp_issocket = B_FALSE; + mutex_exit(&connp->conn_lock); UDP_STAT(udp->udp_us, udp_sock_fallback); } @@ -6758,20 +4487,13 @@ static void udp_wput_other(queue_t *q, mblk_t *mp) { uchar_t *rptr = mp->b_rptr; - struct datab *db; struct iocblk *iocp; - cred_t *cr; conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; - udp_stack_t *us; - - TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START, - "udp_wput_other_start: q %p", q); - - us = udp->udp_us; - db = mp->b_datap; + udp_stack_t *us = udp->udp_us; + cred_t *cr; - switch (db->db_type) { + switch (mp->b_datap->db_type) { case M_CMD: udp_wput_cmdblk(q, mp); return; @@ -6779,37 +4501,29 @@ udp_wput_other(queue_t *q, mblk_t *mp) case M_PROTO: case M_PCPROTO: if (mp->b_wptr - 
rptr < sizeof (t_scalar_t)) { + /* + * If the message does not contain a PRIM_type, + * throw it away. + */ freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "protoshort"); return; } switch (((t_primp_t)rptr)->type) { case T_ADDR_REQ: udp_addr_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "addrreq"); return; case O_T_BIND_REQ: case T_BIND_REQ: udp_tpi_bind(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "bindreq"); return; case T_CONN_REQ: udp_tpi_connect(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "connreq"); return; case T_CAPABILITY_REQ: udp_capability_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "capabreq"); return; case T_INFO_REQ: udp_info_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "inforeq"); return; case T_UNITDATA_REQ: /* @@ -6817,14 +4531,10 @@ udp_wput_other(queue_t *q, mblk_t *mp) * be bad. Valid T_UNITDATA_REQs are handled * in udp_wput. 
*/ - udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "unitdatareq"); + udp_ud_err(q, mp, EADDRNOTAVAIL); return; case T_UNBIND_REQ: udp_tpi_unbind(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "unbindreq"); return; case T_SVR4_OPTMGMT_REQ: /* @@ -6842,11 +4552,8 @@ udp_wput_other(queue_t *q, mblk_t *mp) } if (!snmpcom_req(q, mp, udp_snmp_set, ip_snmp_get, cr)) { - (void) svr4_optcom_req(q, - mp, cr, &udp_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &udp_opt_obj); } - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); return; case T_OPTMGMT_REQ: @@ -6863,34 +4570,24 @@ udp_wput_other(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj, B_TRUE); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); + tpi_optcom_req(q, mp, cr, &udp_opt_obj); return; case T_DISCON_REQ: udp_tpi_disconnect(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "disconreq"); return; /* The following TPI message is not supported by udp. */ case O_T_CONN_RES: case T_CONN_RES: udp_err_ack(q, mp, TNOTSUPPORT, 0); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "connres/disconreq"); return; - /* The following 3 TPI messages are illegal for udp. */ + /* The following 3 TPI requests are illegal for udp. 
*/ case T_DATA_REQ: case T_EXDATA_REQ: case T_ORDREL_REQ: udp_err_ack(q, mp, TNOTSUPPORT, 0); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "data/exdata/ordrel"); return; default: break; @@ -6914,13 +4611,10 @@ udp_wput_other(queue_t *q, mblk_t *mp) iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; qreply(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "getpeername"); return; } /* FALLTHRU */ - case TI_GETMYNAME: { + case TI_GETMYNAME: /* * For TI_GETPEERNAME and TI_GETMYNAME, we first * need to copyin the user's strbuf structure. @@ -6929,17 +4623,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) */ mi_copyin(q, mp, NULL, SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "getmyname"); return; - } case ND_SET: /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, us->us_nd, mp)) { qreply(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "get"); return; } break; @@ -6969,16 +4658,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) break; case M_IOCDATA: udp_wput_iocdata(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "iocdata"); return; default: /* Unrecognized messages are passed through without change. */ break; } - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "end"); - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); } /* @@ -6991,9 +4676,9 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) mblk_t *mp1; struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - udp_t *udp = Q_TO_UDP(q); - int error; uint_t addrlen; + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; /* Make sure it is one of ours. 
*/ switch (iocp->ioc_cmd) { @@ -7001,7 +4686,7 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - ip_output(udp->udp_connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } @@ -7040,77 +4725,45 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) * address and then we'll copyout the strbuf. */ STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - addrlen = udp->udp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + if (STRUCT_FGET(sb, maxlen) < addrlen) { mi_copy_done(q, mp, EINVAL); return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - - if (mp1 == NULL) - return; - - rw_enter(&udp->udp_rwlock, RW_READER); switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = udp_do_getsockname(udp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = udp_do_getpeername(udp, (void *)mp1->b_rptr, &addrlen); + if (udp->udp_state != TS_DATA_XFER) { + mi_copy_done(q, mp, ENOTCONN); + return; + } break; } - rw_exit(&udp->udp_rwlock); - - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); - } -} - -static int -udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, - udpattrs_t *udpattrs) -{ - struct T_unitdata_req *udreqp; - int is_absreq_failure; - cred_t *cr; - - ASSERT(((t_primp_t)mp->b_rptr)->type); - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we should ASSERT. - * However, RPC (svc_clts_ksend) does this odd thing where it - * passes the options from a T_UNITDATA_IND unchanged in a - * T_UNITDATA_REQ. While that is the right thing to do for - * some options, SCM_UCRED being the key one, this also makes it - * pass down IP_RECVDSTADDR. Hence we can't ASSERT here. 
- */ - cr = msg_getcred(mp, NULL); - if (cr == NULL) { - cr = Q_TO_CONN(q)->conn_cred; - } - udreqp = (struct T_unitdata_req *)mp->b_rptr; - - *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, - udreqp->OPT_offset, cr, &udp_opt_obj, - udpattrs, &is_absreq_failure); + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; - if (*errorp != 0) { - /* - * Note: No special action needed in this - * module for "is_absreq_failure" - */ - return (-1); /* failure */ + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; } - ASSERT(is_absreq_failure == 0); - return (0); /* success */ + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } void @@ -7234,34 +4887,19 @@ udp_kstat2_init(netstackid_t stackid, udp_stat_t *us_statisticsp) kstat_t *ksp; udp_stat_t template = { - { "udp_ip_send", KSTAT_DATA_UINT64 }, - { "udp_ip_ire_send", KSTAT_DATA_UINT64 }, - { "udp_ire_null", KSTAT_DATA_UINT64 }, { "udp_sock_fallback", KSTAT_DATA_UINT64 }, - { "udp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "udp_out_opt", KSTAT_DATA_UINT64 }, { "udp_out_err_notconn", KSTAT_DATA_UINT64 }, { "udp_out_err_output", KSTAT_DATA_UINT64 }, { "udp_out_err_tudr", KSTAT_DATA_UINT64 }, - { "udp_in_pktinfo", KSTAT_DATA_UINT64 }, - { "udp_in_recvdstaddr", KSTAT_DATA_UINT64 }, - { "udp_in_recvopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvif", KSTAT_DATA_UINT64 }, - { "udp_in_recvslla", KSTAT_DATA_UINT64 }, - { "udp_in_recvucred", KSTAT_DATA_UINT64 }, - { "udp_in_recvttl", KSTAT_DATA_UINT64 }, - { "udp_in_recvhopopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvhoplimit", KSTAT_DATA_UINT64 }, - { "udp_in_recvdstopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvrtdstopts", 
KSTAT_DATA_UINT64 }, - { "udp_in_recvrthdr", KSTAT_DATA_UINT64 }, - { "udp_in_recvpktinfo", KSTAT_DATA_UINT64 }, - { "udp_in_recvtclass", KSTAT_DATA_UINT64 }, - { "udp_in_timestamp", KSTAT_DATA_UINT64 }, #ifdef DEBUG { "udp_data_conn", KSTAT_DATA_UINT64 }, { "udp_data_notconn", KSTAT_DATA_UINT64 }, + { "udp_out_lastdst", KSTAT_DATA_UINT64 }, + { "udp_out_diffdst", KSTAT_DATA_UINT64 }, + { "udp_out_ipv6", KSTAT_DATA_UINT64 }, + { "udp_out_mapped", KSTAT_DATA_UINT64 }, + { "udp_out_ipv4", KSTAT_DATA_UINT64 }, #endif }; @@ -7384,8 +5022,6 @@ udp_set_rcv_hiwat(udp_t *udp, size_t size) static void udp_lrput(queue_t *q, mblk_t *mp) { - mblk_t *mp1; - switch (mp->b_datap->db_type) { case M_FLUSH: /* Turn around */ @@ -7396,9 +5032,6 @@ udp_lrput(queue_t *q, mblk_t *mp) } break; } - /* Could receive messages that passed through ar_rput */ - for (mp1 = mp; mp1; mp1 = mp1->b_cont) - mp1->b_prev = mp1->b_next = NULL; freemsg(mp); } @@ -7425,6 +5058,7 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) zoneid_t zoneid; netstack_t *ns; udp_stack_t *us; + int len; ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); @@ -7455,34 +5089,40 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) */ netstack_rele(ns); - rw_enter(&udp->udp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_UDP); + /* + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + ASSERT(connp->conn_proto == IPPROTO_UDP); ASSERT(connp->conn_udp == udp); ASSERT(udp->udp_connp == connp); /* Set the initial state of the stream and the privilege status. 
*/ udp->udp_state = TS_UNBND; + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; if (isv6) { - udp->udp_family = AF_INET6; - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - udp->udp_ttl = us->us_ipv6_hoplimit; - connp->conn_af_isv6 = B_TRUE; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_default_ttl = us->us_ipv6_hoplimit; + len = sizeof (ip6_t) + UDPH_SIZE; } else { - udp->udp_family = AF_INET; - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE; - udp->udp_ttl = us->us_ipv4_ttl; - connp->conn_af_isv6 = B_FALSE; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; + connp->conn_default_ttl = us->us_ipv4_ttl; + len = sizeof (ipha_t) + UDPH_SIZE; } - udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - udp->udp_pending_op = -1; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - connp->conn_zoneid = zoneid; + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; - udp->udp_open_time = lbolt64; - udp->udp_open_pid = curproc->p_pid; + connp->conn_zoneid = zoneid; /* * If the caller has the process-wide flag set, then default to MAC @@ -7491,22 +5131,38 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_ulp_labeled = is_system_labeled(); + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); udp->udp_us = us; + connp->conn_rcvbuf = us->us_recv_hiwat; + connp->conn_sndbuf = us->us_xmit_hiwat; + 
connp->conn_sndlowat = us->us_xmit_lowat; + connp->conn_rcvlowat = udp_mod_info.mi_lowat; + + connp->conn_wroff = len + us->us_wroff_extra; + connp->conn_so_type = SOCK_DGRAM; + connp->conn_recv = udp_input; + connp->conn_recvicmp = udp_icmp_input; crhold(credp); connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; - rw_exit(&udp->udp_rwlock); + if (us->us_pmtu_discovery) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; return (connp); } -/* ARGSUSED */ sock_lower_handle_t udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) @@ -7539,39 +5195,17 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, ASSERT(us != NULL); udp->udp_issocket = B_TRUE; - connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET; - - /* Set flow control */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - (void) udp_set_rcv_hiwat(udp, us->us_recv_hiwat); - udp->udp_rcv_disply_hiwat = us->us_recv_hiwat; - udp->udp_rcv_lowat = udp_mod_info.mi_lowat; - udp->udp_xmit_hiwat = us->us_xmit_hiwat; - udp->udp_xmit_lowat = us->us_xmit_lowat; - - if (udp->udp_family == AF_INET6) { - /* Build initial header template for transmit */ - if ((*errorp = udp_build_hdrs(udp)) != 0) { - rw_exit(&udp->udp_rwlock); - ipcl_conn_destroy(connp); - return (NULL); - } - } - rw_exit(&udp->udp_rwlock); + connp->conn_flags |= IPCL_NONSTR; - connp->conn_flow_cntrld = B_FALSE; - - ASSERT(us->us_ldi_ident != NULL); - - if ((*errorp = ip_create_helper_stream(connp, us->us_ldi_ident)) != 0) { - ip1dbg(("udp_create: create of IP helper stream failed\n")); - udp_do_close(connp); - return (NULL); - } + /* + * 
Set flow control + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + (void) udp_set_rcv_hiwat(udp, connp->conn_rcvbuf); + udp->udp_rcv_disply_hiwat = connp->conn_rcvbuf; - /* Set the send flow control */ - connp->conn_wq->q_hiwat = us->us_xmit_hiwat; - connp->conn_wq->q_lowat = us->us_xmit_lowat; + connp->conn_flow_cntrld = B_FALSE; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -7583,14 +5217,12 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return ((sock_lower_handle_t)connp); } -/* ARGSUSED */ +/* ARGSUSED3 */ void udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; struct sock_proto_props sopp; /* All Solaris components should pass a cred for this operation. */ @@ -7599,14 +5231,15 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, connp->conn_upcalls = sock_upcalls; connp->conn_upper_handle = sock_handle; - sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; - sopp.sopp_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + sopp.sopp_wroff = connp->conn_wroff; sopp.sopp_maxblk = INFPSZ; - sopp.sopp_rxhiwat = udp->udp_rcv_hiwat; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxaddrlen = sizeof (sin6_t); sopp.sopp_maxpsz = - (udp->udp_family == AF_INET) ? UDP_MAXPACKET_IPV4 : + (connp->conn_family == AF_INET) ? UDP_MAXPACKET_IPV4 : UDP_MAXPACKET_IPV6; sopp.sopp_minpsz = (udp_mod_info.mi_minpsz == 1) ? 
0 : udp_mod_info.mi_minpsz; @@ -7618,9 +5251,32 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, static void udp_do_close(conn_t *connp) { + udp_t *udp; + ASSERT(connp != NULL && IPCL_IS_UDP(connp)); + udp = connp->conn_udp; + + if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) { + /* + * Running in cluster mode - register unbind information + */ + if (connp->conn_ipversion == IPV4_VERSION) { + (*cl_inet_unbind)( + connp->conn_netstack->netstack_stackid, + IPPROTO_UDP, AF_INET, + (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)), + (in_port_t)connp->conn_lport, NULL); + } else { + (*cl_inet_unbind)( + connp->conn_netstack->netstack_stackid, + IPPROTO_UDP, AF_INET6, + (uint8_t *)&(connp->conn_laddr_v6), + (in_port_t)connp->conn_lport, NULL); + } + } + + udp_bind_hash_remove(udp, B_FALSE); - udp_quiesce_conn(connp); ip_quiesce_conn(connp); if (!IPCL_IS_NONSTR(connp)) { @@ -7642,6 +5298,7 @@ udp_do_close(conn_t *connp) * future. */ ASSERT(connp->conn_ref == 1); + if (!IPCL_IS_NONSTR(connp)) { inet_minor_free(connp->conn_minor_arena, connp->conn_dev); } else { @@ -7652,7 +5309,7 @@ udp_do_close(conn_t *connp) ipcl_conn_destroy(connp); } -/* ARGSUSED */ +/* ARGSUSED1 */ int udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) { @@ -7671,59 +5328,41 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, { sin_t *sin; sin6_t *sin6; - sin6_t sin6addr; + udp_t *udp = connp->conn_udp; + int error = 0; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ in_port_t port; /* Host byte order */ in_port_t requested_port; /* Host byte order */ int count; + ipaddr_t v4src; /* Set if AF_INET */ in6_addr_t v6src; int loopmax; udp_fanout_t *udpf; in_port_t lport; /* Network byte order */ - udp_t *udp; + uint_t scopeid = 0; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; boolean_t is_inaddr_any; mlp_type_t addrtype, mlptype; - udp_stack_t *us; - int error 
= 0; - mblk_t *mp = NULL; - - udp = connp->conn_udp; - us = udp->udp_us; - - if (udp->udp_state != TS_UNBND) { - (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "udp_bind: bad state, %u", udp->udp_state); - return (-TOUTSTATE); - } + udp_stack_t *us = udp->udp_us; switch (len) { - case 0: - if (udp->udp_family == AF_INET) { - sin = (sin_t *)&sin6addr; - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = INADDR_ANY; - udp->udp_ipversion = IPV4_VERSION; - } else { - ASSERT(udp->udp_family == AF_INET6); - sin6 = (sin6_t *)&sin6addr; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - V6_SET_ZERO(sin6->sin6_addr); - udp->udp_ipversion = IPV6_VERSION; - } - port = 0; - break; - case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; if (sin == NULL || !OK_32PTR((char *)sin)) return (EINVAL); - if (udp->udp_family != AF_INET || + if (connp->conn_family != AF_INET || sin->sin_family != AF_INET) { return (EAFNOSUPPORT); } + v4src = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, + B_TRUE); + } port = ntohs(sin->sin_port); break; @@ -7733,10 +5372,28 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, if (sin6 == NULL || !OK_32PTR((char *)sin6)) return (EINVAL); - if (udp->udp_family != AF_INET6 || + if (connp->conn_family != AF_INET6 || sin6->sin6_family != AF_INET6) { return (EAFNOSUPPORT); } + v6src = sin6->sin6_addr; + if (IN6_IS_ADDR_V4MAPPED(&v6src)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6src)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6src, + zoneid, ipst, B_TRUE, scopeid); + } + } port = ntohs(sin6->sin6_port); break; @@ -7746,6 
+5403,10 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (-TBADADDR); } + /* Is the local address a valid unicast, multicast, or broadcast? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + requested_port = port; if (requested_port == 0 || !bind_to_req_port_only) @@ -7759,7 +5420,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * doesn't care which port number we bind to. Get one in the * valid range. */ - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = udp_get_next_priv_port(udp); } else { port = udp_update_next_port(udp, @@ -7798,53 +5459,45 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * TPI primitives only 1 at a time and wait for the response before * sending the next primitive. */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_UNBND) { + mutex_exit(&connp->conn_lock); (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: bad state, %u", udp->udp_state); return (-TOUTSTATE); } - /* XXX how to remove the T_BIND_REQ? Should set it before calling */ - udp->udp_pending_op = T_BIND_REQ; /* * Copy the source address into our udp structure. This address * may still be zero; if so, IP will fill in the correct address * each time an outbound packet is passed to it. 
Since the udp is * not yet in the bind hash list, we don't grab the uf_lock to - * change udp_ipversion + * change conn_ipversion */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { ASSERT(sin != NULL); - ASSERT(udp->udp_ipversion == IPV4_VERSION); - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src); + ASSERT(connp->conn_ixa->ixa_flags & IXAF_IS_IPV4); } else { - ASSERT(sin6 != NULL); - v6src = sin6->sin6_addr; if (IN6_IS_ADDR_V4MAPPED(&v6src)) { /* - * no need to hold the uf_lock to set the udp_ipversion + * no need to hold the uf_lock to set the conn_ipversion * since we are not yet in the fanout list */ - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; } else { - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; } } /* - * If udp_reuseaddr is not set, then we have to make sure that + * If conn_reuseaddr is not set, then we have to make sure that * the IP address and port number the application requested * (or we selected for the application) is not being used by * another stream. If another stream is already using the * requested IP address and port, the behavior depends on * "bind_to_req_port_only". If set the bind fails; otherwise we - * search for any an unused port to bind to the the stream. + * search for any an unused port to bind to the stream. 
* * As per the BSD semantics, as modified by the Deering multicast * changes, if udp_reuseaddr is set, then we allow multiple binds @@ -7860,7 +5513,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ count = 0; - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { /* * loopmax = (IPPORT_RESERVED-1) - * us->us_min_anonpriv_port + 1 @@ -7876,6 +5529,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, for (;;) { udp_t *udp1; boolean_t found_exclbind = B_FALSE; + conn_t *connp1; /* * Walk through the list of udp streams bound to @@ -7887,7 +5541,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mutex_enter(&udpf->uf_lock); for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { - if (lport != udp1->udp_port) + connp1 = udp1->udp_connp; + + if (lport != connp1->conn_lport) continue; /* @@ -7896,7 +5552,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. */ - if (!IPCL_BIND_ZONE_MATCH(udp1->udp_connp, connp)) + if (!IPCL_BIND_ZONE_MATCH(connp1, connp)) continue; /* @@ -7918,12 +5574,13 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * For labeled systems, SO_MAC_EXEMPT behaves the same * as UDP_EXCLBIND, except that zoneid is ignored. */ - if (udp1->udp_exclbind || udp->udp_exclbind || + if (connp1->conn_exclbind || connp->conn_exclbind || IPCL_CONNS_MAC(udp1->udp_connp, connp)) { if (V6_OR_V4_INADDR_ANY( - udp1->udp_bound_v6src) || + connp1->conn_bound_addr_v6) || is_inaddr_any || - IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + IN6_ARE_ADDR_EQUAL( + &connp1->conn_bound_addr_v6, &v6src)) { found_exclbind = B_TRUE; break; @@ -7935,7 +5592,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * Check ipversion to allow IPv4 and IPv6 sockets to * have disjoint port number spaces. 
*/ - if (udp->udp_ipversion != udp1->udp_ipversion) { + if (connp->conn_ipversion != connp1->conn_ipversion) { /* * On the first time through the loop, if the @@ -7963,8 +5620,8 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * (non-wildcard, also), keep going. */ if (!is_inaddr_any && - !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) && - !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + !V6_OR_V4_INADDR_ANY(connp1->conn_bound_addr_v6) && + !IN6_ARE_ADDR_EQUAL(&connp1->conn_laddr_v6, &v6src)) { continue; } @@ -7972,7 +5629,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } if (!found_exclbind && - (udp->udp_reuseaddr && requested_port != 0)) { + (connp->conn_reuseaddr && requested_port != 0)) { break; } @@ -7995,12 +5652,11 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * the routine (and exit the loop). * */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); return (-TADDRBUSY); } - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = udp_get_next_priv_port(udp); } else { if ((count == 0) && (requested_port != 0)) { @@ -8025,66 +5681,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * there are none available, so send an error * to the user. */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); return (-TNOADDR); } } /* * Copy the source address into our udp structure. This address - * may still be zero; if so, ip will fill in the correct address - * each time an outbound packet is passed to it. + * may still be zero; if so, ip_attr_connect will fill in the correct + * address when a packet is about to be sent. * If we are binding to a broadcast or multicast address then - * udp_post_ip_bind_connect will clear the source address - * when udp_do_bind success. 
+ * we just set the conn_bound_addr since we don't want to use + * that as the source address when sending. */ - udp->udp_v6src = udp->udp_bound_v6src = v6src; - udp->udp_port = lport; + connp->conn_bound_addr_v6 = v6src; + connp->conn_laddr_v6 = v6src; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + connp->conn_saddr_v6 = v6src; + connp->conn_mcbc_bind = B_FALSE; + break; + case IPVL_MCAST: + case IPVL_BCAST: + /* ip_set_destination will pick a source address later */ + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_TRUE; + break; + } + + /* Any errors after this point should use late_error */ + connp->conn_lport = lport; + /* - * Now reset the the next anonymous port if the application requested + * Now reset the next anonymous port if the application requested * an anonymous port, or we handed out the next anonymous port. */ - if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) { + if ((requested_port == 0) && (!connp->conn_anon_priv_bind)) { us->us_next_port_to_try = port + 1; } - /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. */ - if (udp->udp_family == AF_INET) { - sin->sin_port = udp->udp_port; + /* Initialize the T_BIND_ACK. 
*/ + if (connp->conn_family == AF_INET) { + sin->sin_port = connp->conn_lport; } else { - sin6->sin6_port = udp->udp_port; - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - mutex_exit(&udpf->uf_lock); - return (error); - } + sin6->sin6_port = connp->conn_lport; } udp->udp_state = TS_IDLE; udp_bind_hash_insert(udpf, udp); mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); if (cl_inet_bind) { /* * Running in cluster mode - register bind information */ - if (udp->udp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { (*cl_inet_bind)(connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); + IPPROTO_UDP, AF_INET, (uint8_t *)&v4src, + (in_port_t)connp->conn_lport, NULL); } else { (*cl_inet_bind)(connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port, NULL); + IPPROTO_UDP, AF_INET6, (uint8_t *)&v6src, + (in_port_t)connp->conn_lport, NULL); } } + mutex_enter(&connp->conn_lock); connp->conn_anon_port = (is_system_labeled() && requested_port == 0); if (is_system_labeled() && (!connp->conn_anon_port || connp->conn_anon_mlp)) { @@ -8092,18 +5764,16 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, zone_t *zone; zone = crgetzone(cr); - connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth : + connp->conn_mlp_type = + connp->conn_recv_ancillary.crb_recvucred ? mlptBoth : mlptSingle; addrtype = tsol_mlp_addr_type( connp->conn_allzones ? 
ALL_ZONES : zone->zone_id, IPV6_VERSION, &v6src, us->us_netstack->netstack_ip); if (addrtype == mlptSingle) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TNOADDR); + error = -TNOADDR; + mutex_exit(&connp->conn_lock); + goto late_error; } mlpport = connp->conn_anon_port ? PMAPPORT : port; mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport, @@ -8115,12 +5785,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ if (mlptype != mlptSingle && connp->conn_mlp_type == mlptSingle) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (EINVAL); + error = EINVAL; + mutex_exit(&connp->conn_lock); + goto late_error; } /* @@ -8129,18 +5796,15 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ if (mlptype != mlptSingle && secpolicy_net_bindmlp(cr) != 0) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: no priv for multilevel port %d", mlpport); } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } /* @@ -8158,7 +5822,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mlpzone = tsol_mlp_findzone(IPPROTO_UDP, htons(mlpport)); if (connp->conn_zoneid != mlpzone) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: attempt to bind port " @@ -8167,62 +5831,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mlpport, connp->conn_zoneid, mlpzone); } - 
rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } } if (connp->conn_anon_port) { - error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + error = tsol_mlp_anon(zone, mlptype, connp->conn_proto, port, B_TRUE); if (error != 0) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: cannot establish anon " "MLP for port %d", port); } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } } connp->conn_mlp_type = mlptype; } - if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - /* - * Append a request for an IRE if udp_v6src not - * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address). - */ - mp = allocb(sizeof (ire_t), BPRI_HI); - if (!mp) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; + /* + * We create an initial header template here to make a subsequent + * sendto have a starting point. Since conn_last_dst is zero the + * first sendto will always follow the 'dst changed' code path. + * Note that we defer massaging options and the related checksum + * adjustment until we have a destination address. 
+ */ + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto late_error; } - if (udp->udp_family == AF_INET6) { - ASSERT(udp->udp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v6(connp, &mp, IPPROTO_UDP, - &udp->udp_bound_v6src, udp->udp_port, B_TRUE); - } else { - ASSERT(!udp->udp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v4(connp, &mp, IPPROTO_UDP, - V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, - B_TRUE); + /* Just in case */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + + error = ip_laddr_fanout_insert(connp); + if (error != 0) + goto late_error; + + /* Bind succeeded */ + return (0); + +late_error: + /* We had already picked the port number, and then the bind failed */ + mutex_enter(&connp->conn_lock); + udpf = &us->us_bind_fanout[ + UDP_BIND_HASH(connp->conn_lport, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } + udp->udp_state = TS_UNBND; + udp_bind_hash_remove(udp, B_TRUE); + connp->conn_lport = 0; + mutex_exit(&udpf->uf_lock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; - (void) udp_post_ip_bind_connect(udp, mp, error); + connp->conn_v6lastdst = ipv6_all_zeros; + + /* Restore the header that was built above - different source address */ + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); return (error); } @@ -8256,12 +5940,32 @@ udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, static int 
udp_implicit_bind(conn_t *connp, cred_t *cr) { + sin6_t sin6addr; + sin_t *sin; + sin6_t *sin6; + socklen_t len; int error; /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - error = udp_do_bind(connp, NULL, 0, cr, B_FALSE); + if (connp->conn_family == AF_INET) { + len = sizeof (struct sockaddr_in); + sin = (sin_t *)&sin6addr; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + } else { + ASSERT(connp->conn_family == AF_INET6); + len = sizeof (sin6_t); + sin6 = (sin6_t *)&sin6addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + V6_SET_ZERO(sin6->sin6_addr); + } + + error = udp_do_bind(connp, (struct sockaddr *)&sin6addr, len, + cr, B_FALSE); return ((error < 0) ? proto_tlitosyserr(-error) : error); } @@ -8280,137 +5984,51 @@ udp_do_unbind(conn_t *connp) /* * Running in cluster mode - register unbind information */ - if (udp->udp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { (*cl_inet_unbind)( connp->conn_netstack->netstack_stackid, IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); + (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)), + (in_port_t)connp->conn_lport, NULL); } else { (*cl_inet_unbind)( connp->conn_netstack->netstack_stackid, IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port, NULL); + (uint8_t *)&(connp->conn_laddr_v6), + (in_port_t)connp->conn_lport, NULL); } } - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + /* If a bind has not been done, we can't unbind. */ + if (udp->udp_state == TS_UNBND) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - udp->udp_pending_op = T_UNBIND_REQ; - rw_exit(&udp->udp_rwlock); - - /* - * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK - * and therefore ip_unbind must never return NULL. 
- */ - ip_unbind(connp); - - /* - * Once we're unbound from IP, the pending operation may be cleared - * here. - */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); udp_bind_hash_remove(udp, B_TRUE); - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_port = 0; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_FALSE; + connp->conn_lport = 0; + /* In case we were also connected */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; + connp->conn_v6lastdst = ipv6_all_zeros; udp->udp_state = TS_UNBND; - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - rw_exit(&udp->udp_rwlock); - return (0); -} - -static int -udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error) -{ - ire_t *ire; - udp_fanout_t *udpf; - udp_stack_t *us = udp->udp_us; - - ASSERT(udp->udp_pending_op != -1); - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (error == 0) { - /* For udp_do_connect() success */ - /* udp_do_bind() success will do nothing in here */ - /* - * If a broadcast/multicast address was bound, set - * the source address to 0. - * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * - * Note that when connecting the returned IRE is - * for the destination address and we only perform - * the broadcast check for the source address (it - * is OK to connect to a broadcast/multicast address.) 
- */ - if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { - ire = (ire_t *)ire_mp->b_rptr; + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); - /* - * Note: we get IRE_BROADCAST for IPv6 to "mark" a - * multicast local address. - */ - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - if (ire->ire_type == IRE_BROADCAST && - udp->udp_state != TS_DATA_XFER) { - ASSERT(udp->udp_pending_op == T_BIND_REQ || - udp->udp_pending_op == O_T_BIND_REQ); - /* - * This was just a local bind to a broadcast - * addr. - */ - mutex_enter(&udpf->uf_lock); - V6_SET_ZERO(udp->udp_v6src); - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } - } - } else { - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); + ip_unbind(connp); - if (udp->udp_state == TS_DATA_XFER) { - /* Connect failed */ - /* Revert back to the bound source */ - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - } else { - /* For udp_do_bind() failed */ - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_state = TS_UNBND; - udp_bind_hash_remove(udp, B_TRUE); - udp->udp_port = 0; - } - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - if (ire_mp != NULL) - freeb(ire_mp); - return (error); + return (0); } /* @@ -8418,7 +6036,7 @@ udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error) */ static int udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, - cred_t *cr) + cred_t *cr, pid_t pid) { sin6_t *sin6; sin_t *sin; @@ -8426,12 +6044,16 @@ udp_do_connect(conn_t 
*connp, const struct sockaddr *sa, socklen_t len, ipaddr_t v4dst; uint16_t dstport; uint32_t flowinfo; - mblk_t *ire_mp; udp_fanout_t *udpf; udp_t *udp, *udp1; ushort_t ipversion; udp_stack_t *us; int error; + conn_t *connp1; + ip_xmit_attr_t *ixa; + uint_t scopeid = 0; + uint_t srcid = 0; + in6_addr_t v6src = connp->conn_saddr_v6; udp = connp->conn_udp; us = udp->udp_us; @@ -8451,7 +6073,7 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, v4dst = sin->sin_addr.s_addr; dstport = sin->sin_port; IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(udp->udp_ipversion == IPV4_VERSION); + ASSERT(connp->conn_ipversion == IPV4_VERSION); ipversion = IPV4_VERSION; break; @@ -8459,13 +6081,33 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, sin6 = (sin6_t *)sa; v6dst = sin6->sin6_addr; dstport = sin6->sin6_port; + srcid = sin6->__sin6_src_id; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } if (IN6_IS_ADDR_V4MAPPED(&v6dst)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + /* + * Destination adress is mapped IPv6 address. + * Source bound address should be unspecified or + * IPv6 mapped address as well. + */ + if (!IN6_IS_ADDR_UNSPECIFIED( + &connp->conn_bound_addr_v6) && + !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { + return (EADDRNOTAVAIL); + } IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst); ipversion = IPV4_VERSION; flowinfo = 0; } else { ipversion = IPV6_VERSION; flowinfo = sin6->sin6_flowinfo; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) + scopeid = sin6->sin6_scope_id; } break; } @@ -8473,44 +6115,53 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (dstport == 0) return (-TBADADDR); - rw_enter(&udp->udp_rwlock, RW_WRITER); + /* + * If there is a different thread using conn_ixa then we get a new + * copy and cut the old one loose from conn_ixa. 
Otherwise we use + * conn_ixa and prevent any other thread from using/changing it. + * Once connect() is done other threads can use conn_ixa since the + * refcnt will be back at one. + */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); + + mutex_enter(&connp->conn_lock); /* - * This UDP must have bound to a port already before doing a connect. - * TPI mandates that users must send TPI primitives only 1 at a time - * and wait for the response before sending the next primitive. + * This udp_t must have bound to a port already before doing a connect. + * Reject if a connect is in progress (we drop conn_lock during + * udp_do_connect). */ - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + if (udp->udp_state == TS_UNBND || udp->udp_state == TS_WCON_CREQ) { + mutex_exit(&connp->conn_lock); (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_connect: bad state, %u", udp->udp_state); + ixa_refrele(ixa); return (-TOUTSTATE); } - udp->udp_pending_op = T_CONN_REQ; - ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); - - if (ipversion == IPV4_VERSION) { - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - } else { - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; - } + ASSERT(connp->conn_lport != 0 && udp->udp_ptpbhn != NULL); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; mutex_enter(&udpf->uf_lock); if (udp->udp_state == TS_DATA_XFER) { /* Already connected - clear out state */ - udp->udp_v6src = udp->udp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; udp->udp_state = TS_IDLE; } - /* 
- * Create a default IP header with no IP options. - */ - udp->udp_dstport = dstport; - udp->udp_ipversion = ipversion; + connp->conn_fport = dstport; + connp->conn_ipversion = ipversion; if (ipversion == IPV4_VERSION) { /* * Interpret a zero destination to mean loopback. @@ -8520,29 +6171,16 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (v4dst == INADDR_ANY) { v4dst = htonl(INADDR_LOOPBACK); IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin->sin_addr.s_addr = v4dst; } else { sin6->sin6_addr = v6dst; } } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = 0; - - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * use the address of that interface as our - * source address if no source address has been set. - */ - if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY && - CLASSD(v4dst) && - udp->udp_multicast_if_addr != INADDR_ANY) { - IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr, - &udp->udp_v6src); - } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = 0; } else { - ASSERT(udp->udp_ipversion == IPV6_VERSION); + ASSERT(connp->conn_ipversion == IPV6_VERSION); /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to @@ -8552,82 +6190,133 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, v6dst = ipv6_loopback; sin6->sin6_addr = v6dst; } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = flowinfo; - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * then the ip bind logic will pick the correct source - * address (i.e. matching the outgoing multicast interface). 
- */ + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = flowinfo; + } + mutex_exit(&udpf->uf_lock); + + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + /* + * conn_connect will drop conn_lock and reacquire it. + * To prevent a send* from messing with this udp_t while the lock + * is dropped we set udp_state and clear conn_v6lastdst. + * That will make all send* fail with EISCONN. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + udp->udp_state = TS_WCON_CREQ; + + error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); + mutex_exit(&connp->conn_lock); + if (error != 0) + goto connect_failed; + + /* + * The addresses have been verified. Time to insert in + * the correct fanout list. 
+ */ + error = ipcl_conn_insert(connp); + if (error != 0) + goto connect_failed; + + mutex_enter(&connp->conn_lock); + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto connect_failed; } + udp->udp_state = TS_DATA_XFER; + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + connp->conn_lastipversion = connp->conn_ipversion; + connp->conn_lastdstport = connp->conn_fport; + connp->conn_lastflowinfo = connp->conn_flowinfo; + connp->conn_lastscopeid = scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + mutex_exit(&connp->conn_lock); + /* - * Verify that the src/port/dst/port is unique for all - * connections in TS_DATA_XFER + * We've picked a source address above. Now we can + * verify that the src/port/dst/port is unique for all + * connections in TS_DATA_XFER, skipping ourselves. 
*/ + mutex_enter(&udpf->uf_lock); for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { if (udp1->udp_state != TS_DATA_XFER) continue; - if (udp->udp_port != udp1->udp_port || - udp->udp_ipversion != udp1->udp_ipversion || - dstport != udp1->udp_dstport || - !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) || - !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) || - !(IPCL_ZONE_MATCH(udp->udp_connp, - udp1->udp_connp->conn_zoneid) || - IPCL_ZONE_MATCH(udp1->udp_connp, - udp->udp_connp->conn_zoneid))) + + if (udp1 == udp) + continue; + + connp1 = udp1->udp_connp; + if (connp->conn_lport != connp1->conn_lport || + connp->conn_ipversion != connp1->conn_ipversion || + dstport != connp1->conn_fport || + !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &connp1->conn_laddr_v6) || + !IN6_ARE_ADDR_EQUAL(&v6dst, &connp1->conn_faddr_v6) || + !(IPCL_ZONE_MATCH(connp, connp1->conn_zoneid) || + IPCL_ZONE_MATCH(connp1, connp->conn_zoneid))) continue; mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (-TBADADDR); + error = -TBADADDR; + goto connect_failed; } - if (cl_inet_connect2 != NULL) { - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, dstport, error); + CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error); if (error != 0) { mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (-TBADADDR); + error = -TBADADDR; + goto connect_failed; } } - - udp->udp_state = TS_DATA_XFER; mutex_exit(&udpf->uf_lock); - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - mutex_enter(&udpf->uf_lock); - udp->udp_state = TS_IDLE; - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - - rw_exit(&udp->udp_rwlock); + ixa_refrele(ixa); + return (0); - ire_mp->b_wptr += sizeof (ire_t); - ire_mp->b_datap->db_type = IRE_DB_REQ_TYPE; +connect_failed: + if (ixa != NULL) + ixa_refrele(ixa); + mutex_enter(&connp->conn_lock); + 
mutex_enter(&udpf->uf_lock); + udp->udp_state = TS_IDLE; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + /* In case the source address was set above */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET) { - error = ip_proto_bind_connected_v4(connp, &ire_mp, IPPROTO_UDP, - &V4_PART_OF_V6(udp->udp_v6src), udp->udp_port, - V4_PART_OF_V6(udp->udp_v6dst), udp->udp_dstport, - B_TRUE, B_TRUE, cr); - } else { - error = ip_proto_bind_connected_v6(connp, &ire_mp, IPPROTO_UDP, - &udp->udp_v6src, udp->udp_port, &udp->udp_v6dst, - &udp->udp_sticky_ipp, udp->udp_dstport, B_TRUE, B_TRUE, cr); - } + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_flowinfo = 0; - return (udp_post_ip_bind_connect(udp, ire_mp, error)); + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } -/* ARGSUSED */ static int udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) @@ -8636,6 +6325,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, udp_t *udp = connp->conn_udp; int error; boolean_t did_bind = B_FALSE; + pid_t pid = curproc->p_pid; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); @@ -8652,7 +6342,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, return (error); } - error = proto_verify_ip_addr(udp->udp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) goto done; @@ -8671,9 +6361,9 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* * set SO_DGRAM_ERRIND */ - udp->udp_dgram_errind = B_TRUE; + connp->conn_dgram_errind = B_TRUE; - error = udp_do_connect(connp, sa, len, cr); + error = udp_do_connect(connp, sa, len, cr, pid); if (error != 0 && did_bind) { int unbind_err; @@ -8702,44 +6392,33 @@ done: return (error); } -/* ARGSUSED */ int udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; conn_t *connp = (conn_t *)proto_handle; udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; int error = 0; + udp_stack_t *us = udp->udp_us; + ushort_t ipversion; + pid_t pid = curproc->p_pid; + ip_xmit_attr_t *ixa; ASSERT(DB_TYPE(mp) == M_DATA); /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - /* If labeled then sockfs should have already set db_credp */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - - /* - * If the socket is connected and no change in destination - */ - if (msg->msg_namelen == 0) { - error = udp_send_connected(connp, mp, msg, cr, curproc->p_pid); - if (error == EDESTADDRREQ) - return (error); - else - return (udp->udp_dgram_errind ? error : 0); - } - - /* - * Do an implicit bind if necessary. - */ + /* do an implicit bind if necessary */ if (udp->udp_state == TS_UNBND) { error = udp_implicit_bind(connp, cr); /* * We could be racing with an actual bind, in which case * we would see EPROTO. We cross our fingers and try - * to send. + * to connect. 
*/ if (!(error == 0 || error == EPROTO)) { freemsg(mp); @@ -8747,75 +6426,203 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, } } - rw_enter(&udp->udp_rwlock, RW_WRITER); - - if (msg->msg_name != NULL && udp->udp_state == TS_DATA_XFER) { - rw_exit(&udp->udp_rwlock); - freemsg(mp); + /* Connected? */ + if (msg->msg_name == NULL) { + if (udp->udp_state != TS_DATA_XFER) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EDESTADDRREQ); + } + if (msg->msg_controllen != 0) { + error = udp_output_ancillary(connp, NULL, NULL, mp, + NULL, msg, cr, pid); + } else { + error = udp_output_connected(connp, mp, cr, pid); + } + if (us->us_sendto_ignerr) + return (0); + else + return (error); + } + if (udp->udp_state == TS_DATA_XFER) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); + } + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)msg->msg_name; + srcid = sin6->__sin6_src_id; - if (udp->udp_delayed_error != 0) { - boolean_t match; + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. + */ - error = udp->udp_delayed_error; - match = B_FALSE; - udp->udp_delayed_error = 0; - switch (udp->udp_family) { - case AF_INET: { - /* Compare just IP address and port */ - sin_t *sin1 = (sin_t *)msg->msg_name; - sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. 
+ */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + ipversion = IPV6_VERSION; + } else { + if (connp->conn_ipv6_v6only) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } - if (msg->msg_namelen == sizeof (sin_t) && - sin1->sin_port == sin2->sin_port && - sin1->sin_addr.s_addr == sin2->sin_addr.s_addr) - match = B_TRUE; + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an + * IPv4 packet but the response would never make it + * back to the application since it is bound to a + * non-mapped address. + */ + if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } - break; + if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) { + V4_PART_OF_V6(sin6->sin6_addr) = + htonl(INADDR_LOOPBACK); + } + ipversion = IPV4_VERSION; } - case AF_INET6: { - sin6_t *sin1 = (sin6_t *)msg->msg_name; - sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr; - if (msg->msg_namelen == sizeof (sin6_t) && - sin1->sin6_port == sin2->sin6_port && - IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, - &sin2->sin6_addr)) - match = B_TRUE; - break; - } - default: - ASSERT(0); + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* calls on a + * socket. 
+ */ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; } + mutex_enter(&connp->conn_lock); + if (udp->udp_delayed_error != 0) { + sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr; - *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; + error = udp->udp_delayed_error; + udp->udp_delayed_error = 0; - if (match) { - rw_exit(&udp->udp_rwlock); - freemsg(mp); + /* Compare IP address, port, and family */ + + if (sin6->sin6_port == sin2->sin6_port && + IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &sin2->sin6_addr) && + sin6->sin6_family == sin2->sin6_family) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } + + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = udp_output_ancillary(connp, NULL, sin6, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, mp, NULL, sin6, + ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (us->us_sendto_ignerr) + return (0); + else return (error); + case AF_INET: + sin = (sin_t *)msg->msg_name; + + ipversion = IPV4_VERSION; + + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* on a socket. 
+ */ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; } - } + mutex_enter(&connp->conn_lock); + if (udp->udp_delayed_error != 0) { + sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr; - error = proto_verify_ip_addr(udp->udp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - rw_exit(&udp->udp_rwlock); + error = udp->udp_delayed_error; + udp->udp_delayed_error = 0; - if (error != 0) { - freemsg(mp); - return (error); - } + /* Compare IP address and port */ - error = udp_send_not_connected(connp, mp, - (struct sockaddr *)msg->msg_name, msg->msg_namelen, msg, cr, - curproc->p_pid); - if (error != 0) { - UDP_STAT(us, udp_out_err_output); - freemsg(mp); + if (sin->sin_port == sin2->sin_port && + sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = udp_output_ancillary(connp, sin, NULL, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, mp, sin, NULL, + ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (us->us_sendto_ignerr) + return (0); + else + return (error); + default: + return (EINVAL); } - return (udp->udp_dgram_errind ? 
error : 0); } int @@ -8854,8 +6661,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, stropt_mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)stropt_mp->b_rptr; stropt->so_flags = SO_WROFF | SO_HIWAT; - stropt->so_wroff = - (ushort_t)(udp->udp_max_hdr_len + udp->udp_us->us_wroff_extra); + stropt->so_wroff = connp->conn_wroff; stropt->so_hiwat = udp->udp_rcv_disply_hiwat; putnext(RD(q), stropt_mp); @@ -8881,9 +6687,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, faddrlen = 0; opts = 0; - if (udp->udp_dgram_errind) + if (connp->conn_dgram_errind) opts |= SO_DGRAM_ERRIND; - if (udp->udp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; (*quiesced_cb)(connp->conn_upper_handle, q, &tca, @@ -8908,9 +6714,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * No longer a streams less socket */ - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); connp->conn_flags &= ~IPCL_NONSTR; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); mutex_exit(&udp->udp_recv_lock); @@ -8919,48 +6725,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, return (0); } -static int -udp_do_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - ASSERT(udp != NULL); - - if (udp->udp_state != TS_DATA_XFER) - return (ENOTCONN); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_dstport; - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst); - break; - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_dstport; - sin6->sin6_addr = 
udp->udp_v6dst; - sin6->sin6_flowinfo = udp->udp_flowinfo; - break; - } - - return (0); -} - -/* ARGSUSED */ +/* ARGSUSED3 */ int udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) @@ -8972,104 +6737,29 @@ udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - ASSERT(udp != NULL); - - rw_enter(&udp->udp_rwlock, RW_READER); - - error = udp_do_getpeername(udp, sa, salenp); - - rw_exit(&udp->udp_rwlock); - + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_DATA_XFER) + error = ENOTCONN; + else + error = conn_getpeername(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } -static int -udp_do_getsockname(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(udp != NULL); - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - if (udp->udp_state == TS_UNBND) { - break; - } - sin->sin_port = udp->udp_port; - - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src); - } else { - /* - * INADDR_ANY - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_bound_v6src); - } - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (udp->udp_state == TS_UNBND) { - break; - } - sin6->sin6_port = udp->udp_port; - - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin6->sin6_addr = udp->udp_v6src; - } else { - /* - * UNSPECIFIED - * udp_v6src is not set, we might be bound to - * broadcast/multicast. Use udp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = udp->udp_bound_v6src; - } - } - return (0); -} - -/* ARGSUSED */ +/* ARGSUSED3 */ int udp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); - ASSERT(udp != NULL); - rw_enter(&udp->udp_rwlock, RW_READER); - - error = udp_do_getsockname(udp, sa, salenp); - - rw_exit(&udp->udp_rwlock); - + mutex_enter(&connp->conn_lock); + error = conn_getsockname(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } @@ -9078,7 +6768,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, void *optvalp, socklen_t *optlen, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; t_uscalar_t max_optbuf_len; void *optvalp_buf; @@ -9090,7 +6779,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt, - udp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) @@ -9099,28 +6787,22 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, } optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); - rw_enter(&udp->udp_rwlock, RW_READER); len = udp_opt_get(connp, level, option_name, optvalp_buf); - rw_exit(&udp->udp_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, - optvalp, optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } int @@ -9128,7 +6810,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, const void *optvalp, socklen_t optlen, 
cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; /* All Solaris components should pass a cred for this operation. */ @@ -9137,7 +6818,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt, - udp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -9146,19 +6826,11 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, return (error); } - rw_enter(&udp->udp_rwlock, RW_WRITER); error = udp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, NULL, cr); - rw_exit(&udp->udp_rwlock); - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } + ASSERT(error >= 0); return (error); } @@ -9174,7 +6846,7 @@ udp_clr_flowctrl(sock_lower_handle_t proto_handle) mutex_exit(&udp->udp_recv_lock); } -/* ARGSUSED */ +/* ARGSUSED2 */ int udp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) { @@ -9204,6 +6876,27 @@ udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + udp_stack_t *us = connp->conn_udp->udp_us; + + ASSERT(us->us_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. 
+ */ + error = ip_create_helper_stream(connp, us->us_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 425d258697..02d9d3f8f8 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -56,227 +56,229 @@ */ opdes_t udp_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 
0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SCM_UCRED, SOL_SOCKET, OA_W, OA_W, OP_NP, OP_VARLEN|OP_NODEFAULT, 512, 0 }, -{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 
OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (struct in_addr), 0 /* INADDR_ANY */ }, -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */}, -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */ }, -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_DROP_MEMBERSHIP, 
IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 /* not initialized */ }, -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 /* not initialized */ }, { IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, { IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t), 0 /* disabled */ }, { IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in_pktinfo), -1 /* not initialized */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + { 
MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */}, + OP_DEF_FN, sizeof (int), -1 /* not initialized */}, -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, { IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not 
initialized */ }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + OP_NODEFAULT, + sizeof 
(struct ip6_mtuinfo), -1 }, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - OP_PASSNEXT, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + 0, sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, 
OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, { MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, -{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT, +{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { UDP_RCVHDR, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -317,7 +319,6 @@ optdb_obj_t udp_opt_obj = { udp_opt_default, /* UDP default value function pointer */ udp_tpi_opt_get, /* UDP get function pointer */ udp_tpi_opt_set, /* UDP set function pointer */ - B_TRUE, /* UDP is tpi provider */ UDP_OPT_ARR_CNT, /* UDP option database count of entries */ udp_opt_arr, /* UDP option database */ UDP_VALID_LEVELS_CNT, /* UDP valid level count of 
entries */ diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 1b4935f456..4da82a0377 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -51,84 +51,6 @@ extern "C" { #define UDP_MOD_ID 5607 -typedef struct udp_bits_s { - - uint32_t - - udpb_debug : 1, /* SO_DEBUG "socket" option. */ - udpb_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - udpb_broadcast : 1, /* SO_BROADCAST "socket" option. */ - udpb_useloopback : 1, /* SO_USELOOPBACK "socket" option */ - - udpb_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - udpb_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - udpb_recvdstaddr : 1, /* IP_RECVDSTADDR option */ - udpb_recvopts : 1, /* IP_RECVOPTS option */ - - udpb_unspec_source : 1, /* IP*_UNSPEC_SRC option */ - udpb_ip_recvpktinfo : 1, /* IPV6_RECVPKTINFO option */ - udpb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ - udpb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - - udpb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - udpb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - udpb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ - udpb_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ - - udpb_anon_priv_bind : 1, - udpb_exclbind : 1, /* ``exclusive'' binding */ - udpb_recvif : 1, /* IP_RECVIF option */ - udpb_recvslla : 1, /* IP_RECVSLLA option */ - - udpb_recvttl : 1, /* IP_RECVTTL option */ - udpb_recvucred : 1, /* IP_RECVUCRED option */ - udpb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */ - udpb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ - - udpb_rcvhdr : 1, /* UDP_RCVHDR option */ - udpb_issocket : 1, /* socket mode; sockfs is on top */ - udpb_timestamp : 1, /* SO_TIMESTAMP "socket" option */ - - udpb_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ - udpb_pad_to_bit_31 : 4; -} udp_bits_t; - -#define udp_debug udp_bits.udpb_debug -#define udp_dontroute udp_bits.udpb_dontroute -#define udp_broadcast udp_bits.udpb_broadcast -#define 
udp_useloopback udp_bits.udpb_useloopback - -#define udp_reuseaddr udp_bits.udpb_reuseaddr -#define udp_dgram_errind udp_bits.udpb_dgram_errind -#define udp_recvdstaddr udp_bits.udpb_recvdstaddr -#define udp_recvopts udp_bits.udpb_recvopts - -#define udp_unspec_source udp_bits.udpb_unspec_source -#define udp_ip_recvpktinfo udp_bits.udpb_ip_recvpktinfo -#define udp_ipv6_recvhoplimit udp_bits.udpb_ipv6_recvhoplimit -#define udp_ipv6_recvhopopts udp_bits.udpb_ipv6_recvhopopts - -#define udp_ipv6_recvdstopts udp_bits.udpb_ipv6_recvdstopts -#define udp_ipv6_recvrthdr udp_bits.udpb_ipv6_recvrthdr -#define udp_ipv6_recvtclass udp_bits.udpb_ipv6_recvtclass -#define udp_ipv6_recvpathmtu udp_bits.udpb_ipv6_recvpathmtu - -#define udp_anon_priv_bind udp_bits.udpb_anon_priv_bind -#define udp_exclbind udp_bits.udpb_exclbind -#define udp_recvif udp_bits.udpb_recvif -#define udp_recvslla udp_bits.udpb_recvslla - -#define udp_recvttl udp_bits.udpb_recvttl -#define udp_recvucred udp_bits.udpb_recvucred -#define udp_old_ipv6_recvdstopts udp_bits.udpb_old_ipv6_recvdstopts -#define udp_ipv6_recvrthdrdstopts udp_bits.udpb_ipv6_recvrthdrdstopts - -#define udp_rcvhdr udp_bits.udpb_rcvhdr -#define udp_issocket udp_bits.udpb_issocket -#define udp_timestamp udp_bits.udpb_timestamp - -#define udp_nat_t_endpoint udp_bits.udpb_nat_t_endpoint - /* * Bind hash list size and hash function. It has to be a power of 2 for * hashing. @@ -148,49 +70,21 @@ typedef struct udp_fanout_s { #endif } udp_fanout_t; -/* - * dev_q is the write side queue of the entity below IP. - * If there is a module below IP, we can't optimize by looking - * at q_first of the queue below IP. 
If the driver is directly - * below IP and if the q_first is NULL, we optimize by not doing - * the canput check - */ -#define DEV_Q_FLOW_BLOCKED(dev_q) \ - (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \ - !canput(dev_q)) - /* Kstats */ typedef struct udp_stat { /* Class "net" kstats */ - kstat_named_t udp_ip_send; - kstat_named_t udp_ip_ire_send; - kstat_named_t udp_ire_null; kstat_named_t udp_sock_fallback; - kstat_named_t udp_out_sw_cksum; - kstat_named_t udp_out_sw_cksum_bytes; kstat_named_t udp_out_opt; kstat_named_t udp_out_err_notconn; kstat_named_t udp_out_err_output; kstat_named_t udp_out_err_tudr; - kstat_named_t udp_in_pktinfo; - kstat_named_t udp_in_recvdstaddr; - kstat_named_t udp_in_recvopts; - kstat_named_t udp_in_recvif; - kstat_named_t udp_in_recvslla; - kstat_named_t udp_in_recvucred; - kstat_named_t udp_in_recvttl; - kstat_named_t udp_in_recvhopopts; - kstat_named_t udp_in_recvhoplimit; - kstat_named_t udp_in_recvdstopts; - kstat_named_t udp_in_recvrtdstopts; - kstat_named_t udp_in_recvrthdr; - kstat_named_t udp_in_recvpktinfo; - kstat_named_t udp_in_recvtclass; - kstat_named_t udp_in_timestamp; - kstat_named_t udp_ip_rcvpktinfo; - kstat_named_t udp_cookie_coll; #ifdef DEBUG kstat_named_t udp_data_conn; kstat_named_t udp_data_notconn; + kstat_named_t udp_out_lastdst; + kstat_named_t udp_out_diffdst; + kstat_named_t udp_out_ipv6; + kstat_named_t udp_out_mapped; + kstat_named_t udp_out_ipv4; #endif } udp_stat_t; @@ -242,79 +136,43 @@ typedef struct udp_stack udp_stack_t; /* Internal udp control structure, one per open stream */ typedef struct udp_s { - krwlock_t udp_rwlock; /* Protects most of udp_t */ - t_scalar_t udp_pending_op; /* The current TPI operation */ /* - * Following fields up to udp_ipversion protected by conn_lock, - * and the fanout lock i.e.uf_lock. Need both locks to change the - * field, either lock is sufficient for reading the field. 
+ * The addresses and ports in the conn_t and udp_state are protected by + * conn_lock and the fanout lock i.e. uf_lock. Need both locks to change + * the fields, either lock is sufficient for reading the field. + * conn_lock also protects the content of udp_t. */ uint32_t udp_state; /* TPI state */ - in_port_t udp_port; /* Port bound to this stream */ - in_port_t udp_dstport; /* Connected port */ - in6_addr_t udp_v6src; /* Source address of this stream */ - in6_addr_t udp_bound_v6src; /* Explicitly bound address */ - in6_addr_t udp_v6dst; /* Connected destination */ - /* - * IP format that packets transmitted from this struct should use. - * Value can be IP4_VERSION or IPV6_VERSION. - */ - ushort_t udp_ipversion; - /* Written to only once at the time of opening the endpoint */ - sa_family_t udp_family; /* Family from socket() call */ - - /* Following protected by udp_rwlock */ - uint32_t udp_flowinfo; /* Connected flow id and tclass */ - uint32_t udp_max_hdr_len; /* For write offset in stream head */ - uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */ - uchar_t *udp_ip_snd_options; /* Ptr to IPv4 options */ - uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */ - uchar_t *udp_ip_rcv_options; /* Ptr to IPv4 options recvd */ - uchar_t udp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */ - ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */ - uint_t udp_multicast_if_index; /* IPV6_MULTICAST_IF option */ - int udp_bound_if; /* IP*_BOUND_IF option */ + ip_pkt_t udp_recv_ipp; /* Used for IPv4 options received */ /* Written to only once at the time of opening the endpoint */ conn_t *udp_connp; - /* Following protected by udp_rwlock */ - udp_bits_t udp_bits; /* Bit fields defined above */ - uint8_t udp_type_of_service; /* IP_TOS option */ - uint8_t udp_ttl; /* TTL or hoplimit */ - ip6_pkt_t udp_sticky_ipp; /* Sticky options */ - uint8_t *udp_sticky_hdrs; /* Prebuilt IPv6 hdrs */ - uint_t udp_sticky_hdrs_len; /* Incl. 
ip6h and any ip6i */ + uint32_t + udp_issocket : 1, /* socket mode; sockfs is on top */ + udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ + udp_rcvhdr : 1, /* UDP_RCVHDR option */ + + udp_pad_to_bit_31 : 29; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ struct udp_s **udp_ptpbhn; /* Pointer to previous bind hash next. */ - /* Following protected by udp_rwlock */ kmutex_t udp_recv_lock; /* recv lock */ size_t udp_rcv_disply_hiwat; /* user's view of rcvbuf */ size_t udp_rcv_hiwat; /* receive high watermark */ - size_t udp_rcv_lowat; /* receive low watermark */ - size_t udp_xmit_hiwat; /* Send buffer high watermark */ - size_t udp_xmit_lowat; /* Send buffer low watermark */ - uint_t udp_label_len; /* length of security label */ - uint_t udp_label_len_v6; /* len of v6 security label */ - in6_addr_t udp_v6lastdst; /* most recent destination */ - in_port_t udp_lastdstport; /* most recent dest port */ - cred_t *udp_last_cred; /* most recent credentials */ - cred_t *udp_effective_cred; /* cred with effective label */ - - uint64_t udp_open_time; /* time when this was opened */ - pid_t udp_open_pid; /* process id when this was opened */ + + /* Set at open time and never changed */ udp_stack_t *udp_us; /* Stack instance for zone */ + int udp_delayed_error; mblk_t *udp_fallback_queue_head; mblk_t *udp_fallback_queue_tail; struct sockaddr_storage udp_delayed_addr; } udp_t; -/* UDP Protocol header */ /* UDP Protocol header aligned */ typedef struct udpahdr_s { in_port_t uha_src_port; /* Source port */ @@ -334,6 +192,8 @@ typedef struct udpahdr_s { #define us_xmit_lowat us_param_arr[8].udp_param_value #define us_recv_hiwat us_param_arr[9].udp_param_value #define us_max_buf us_param_arr[10].udp_param_value +#define us_pmtu_discovery us_param_arr[11].udp_param_value +#define us_sendto_ignerr us_param_arr[12].udp_param_value #define UDP_STAT(us, x) ((us)->us_statistics.x.value.ui64++) @@ -348,14 +208,11 @@ typedef struct 
udpahdr_s { extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int udp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int udp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); extern mblk_t *udp_snmp_get(queue_t *, mblk_t *); extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int); -extern void udp_close_free(conn_t *); -extern void udp_quiesce_conn(conn_t *); extern void udp_ddi_g_init(void); extern void udp_ddi_g_destroy(void); -extern void udp_g_q_inactive(udp_stack_t *); extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen); extern void udp_wput(queue_t *, mblk_t *); diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 338a1c96d0..79b88ca659 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -1478,7 +1478,7 @@ dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) lso->lso_flags = 0; /* translate the flag for mac clients */ if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0) - lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4; + lso->lso_flags |= DLD_LSO_BASIC_TCP_IPV4; dsp->ds_lso = B_TRUE; dsp->ds_lso_max = lso->lso_max; } else { diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c index 902d838ff4..639bb28bcc 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,9 +29,9 @@ #define rds_max_buf 2097152 opdes_t rds_opt_arr[] = { -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; /* ARGSUSED */ @@ -79,7 +79,7 @@ rds_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) int rds_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)(uintptr_t)invalp; boolean_t checkonly; @@ -187,7 +187,6 @@ optdb_obj_t rds_opt_obj = { rds_opt_default, /* RDS default value function pointer */ rds_opt_get, /* RDS get function pointer */ rds_opt_set, /* RDS set function pointer */ - B_TRUE, /* RDS is tpi provider */ RDS_OPT_ARR_CNT, /* RDS option database count of entries */ rds_opt_arr, /* RDS option database */ RDS_VALID_LEVELS_CNT, /* RDS valid level count of entries */ diff --git a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c index a4a9c6c8e0..13a1d4bf75 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c +++ b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c @@ -654,11 +654,9 @@ rds_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)(uintptr_t)rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, &rds_opt_obj, - B_FALSE); + svr4_optcom_req(q, mp, cr, &rds_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, &rds_opt_obj, - B_FALSE); + tpi_optcom_req(q, mp, cr, &rds_opt_obj); } return; case T_CONN_REQ: diff --git a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c 
b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c index 8e57cb783d..f9bbcd092f 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c +++ b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ib/clients/rds/rds.h> #include <sys/ib/clients/rds/rds_kstat.h> @@ -135,9 +133,9 @@ rds_init() * kstats */ rds_kstatsp = kstat_create("rds", 0, - "rds_kstat", "misc", KSTAT_TYPE_NAMED, - sizeof (rds_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); + "rds_kstat", "misc", KSTAT_TYPE_NAMED, + sizeof (rds_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); if (rds_kstatsp != NULL) { rds_kstatsp->ks_lock = &rds_kstat_mutex; rds_kstatsp->ks_data = (void *)&rds_kstat; @@ -298,17 +296,14 @@ rds_fanout(ipaddr_t local_addr, ipaddr_t rem_addr, boolean_t rds_islocal(ipaddr_t addr) { - ire_t *ire; ip_stack_t *ipst; ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; ASSERT(ipst != NULL); - - ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK | - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - netstack_rele(ipst->ips_netstack); - if (ire == NULL) + if (ip_laddr_verify_v4(addr, ALL_ZONES, ipst, B_FALSE) == IPVL_BAD) { + netstack_rele(ipst->ips_netstack); return (B_FALSE); - ire_refrele(ire); + } + netstack_rele(ipst->ips_netstack); return (B_TRUE); } diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c index 944e61a067..3bb7d3a98c 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c @@ -26,41 +26,28 @@ #include <sys/types.h> #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/stropts.h> -#include <sys/stream.h> -#include <sys/strsun.h> 
#include <sys/strsubr.h> #include <sys/socket.h> -#include <sys/stat.h> #include <net/if_arp.h> #include <net/if_types.h> -#include <sys/file.h> #include <sys/sockio.h> #include <sys/pathname.h> -#include <inet/arp.h> -#include <sys/modctl.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> #include <sys/kstr.h> -#include <sys/tiuser.h> #include <sys/t_kuser.h> extern char cmlog[]; -extern int ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func); -extern void ibcm_arp_pr_arp_ack(mblk_t *mp); -extern void ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp); +extern int ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s, + ibt_ip_addr_t *dst_addr, ibt_ip_addr_t *src_addr); +extern void ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp); -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", datab)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibt_ip_addr_s)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ip_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ibd_insts_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_prwqn_t)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", iocblk)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", queue)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in6)) @@ -89,269 +76,6 @@ ibcm_ip_print(char *label, ibt_ip_addr_t *ipaddr) } } -/* - * ibcm_arp_get_ibaddr_cb - */ -static int -ibcm_arp_get_ibaddr_cb(void *arg, int status) -{ - ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_get_ibaddr_cb(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - mutex_enter(&ib_s->lock); - ib_s->status = status; - ib_s->done = B_TRUE; - - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr_cb: SGID %llX:%llX " - "DGID: %llX:%llX", wqnp->sgid.gid_prefix, wqnp->sgid.gid_guid, - wqnp->dgid.gid_prefix, 
wqnp->dgid.gid_guid); - - /* lock is held by the caller. */ - cv_signal(&ib_s->cv); - mutex_exit(&ib_s->lock); - return (0); -} - -/* - * Lower read service procedure (messages coming back from arp/ip). - * Process messages based on queue type. - */ -static int -ibcm_arp_lrsrv(queue_t *q) -{ - mblk_t *mp; - ibcm_arp_streams_t *ib_s = q->q_ptr; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrsrv(%p, ibd_s: 0x%p)", q, ib_s); - - if (WR(q) == ib_s->arpqueue) { - while (mp = getq(q)) { - ibcm_arp_pr_arp_ack(mp); - } - } - - return (0); -} - -/* - * Lower write service procedure. - * Used when lower streams are flow controlled. - */ -static int -ibcm_arp_lwsrv(queue_t *q) -{ - mblk_t *mp; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lwsrv(%p)", q); - - while (mp = getq(q)) { - if (canputnext(q)) { - putnext(q, mp); - } else { - (void) putbq(q, mp); - qenable(q); - break; - } - } - - return (0); -} - -/* - * Lower read put procedure. Arp/ip messages come here. - */ -static int -ibcm_arp_lrput(queue_t *q, mblk_t *mp) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrput(0x%p, db_type: %d)", - q, DB_TYPE(mp)); - - switch (DB_TYPE(mp)) { - case M_FLUSH: - /* - * Turn around - */ - if (*mp->b_rptr & FLUSHW) { - *mp->b_rptr &= ~FLUSHR; - qreply(q, mp); - return (0); - } - freemsg(mp); - break; - case M_IOCACK: - case M_IOCNAK: - case M_DATA: - /* - * This could be in interrupt context. 
- * Some of the ibt calls cannot be called in - * interrupt context, so - * put it in the queue and the message will be - * processed by service proccedure - */ - (void) putq(q, mp); - qenable(q); - break; - default: - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_lrput: " - "got unknown msg <0x%x>\n", mp->b_datap->db_type); - ASSERT(0); - break; - } - - return (0); -} - -/* - * Streams write queue module info - */ -static struct module_info ibcm_arp_winfo = { - 0, /* module ID number */ - "ibcm", /* module name */ - 0, /* min packet size */ - INFPSZ, - 49152, /* STREAM queue high water mark -- 49152 */ - 12 /* STREAM queue low water mark -- 12 */ -}; - -/* - * Streams lower write queue, for ibcm/ip requests. - */ -static struct qinit ibcm_arp_lwinit = { - NULL, /* qi_putp */ - ibcm_arp_lwsrv, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &ibcm_arp_winfo, /* module info */ - NULL, /* module statistics struct */ - NULL, - NULL, - STRUIOT_NONE /* stream uio type is standard uiomove() */ -}; - -/* - * Streams lower read queue: read reply messages from ibcm/ip. 
- */ -static struct qinit ibcm_arp_lrinit = { - ibcm_arp_lrput, /* qi_putp */ - ibcm_arp_lrsrv, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &ibcm_arp_winfo, /* module info */ - NULL, /* module statistics struct */ - NULL, - NULL, - STRUIOT_NONE /* stream uio type is standard uiomove() */ -}; - - -static int -ibcm_arp_link_driver(ibcm_arp_streams_t *ib_s, char *path, queue_t **q, - vnode_t **dev_vp) -{ - struct stdata *dev_stp; - vnode_t *vp; - int error; - queue_t *rq; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_driver: Enter: %s", path); - - /* open the driver from inside the kernel */ - error = vn_open(path, UIO_SYSSPACE, FREAD|FWRITE, 0, &vp, - 0, NULL); - if (error) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_driver: " - "vn_open('%s') failed\n", path); - return (error); - } - *dev_vp = vp; - - dev_stp = vp->v_stream; - *q = dev_stp->sd_wrq; - - VN_HOLD(vp); - - rq = RD(dev_stp->sd_wrq); - RD(rq)->q_ptr = WR(rq)->q_ptr = ib_s; - setq(rq, &ibcm_arp_lrinit, &ibcm_arp_lwinit, NULL, QMTSAFE, - SQ_CI|SQ_CO, B_FALSE); - - return (0); -} - -extern struct qinit strdata; -extern struct qinit stwdata; - -/* - * Unlink ip, ibcm, icmp6 drivers - */ -/* ARGSUSED */ -static int -ibcm_arp_unlink_driver(queue_t **q, vnode_t **dev_vp) -{ - vnode_t *vp = *dev_vp; - struct stdata *dev_stp = vp->v_stream; - queue_t *wrq, *rq; - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_unlink_driver: Enter: 0x%p", q); - - wrq = dev_stp->sd_wrq; - rq = RD(wrq); - - disable_svc(rq); - wait_svc(rq); - flushq(rq, FLUSHALL); - flushq(WR(rq), FLUSHALL); - - rq->q_ptr = wrq->q_ptr = dev_stp; - - setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, B_TRUE); - - if ((rc = VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_unlink_driver: VOP_CLOSE " - "failed %d\n", rc); - } - VN_RELE(vp); - - return (0); -} - -static int -ibcm_arp_unlink_drivers(ibcm_arp_streams_t *ib_s) -{ - IBTF_DPRINTF_L4(cmlog, 
"ibcm_arp_unlink_drivers(%p)", ib_s); - - if (ib_s->arpqueue) { - (void) ibcm_arp_unlink_driver(&ib_s->arpqueue, &ib_s->arp_vp); - } - - return (0); -} - -/* - * Link ip, ibtl drivers below ibtl - */ -static int -ibcm_arp_link_drivers(ibcm_arp_streams_t *ib_s) -{ - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_drivers(%p)", ib_s); - - if ((rc = ibcm_arp_link_driver(ib_s, "/dev/arp", &ib_s->arpqueue, - &ib_s->arp_vp)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_drivers: " - "ibcm_arp_link_driver failed: %d\n", rc); - return (rc); - } - - return (0); -} ibt_status_t ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, @@ -370,21 +94,13 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, mutex_init(&ib_s->lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ib_s->cv, NULL, CV_DRIVER, NULL); - ret = ibcm_arp_link_drivers(ib_s); - if (ret != 0) { - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: " - "ibcm_arp_link_drivers failed %d", ret); - goto arp_ibaddr_error; - } - mutex_enter(&ib_s->lock); ib_s->done = B_FALSE; mutex_exit(&ib_s->lock); - ret = ibcm_arp_pr_lookup(ib_s, &destaddr, &srcaddr, - ibcm_arp_get_ibaddr_cb); + ret = ibcm_resolver_pr_lookup(ib_s, &destaddr, &srcaddr); - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_arp_pr_lookup " + IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_resolver_pr_lookup " "returned: %d", ret); if (ret == 0) { mutex_enter(&ib_s->lock); @@ -393,7 +109,6 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, mutex_exit(&ib_s->lock); } - (void) ibcm_arp_unlink_drivers(ib_s); mutex_enter(&ib_s->lock); wqnp = ib_s->wqnp; if (ib_s->status == 0) { @@ -407,11 +122,11 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, ib_s->wqnp->sgid.gid_prefix, ib_s->wqnp->sgid.gid_guid, ib_s->wqnp->dgid.gid_prefix, ib_s->wqnp->dgid.gid_guid); - ibcm_arp_prwqn_delete(wqnp); + ibcm_arp_delete_prwqn(wqnp); } else if (ret == 0) { /* * We come here only when lookup has returned empty (failed) 
- * via callback routine - ibcm_arp_get_ibaddr_cb + * via callback routine. * i.e. ib_s->status is non-zero, while ret is zero. */ if (wqnp) @@ -884,20 +599,3 @@ srcip_plist_end: return (ret); } -/* Routines for warlock */ - -/* ARGSUSED */ -static int -ibcm_arp_dummy_ibaddr_hdl(void *arg, int status) -{ - ibcm_arp_prwqn_t dummy_wqn1; - ibcm_arp_prwqn_t dummy_wqn2; - - dummy_wqn1.func = ibcm_arp_get_ibaddr_cb; - dummy_wqn2.func = ibcm_arp_dummy_ibaddr_hdl; - - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_dummy_ibaddr_hdl: " - "dummy_wqn1.func %p %p", dummy_wqn1.func, dummy_wqn2.func); - - return (0); -} diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c index 79d420d467..45fbfd7932 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c @@ -24,309 +24,32 @@ */ #include <sys/types.h> -#include <sys/stream.h> -#include <sys/dlpi.h> -#include <sys/stropts.h> -#include <sys/strsun.h> -#include <sys/sysmacros.h> -#include <sys/strlog.h> -#include <sys/ddi.h> -#include <sys/cmn_err.h> -#include <sys/socket.h> #include <net/if.h> #include <net/if_types.h> -#include <netinet/in.h> -#include <sys/ethernet.h> -#include <inet/arp.h> #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> -#include <inet/ip_ftable.h> - -static areq_t ibcm_arp_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t) + (2 * IP_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IP_ADDR_LEN, /* target ADDR_length */ - 0, /* flags */ - sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ - IP_ADDR_LEN, /* sender addr length */ - IBCM_ARP_XMIT_COUNT, /* xmit_count */ - IBCM_ARP_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ - 4 /* max # of requests to buffer */ - /* - * anything else filled in by the 
code - */ -}; - -static area_t ibcm_arp_area_template = { - AR_ENTRY_ADD, /* cmd */ - sizeof (area_t) + IPOIB_ADDRL + (2 * IP_ADDR_LEN), /* name offset */ - sizeof (area_t), /* name len */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (area_t), /* proto addr offset */ - IP_ADDR_LEN, /* proto ADDR_length */ - sizeof (area_t) + (IP_ADDR_LEN), /* proto mask offset */ - 0, /* flags */ - sizeof (area_t) + (2 * IP_ADDR_LEN), /* hw addr offset */ - IPOIB_ADDRL /* hw addr length */ -}; extern char cmlog[]; -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", area_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_streams_t)) -static void ibcm_arp_timeout(void *arg); -static void ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status); -static void ibcm_ipv6_resolver_ack(ip2mac_t *, void *); -static int ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid); - -/* - * issue a AR_ENTRY_QUERY to arp driver and schedule a timeout. 
- */ -static int -ibcm_arp_query_arp(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - int name_offset; - char *cp; - mblk_t *mp; - mblk_t *mp1; - areq_t *areqp; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_query_arp(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - name_offset = ibcm_arp_areq_template.areq_name_offset; - - /* - * allocate mblk for AR_ENTRY_QUERY - */ - name_len = strlen(wqnp->ifname) + 1; - len = name_len + name_offset; - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len; - - /* - * allocate a mblk and set wqnp in the data - */ - if ((mp1 = allocb(sizeof (void *), BPRI_HI)) == NULL) { - freeb(mp); - return (ENOMEM); - } - - mp1->b_wptr += sizeof (void *); - *(uintptr_t *)(void *)mp1->b_rptr = (uintptr_t)wqnp; /* store wqnp */ - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_areq_template, cp, sizeof (areq_t)); - areqp = (void *)cp; - areqp->areq_name_length = name_len; - - cp = (char *)areqp + areqp->areq_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - areqp->areq_proto = wqnp->ifproto; - bcopy(&wqnp->ifproto, areqp->areq_sap, 2); - cp = (char *)areqp + areqp->areq_target_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - cp = (char *)areqp + areqp->areq_sender_addr_offset; - bcopy(&wqnp->src_addr.un.ip4addr, cp, IP_ADDR_LEN); - - mp->b_cont = mp1; - - DB_TYPE(mp) = M_PROTO; - - /* - * issue the request to arp - */ - wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; - wqnp->timeout_id = timeout(ibcm_arp_timeout, wqnp, - drv_usectohz(IBCM_ARP_TIMEOUT * 1000)); - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp); - } else { - (void) putq(ib_s->arpqueue, mp); - qenable(ib_s->arpqueue); - } - - return (0); -} - -/* - * issue AR_ENTRY_SQUERY to arp driver - */ -static int -ibcm_arp_squery_arp(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - char *cp; - mblk_t *mp; - mblk_t *mp1; - area_t *areap; - uint32_t 
proto_mask = 0xffffffff; - struct iocblk *ioc; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_squery_arp(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - /* - * allocate mblk for AR_ENTRY_SQUERY - */ - name_len = strlen(wqnp->ifname) + 1; - len = ibcm_arp_area_template.area_name_offset + name_len + - sizeof (uintptr_t); - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len + sizeof (uintptr_t); - - *(uintptr_t *)(void *)mp->b_rptr = (uintptr_t)wqnp; /* store wqnp */ - mp->b_rptr += sizeof (uintptr_t); - - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_area_template, cp, sizeof (area_t)); - - areap = (void *)cp; - areap->area_cmd = AR_ENTRY_SQUERY; - areap->area_name_length = name_len; - cp = (char *)areap + areap->area_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - cp = (char *)areap + areap->area_proto_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_proto_mask_offset; - bcopy(&proto_mask, cp, IP_ADDR_LEN); - - mp1 = allocb(sizeof (struct iocblk), BPRI_HI); - if (mp1 == NULL) { - freeb(mp); - return (ENOMEM); - } - ioc = (void *)mp1->b_rptr; - ioc->ioc_cmd = AR_ENTRY_SQUERY; - ioc->ioc_error = 0; - ioc->ioc_cr = NULL; - ioc->ioc_count = msgdsize(mp); - mp1->b_wptr += sizeof (struct iocblk); - mp1->b_cont = mp; - - DB_TYPE(mp1) = M_IOCTL; - - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp1); - } else { - (void) putq(ib_s->arpqueue, mp1); - qenable(ib_s->arpqueue); - } - return (0); -} - -/* - * issue a AR_ENTRY_ADD to arp driver - * This is required as arp driver does not maintain a cache. 
- */ -static int -ibcm_arp_add(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - char *cp; - mblk_t *mp; - area_t *areap; - uint32_t proto_mask = 0xffffffff; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_add(ib_s: %p wqnp: %p)", ib_s, wqnp); - - /* - * allocate mblk for AR_ENTRY_ADD - */ - - name_len = strlen(wqnp->ifname) + 1; - len = ibcm_arp_area_template.area_name_offset + name_len; - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len; - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_area_template, cp, sizeof (area_t)); - - areap = (void *)mp->b_rptr; - areap->area_name_length = name_len; - cp = (char *)areap + areap->area_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - cp = (char *)areap + areap->area_proto_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_proto_mask_offset; - bcopy(&proto_mask, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_hw_addr_offset; - bcopy(&wqnp->dst_mac, cp, IPOIB_ADDRL); - - DB_TYPE(mp) = M_PROTO; - - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp); - } else { - (void) putq(ib_s->arpqueue, mp); - qenable(ib_s->arpqueue); - } - return (0); -} - - -/* - * timeout routine when there is no response to AR_ENTRY_QUERY - */ -static void -ibcm_arp_timeout(void *arg) -{ - ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_timeout(ib_s: %p wqnp: %p)", - ib_s, wqnp); - wqnp->flags &= ~IBCM_ARP_PR_RESOLVE_PENDING; - cv_broadcast(&ib_s->cv); - - /* - * indicate to user - */ - ibcm_arp_pr_callback(wqnp, EHOSTUNREACH); -} +static void ibcm_resolver_ack(ip2mac_t *, void *); +static int ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid); /* * delete a wait queue node from the list. 
* assumes mutex is acquired */ void -ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp) +ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp) { ibcm_arp_streams_t *ib_s; - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_prwqn_delete(%p)", wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_arp_delete_prwqn(%p)", wqnp); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; + ib_s = wqnp->ib_str; ib_s->wqnp = NULL; kmem_free(wqnp, sizeof (ibcm_arp_prwqn_t)); } @@ -336,7 +59,7 @@ ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp) */ static ibcm_arp_prwqn_t * ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func) + ibt_ip_addr_t *src_addr) { ibcm_arp_prwqn_t *wqnp; @@ -354,8 +77,7 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, if (src_addr) { wqnp->usrc_addr = *src_addr; } - wqnp->func = func; - wqnp->arg = ib_s; + wqnp->ib_str = ib_s; wqnp->ifproto = (dst_addr->family == AF_INET) ? ETHERTYPE_IP : ETHERTYPE_IPV6; @@ -366,17 +88,6 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, return (wqnp); } -/* - * call the user function - * called with lock held - */ -static void -ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_callback(%p, %d)", wqnp, status); - - wqnp->func((void *)wqnp, status); -} /* * Check if the interface is loopback or IB. 
@@ -391,23 +102,24 @@ ibcm_arp_check_interface(ill_t *ill) } int -ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func) +ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, + ibt_ip_addr_t *src_addr) { ibcm_arp_prwqn_t *wqnp; ire_t *ire = NULL; - ire_t *src_ire = NULL; - ipif_t *ipif; - ill_t *ill, *hwaddr_ill = NULL; + ipif_t *ipif = NULL; + ill_t *ill = NULL; + ill_t *hwaddr_ill = NULL; ip_stack_t *ipst; int len; + ipaddr_t setsrcv4; + in6_addr_t setsrcv6; IBCM_PRINT_IP("ibcm_arp_pr_lookup: SRC", src_addr); IBCM_PRINT_IP("ibcm_arp_pr_lookup: DST", dst_addr); - if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr, - src_addr, func)) == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " + if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr, src_addr)) == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " "ibcm_arp_create_prwqn failed"); ib_s->status = ENOMEM; return (1); @@ -416,86 +128,111 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; if (dst_addr->family == AF_INET) { /* - * Get the ire for the local address + * A local address is always specified, and it is used + * to find the zoneid. */ - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup"); - src_ire = ire_ctable_lookup(src_addr->un.ip4addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup failed"); + ipif = ipif_lookup_addr(src_addr->un.ip4addr, NULL, ALL_ZONES, + ipst); + if (ipif == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ipif_lookup_addr failed"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup"); /* - * get an ire for the destination address with the matching - * source address + * get an ire for the destination adress. 
+ * Note that we can't use MATCH_IRE_ILL since that would + * require that the first ill we find have ire_ill set. Thus + * we compare ire_ill against ipif_ill after the lookup. */ - ire = ire_ftable_lookup(dst_addr->un.ip4addr, 0, 0, 0, - src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, - MATCH_IRE_SRC, ipst); - if (ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup failed"); + setsrcv4 = INADDR_ANY; + ire = ire_route_recursive_v4(dst_addr->un.ip4addr, 0, NULL, + ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, + &setsrcv4, NULL, NULL); + + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_route_recursive_v4 failed"); + ib_s->status = EFAULT; + goto fail; + } + ill = ire_nexthop_ill(ire); + if (ill == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_nexthop_ill failed"); + ib_s->status = EFAULT; + goto fail; + } + if (ill != ipif->ipif_ill) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "wrong ill"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ftable_lookup:" - "done"); - - wqnp->gateway.un.ip4addr = - ((ire->ire_gateway_addr == INADDR_ANY) ? - ire->ire_addr : ire->ire_gateway_addr); + wqnp->gateway.un.ip4addr = ire->ire_gateway_addr; wqnp->netmask.un.ip4addr = ire->ire_mask; - wqnp->src_addr.un.ip4addr = ire->ire_src_addr; + wqnp->src_addr.un.ip4addr = src_addr->un.ip4addr; wqnp->src_addr.family = wqnp->gateway.family = wqnp->netmask.family = AF_INET; } else if (dst_addr->family == AF_INET6) { /* - * Get the ire for the local address + * A local address is always specified, and it is used + * to find the zoneid. + * We should really match on scopeid for link locals here. 
*/ - src_ire = ire_ctable_lookup_v6(&src_addr->un.ip6addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup_v6 failed"); + ipif = ipif_lookup_addr_v6(&src_addr->un.ip6addr, NULL, + ALL_ZONES, ipst); + if (ipif == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ipif_lookup_addr_v6 failed"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup_v6: done"); /* - * get an ire for the destination address with the matching - * source address + * get an ire for the destination adress. + * Note that we can't use MATCH_IRE_ILL since that would + * require that the first ill we find have ire_ill set. Thus + * we compare ire_ill against ipif_ill after the lookup. */ - ire = ire_ftable_lookup_v6(&dst_addr->un.ip6addr, 0, 0, 0, - src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, - MATCH_IRE_SRC, ipst); - if (ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup_v6 failed"); + setsrcv6 = ipv6_all_zeros; + ire = ire_route_recursive_v6(&dst_addr->un.ip6addr, 0, NULL, + ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, + &setsrcv6, NULL, NULL); + + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_route_recursive_v6 failed"); + ib_s->status = EFAULT; + goto fail; + } + ill = ire_nexthop_ill(ire); + if (ill == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_nexthop_ill failed"); + ib_s->status = EFAULT; + goto fail; + } + + if (ill != ipif->ipif_ill) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "wrong ill"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup_v6: done"); - wqnp->gateway.un.ip6addr = - (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) ? 
- ire->ire_addr_v6 : ire->ire_gateway_addr_v6); + wqnp->gateway.un.ip6addr = ire->ire_gateway_addr_v6; wqnp->netmask.un.ip6addr = ire->ire_mask_v6; - wqnp->src_addr.un.ip6addr = ire->ire_src_addr_v6; + wqnp->src_addr.un.ip6addr = src_addr->un.ip6addr; wqnp->src_addr.family = wqnp->gateway.family = wqnp->netmask.family = AF_INET6; } - ipif = src_ire->ire_ipif; - ill = ipif->ipif_ill; (void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname)); /* @@ -504,18 +241,19 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, */ if (IS_IPMP(ill)) { if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound " - "ill for IPMP interface %s", ill->ill_name); + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "no bound ill for IPMP interface %s", + ill->ill_name); ib_s->status = EFAULT; goto fail; } } else { hwaddr_ill = ill; - ill_refhold(hwaddr_ill); /* for symmetry */ + ill_refhold(hwaddr_ill); /* for symmetry */ } if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " "ibcm_arp_check_interface failed"); goto fail; } @@ -523,7 +261,7 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac, hwaddr_ill->ill_phys_addr_length); - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: outgoing if:%s", + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: outgoing if:%s", wqnp->ifname); /* @@ -534,8 +272,8 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, len = (wqnp->usrc_addr.family == AF_INET) ? 
IP_ADDR_LEN : sizeof (in6_addr_t); if (bcmp(&wqnp->usrc_addr.un, &wqnp->src_addr.un, len)) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: srcaddr " - "mismatch:%d", ENETUNREACH); + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "srcaddr mismatch:%d", ENETUNREACH); goto fail; } } @@ -545,253 +283,77 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, * interface, now get the destination mac address from * arp or ipv6 drivers */ - if (wqnp->dst_addr.family == AF_INET) { - if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ibcm_arp_squery_arp failed: %d", ib_s->status); - goto fail; - } - } else { - if ((ib_s->status = ibcm_ipv6_lookup(wqnp, ill, getzoneid())) != - 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ibcm_ipv6_lookup failed: %d", ib_s->status); - goto fail; - } + ib_s->status = ibcm_nce_lookup(wqnp, ill, getzoneid()); + if (ib_s->status != 0) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ibcm_nce_lookup failed: %d", ib_s->status); + goto fail; } ill_refrele(hwaddr_ill); - IRE_REFRELE(ire); - IRE_REFRELE(src_ire); + ill_refrele(ill); + ire_refrele(ire); + ipif_refrele(ipif); netstack_rele(ipst->ips_netstack); - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: Return: 0x%p", wqnp); return (0); fail: if (hwaddr_ill != NULL) ill_refrele(hwaddr_ill); + if (ill != NULL) + ill_refrele(ill); if (ire != NULL) - IRE_REFRELE(ire); - if (src_ire != NULL) - IRE_REFRELE(src_ire); - ibcm_arp_prwqn_delete(wqnp); + ire_refrele(ire); + if (ipif != NULL) + ipif_refrele(ipif); + ibcm_arp_delete_prwqn(wqnp); netstack_rele(ipst->ips_netstack); return (1); } /* - * called from lrsrv. 
- * process a AR_ENTRY_QUERY reply from arp - * the message should be M_DATA -->> dl_unitdata_req - */ -static void -ibcm_arp_pr_arp_query_ack(mblk_t *mp) -{ - ibcm_arp_prwqn_t *wqnp; - dl_unitdata_req_t *dlreq; - ibcm_arp_streams_t *ib_s; - char *cp; - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_query_ack(%p)", mp); - - /* - * the first mblk contains the wqnp pointer for the request - */ - if (MBLKL(mp) != sizeof (void *)) { - freemsg(mp); - return; - } - - wqnp = *(ibcm_arp_prwqn_t **)(void *)mp->b_rptr; /* retrieve wqnp */ - ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - mutex_enter(&ib_s->lock); - - /* - * cancel the timeout for this request - */ - (void) untimeout(wqnp->timeout_id); - - /* - * sanity checks on the dl_unitdata_req block - */ - if (!mp->b_cont) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: b_cont = NULL\n"); - rc = EPROTO; - goto user_callback; - } - if (MBLKL(mp->b_cont) < (sizeof (dl_unitdata_req_t) + IPOIB_ADDRL)) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid len in " - "dl_unitdatareq_t block\n"); - rc = EPROTO; - goto user_callback; - } - dlreq = (void *)mp->b_cont->b_rptr; - if (dlreq->dl_primitive != DL_UNITDATA_REQ) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid dl_primitive " - "in dl_unitdatareq_t block\n"); - rc = EPROTO; - goto user_callback; - } - if (dlreq->dl_dest_addr_length != (IPOIB_ADDRL + 2)) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid hw len in " - "dl_unitdatareq_t block %d\n", dlreq->dl_dest_addr_length); - rc = EPROTO; - goto user_callback; - } - cp = (char *)mp->b_cont->b_rptr + dlreq->dl_dest_addr_offset; - bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL); - - /* - * at this point we have src/dst gid's derived from the mac addresses - * now get the hca, port - */ - bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t)); - bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t)); - freemsg(mp); - - IBCM_H2N_GID(wqnp->sgid); - IBCM_H2N_GID(wqnp->dgid); - - (void) ibcm_arp_add(wqnp); - - 
mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, 0); - - return; -user_callback: - freemsg(mp); - mutex_exit(&ib_s->lock); - - /* - * indicate to user - */ - ibcm_arp_pr_callback(wqnp, rc); -} - -/* - * process a AR_ENTRY_SQUERY reply from arp - * the message should be M_IOCACK -->> area_t + * Query the neighbor cache for IPv4/IPv6 to mac address mapping. */ -static void -ibcm_arp_pr_arp_squery_ack(mblk_t *mp) +static int +ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) { - struct iocblk *ioc; - mblk_t *mp1; - ibcm_arp_prwqn_t *wqnp; - ibcm_arp_streams_t *ib_s; - area_t *areap; - char *cp; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_squery_ack(%p)", mp); - - if (MBLKL(mp) < sizeof (struct iocblk)) { - freemsg(mp); - return; - } - - ioc = (void *)mp->b_rptr; - if ((ioc->ioc_cmd != AR_ENTRY_SQUERY) || (mp->b_cont == NULL)) { - freemsg(mp); - return; - } - - mp1 = mp->b_cont; - - wqnp = *(ibcm_arp_prwqn_t **)((uintptr_t)mp1->b_rptr - - sizeof (uintptr_t)); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - mutex_enter(&ib_s->lock); - - /* - * cancel the timeout for this request - */ - (void) untimeout(wqnp->timeout_id); - - /* If the entry was not in arp cache, ioc_error is set */ - if (ioc->ioc_error) { - - /* - * send out AR_ENTRY_QUERY which would send - * arp-request on wire - */ - IBTF_DPRINTF_L3(cmlog, "Sending a Query_ARP"); - - (void) ibcm_arp_query_arp(wqnp); - freemsg(mp); - mutex_exit(&ib_s->lock); - return; + ip2mac_t ip2m; + sin_t *sin; + sin6_t *sin6; + ip2mac_id_t ip2mid; + int err; + + if (wqnp->src_addr.family != wqnp->dst_addr.family) { + IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Mis-match SRC_ADDR " + "Family: %d, DST_ADDR Family %d", wqnp->src_addr.family, + wqnp->dst_addr.family); + return (1); } + bzero(&ip2m, sizeof (ip2m)); - areap = (void *)mp1->b_rptr; - cp = (char *)areap + areap->area_hw_addr_offset; - bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL); - - /* - * at this point we have src/dst gid's derived from the mac addresses - 
* now get the hca, port - */ - bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t)); - bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t)); - freemsg(mp); - - IBCM_H2N_GID(wqnp->sgid); - IBCM_H2N_GID(wqnp->dgid); - - mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, 0); -} - -/* - * Process arp ack's. - */ -void -ibcm_arp_pr_arp_ack(mblk_t *mp) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_ack(0x%p, DB_TYPE %lX)", - mp, DB_TYPE(mp)); - - if (DB_TYPE(mp) == M_DATA) { - ibcm_arp_pr_arp_query_ack(mp); - } else if ((DB_TYPE(mp) == M_IOCACK) || - (DB_TYPE(mp) == M_IOCNAK)) { - ibcm_arp_pr_arp_squery_ack(mp); + if (wqnp->dst_addr.family == AF_INET) { + sin = (sin_t *)&ip2m.ip2mac_pa; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = wqnp->dst_addr.un.ip4addr; + } else if (wqnp->dst_addr.family == AF_INET6) { + sin6 = (sin6_t *)&ip2m.ip2mac_pa; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = wqnp->dst_addr.un.ip6addr; } else { - freemsg(mp); - } -} - -/* - * query the ipv6 driver cache for ipv6 to mac address mapping. - */ -static int -ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) -{ - ip2mac_t ip2m; - sin6_t *sin6; - ip2mac_id_t ip2mid; - int err; - - if (wqnp->src_addr.family != AF_INET6) { - IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_lookup: SRC_ADDR NOT INET6: " - "%d", wqnp->src_addr.family); + IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Invalid DST_ADDR " + "Family: %d", wqnp->dst_addr.family); return (1); } - bzero(&ip2m, sizeof (ip2m)); - sin6 = (sin6_t *)&ip2m.ip2mac_pa; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = wqnp->dst_addr.un.ip6addr; ip2m.ip2mac_ifindex = ill->ill_phyint->phyint_ifindex; wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; + /* - * XXX XTBD set the scopeid? 
* issue the request to IP for Neighbor Discovery */ - ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_ipv6_resolver_ack, wqnp, + ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_resolver_ack, wqnp, zoneid); err = ip2m.ip2mac_err; if (err == EINPROGRESS) { @@ -799,7 +361,7 @@ ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; err = 0; } else if (err == 0) { - ibcm_ipv6_resolver_ack(&ip2m, wqnp); + ibcm_resolver_ack(&ip2m, wqnp); } return (err); } @@ -822,16 +384,16 @@ ibcm_check_sockdl(struct sockaddr_dl *sdl) * If Address resolution was succesful: return GID info. */ static void -ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) +ibcm_resolver_ack(ip2mac_t *ip2macp, void *arg) { ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; ibcm_arp_streams_t *ib_s; uchar_t *cp; int err = 0; - IBTF_DPRINTF_L4(cmlog, "ibcm_ipv6_resolver_ack(%p, %p)", ip2macp, wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_ack(%p, %p)", ip2macp, wqnp); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; + ib_s = wqnp->ib_str; mutex_enter(&ib_s->lock); if (ip2macp->ip2mac_err != 0) { @@ -842,7 +404,7 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) } if (!ibcm_check_sockdl(&ip2macp->ip2mac_ha)) { - IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_resolver_ack: Error: " + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_ack: Error: " "interface %s is not IB\n", wqnp->ifname); err = EHOSTUNREACH; goto user_callback; @@ -862,6 +424,11 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) IBCM_H2N_GID(wqnp->dgid); user_callback: + + ib_s->status = err; + ib_s->done = B_TRUE; + + /* lock is held by the caller. 
*/ + cv_signal(&ib_s->cv); mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, err); } diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 0d342fdd93..88468b353e 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -476,7 +476,7 @@ mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length, endptr = mp->b_wptr; if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) return (B_FALSE); - ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION); + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c index 23f43ced0b..eeb09fcb0b 100644 --- a/usr/src/uts/common/io/softmac/softmac_dev.c +++ b/usr/src/uts/common/io/softmac/softmac_dev.c @@ -146,6 +146,9 @@ static struct modlinkage softmac_modlinkage = { NULL }; +static void softmac_dedicated_rx(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); + /*ARGSUSED*/ static int softmac_upper_constructor(void *buf, void *arg, int kmflag) @@ -367,7 +370,8 @@ softmac_mod_rput(queue_t *rq, mblk_t *mp) if (dlp->dl_primitive == DL_UNITDATA_IND) { if ((rxinfo = slp->sl_rxinfo) != NULL) { - rxinfo->slr_rx(rxinfo->slr_arg, NULL, mp, NULL); + softmac_dedicated_rx(slp->sl_sup, NULL, mp, + NULL); break; } diff --git a/usr/src/uts/common/io/softmac/softmac_fp.c b/usr/src/uts/common/io/softmac/softmac_fp.c index 7a10aa68b7..2fc66e9bd3 100644 --- a/usr/src/uts/common/io/softmac/softmac_fp.c +++ b/usr/src/uts/common/io/softmac/softmac_fp.c @@ -674,9 +674,12 @@ softmac_wput_single_nondata(softmac_upper_t *sup, mblk_t *mp) t_uscalar_t prim; dbtype = DB_TYPE(mp); + sup->su_is_arp = 0; switch (dbtype) { - case M_IOCTL: - case M_CTL: { + case M_CTL: + sup->su_is_arp = 1; + /* FALLTHROUGH */ + case M_IOCTL: { uint32_t expected_mode; if (((struct iocblk 
*)(mp->b_rptr))->ioc_cmd != SIOCSLIFNAME) @@ -1132,7 +1135,10 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin) break; req->ssq_expected_mode = expected_mode; - + if (sup->su_is_arp) { + list_insert_tail(&reqlist, req); + continue; + } /* * Allocate the DL_NOTE_REPLUMB message. */ @@ -1174,18 +1180,19 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin) */ for (sup = list_head(&softmac->smac_sup_list); sup != NULL; sup = list_next(&softmac->smac_sup_list, sup)) { - mp = head->b_next; - head->b_next = NULL; - + if (!sup->su_is_arp) { + mp = head->b_next; + head->b_next = NULL; + softmac_wput_nondata(sup, head); + head = mp; + } /* - * Add the swtich request to the requests list of the stream. + * Add the switch request to the requests list of the stream. */ req = list_head(&reqlist); ASSERT(req != NULL); list_remove(&reqlist, req); list_insert_tail(&sup->su_req_list, req); - softmac_wput_nondata(sup, head); - head = mp; } mutex_exit(&softmac->smac_fp_mutex); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index b23036e9c5..658735b784 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1605,7 +1605,9 @@ pullupmsg(mblk_t *mp, ssize_t len) ASSERT(bp->b_datap->db_ref > 0); ASSERT(bp->b_wptr >= bp->b_rptr); n = MIN(bp->b_wptr - bp->b_rptr, len); - bcopy(bp->b_rptr, mp->b_wptr, (size_t)n); + ASSERT(n >= 0); /* allow zero-length mblk_t's */ + if (n > 0) + bcopy(bp->b_rptr, mp->b_wptr, (size_t)n); mp->b_wptr += n; bp->b_rptr += n; len -= n; diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c index f43648fd7f..473f7bc72e 100644 --- a/usr/src/uts/common/io/strplumb.c +++ b/usr/src/uts/common/io/strplumb.c @@ -53,17 +53,6 @@ #include <sys/esunddi.h> #include <sys/promif.h> -#include <netinet/in.h> -#include <netinet/ip6.h> -#include <netinet/icmp6.h> -#include <netinet/sctp.h> -#include <inet/common.h> -#include <inet/ip.h> 
-#include <inet/ip6.h> -#include <inet/tcp.h> -#include <inet/sctp_ip.h> -#include <inet/udp_impl.h> - #include <sys/strlog.h> #include <sys/log.h> #include <sys/ethernet.h> @@ -222,104 +211,6 @@ strplumb_init(void) return (0); } -static int -strplumb_autopush(void) -{ - major_t maj; - minor_t min; - char *mods[5]; - uint_t anchor = 1; - int err; - - min = (minor_t)-1; - mods[1] = NULL; - - /* - * ARP - */ - DBG0("setting up arp autopush\n"); - - mods[0] = ARP; - - maj = ddi_name_to_major(ARP); - if ((err = kstr_autopush(SET_AUTOPUSH, &maj, &min, NULL, &anchor, - mods)) != 0) { - printf("strplumb: kstr_autopush(SET/ARP) failed: %d\n", err); - return (err); - } - - return (0); -} - -static int -strplumb_sctpq(ldi_ident_t li) -{ - ldi_handle_t lh = NULL; - int err; - int rval; - - DBG0("configuring SCTP default queue\n"); - - if ((err = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, CRED(), &lh, - li)) != 0) { - printf("strplumb: open of SCTP6DEV failed: %d\n", err); - return (err); - } - - if ((err = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL, - CRED(), &rval)) != 0) { - printf("strplumb: failed to set SCTP default queue: %d\n", - err); - (void) ldi_close(lh, FREAD|FWRITE, CRED()); - return (err); - } - - return (0); -} - -static int -strplumb_tcpq(ldi_ident_t li) -{ - ldi_handle_t lh = NULL; - ldi_handle_t ip_lh = NULL; - int err; - int rval; - - DBG0("configuring TCP default queue\n"); - - /* - * We open IP6DEV here because we need to have it open to in - * order to open TCP6DEV successfully. - */ - if ((err = ldi_open_by_name(IP6DEV, FREAD|FWRITE, CRED(), &ip_lh, - li)) != 0) { - printf("strplumb: open of IP6DEV failed: %d\n", err); - return (err); - } - - /* - * We set the tcp default queue to IPv6 because IPv4 falls back to - * IPv6 when it can't find a client, but IPv6 does not fall back to - * IPv4. 
- */ - if ((err = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, CRED(), &lh, - li)) != 0) { - printf("strplumb: open of TCP6DEV failed: %d\n", err); - goto done; - } - - if ((err = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL, - CRED(), &rval)) != 0) { - printf("strplumb: failed to set TCP default queue: %d\n", - err); - goto done; - } - -done: - (void) ldi_close(ip_lh, FREAD|FWRITE, CRED()); - return (err); -} - /* * Can be set in /etc/system in the case of local booting. See comment below. */ @@ -447,11 +338,8 @@ strplumb_dev(ldi_ident_t li) /* * Now set up the links. Ultimately, we should have two streams - * permanently linked underneath UDP (which is actually IP with UDP - * autopushed). One stream consists of the ARP-[ifname] combination, - * while the other consists of ARP-IP-[ifname]. The second combination - * seems a little weird, but is linked underneath UDP just to keep it - * around. + * permanently linked under UDP. One stream consists of the + * ARP-[ifname] combination, while the other consists of IP-[ifname]. 
* * We pin underneath UDP here to match what is done in ifconfig(1m); * otherwise, ifconfig will be unable to unplumb the stream (the major @@ -462,7 +350,7 @@ strplumb_dev(ldi_ident_t li) */ /* - * Plumb UDP-ARP-IP-<dev> + * Plumb UDP-IP-<dev> */ if ((err = ldi_open_by_name(rootfs.bo_devname, FREAD|FWRITE, CRED(), @@ -494,12 +382,6 @@ strplumb_dev(ldi_ident_t li) lifr.lifr_flags &= ~IFF_IPV4; name = UDP6DEV; } - if ((err = ldi_ioctl(lh, I_PUSH, (intptr_t)ARP, FKIOCTL, CRED(), - &rval)) != 0) { - printf("strplumb: push ARP failed: %d\n", err); - goto done; - } - (void) strlcpy(lifr.lifr_name, rootfs.bo_ifname, sizeof (lifr.lifr_name)); lifr.lifr_ppa = rootfs.bo_ppa; @@ -507,29 +389,17 @@ strplumb_dev(ldi_ident_t li) if ((err = setifname(lh, &lifr)) != 0) goto done; - /* Get the flags and check if ARP is needed */ + /* get the flags and check if ARP is needed */ if ((err = getifflags(lh, &lifr)) != 0) { printf("strplumb: getifflags %s IP failed, error %d\n", lifr.lifr_name, err); goto done; } - - /* Pop out ARP if not needed */ - if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6)) { - err = ldi_ioctl(lh, I_POP, (intptr_t)0, FKIOCTL, CRED(), - &rval); - if (err != 0) { - printf("strplumb: pop ARP failed, error %d\n", err); - goto done; - } - } - if ((err = ldi_open_by_name(name, FREAD|FWRITE, CRED(), &mux_lh, li)) != 0) { printf("strplumb: open of %s failed: %d\n", name, err); goto done; } - if ((err = ldi_ioctl(mux_lh, I_PLINK, (intptr_t)lh, FREAD|FWRITE|FNOCTTY|FKIOCTL, CRED(), &(ifr.ifr_ip_muxid))) != 0) { @@ -538,9 +408,9 @@ strplumb_dev(ldi_ident_t li) goto done; } - if (af == AF_INET6) { + /* if ARP is not needed, we are done */ + if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6)) goto done; - } DBG2("UDP-ARP-IP-%s muxid: %d\n", rootfs.bo_ifname, ifr.ifr_ip_muxid); @@ -610,22 +480,9 @@ strplumb(void) if ((err = strplumb_init()) != 0) return (err); - if ((err = strplumb_autopush()) != 0) - return (err); - if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) return (err); 
- /* - * Setup the TCP and SCTP default queues for the global stack. - * tcp/sctp_stack_init will do this for additional stack instances. - */ - if ((err = strplumb_sctpq(li)) != 0) - goto done; - - if ((err = strplumb_tcpq(li)) != 0) - goto done; - if ((err = resolve_boot_path()) != 0) goto done; diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c index 7ddb24cddb..83f8cf6944 100644 --- a/usr/src/uts/common/io/tl.c +++ b/usr/src/uts/common/io/tl.c @@ -452,7 +452,7 @@ opdes_t tl_opt_arr[] = { OA_R, OA_R, OP_NP, - OP_PASSNEXT, + 0, sizeof (t_scalar_t), 0 }, @@ -462,7 +462,7 @@ opdes_t tl_opt_arr[] = { OA_RW, OA_RW, OP_NP, - OP_PASSNEXT, + 0, sizeof (int), 0 } @@ -867,7 +867,7 @@ static void tl_fill_option(uchar_t *, cred_t *, pid_t, int, cred_t *); static int tl_default_opt(queue_t *, int, int, uchar_t *); static int tl_get_opt(queue_t *, int, int, uchar_t *); static int tl_set_opt(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, - uchar_t *, void *, cred_t *, mblk_t *); + uchar_t *, void *, cred_t *); static void tl_memrecover(queue_t *, mblk_t *, size_t); static void tl_freetip(tl_endpt_t *, tl_icon_t *); static void tl_free(tl_endpt_t *); @@ -904,7 +904,6 @@ optdb_obj_t tl_opt_obj = { tl_default_opt, /* TL default value function pointer */ tl_get_opt, /* TL get function pointer */ tl_set_opt, /* TL set function pointer */ - B_TRUE, /* TL is tpi provider */ TL_OPT_ARR_CNT, /* TL option database count of entries */ tl_opt_arr, /* TL option database */ TL_VALID_LEVELS_CNT, /* TL valid level count of entries */ @@ -2789,12 +2788,10 @@ tl_optmgmt(queue_t *wq, mblk_t *mp) * call common option management routine from drv/ip */ if (prim->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(wq, mp, cr, &tl_opt_obj, - B_FALSE); + svr4_optcom_req(wq, mp, cr, &tl_opt_obj); } else { ASSERT(prim->type == T_OPTMGMT_REQ); - (void) tpi_optcom_req(wq, mp, cr, &tl_opt_obj, - B_FALSE); + tpi_optcom_req(wq, mp, cr, &tl_opt_obj); } } @@ -6066,8 +6063,7 @@ 
tl_set_opt( uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, - cred_t *cr, - mblk_t *mblk) + cred_t *cr) { int error; tl_endpt_t *tep; diff --git a/usr/src/uts/common/io/warlock/ibcm.wlcmd b/usr/src/uts/common/io/warlock/ibcm.wlcmd index b4ae04a925..e66149c4fd 100644 --- a/usr/src/uts/common/io/warlock/ibcm.wlcmd +++ b/usr/src/uts/common/io/warlock/ibcm.wlcmd @@ -66,11 +66,7 @@ root ibt_get_src_ip root ibt_ofuvcm_get_req_data root ibt_ofuvcm_proceed -root ibcm_arp_timeout root ibcm_arp_get_srcip_plist -root ibcm_arp_lrput -root ibcm_arp_lwsrv -root ibcm_arp_lrsrv root ibcm_arp_get_ibd_insts_cb # callback entry points from ibmf diff --git a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c index 27eaaba86f..c827fb9e82 100644 --- a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c +++ b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/dlpi.h> @@ -88,8 +86,8 @@ dlcosmk_process(mblk_t **mpp, dlcosmk_data_t *dlcosmk_data, uint32_t ill_index, } if ((ill_index == 0) || - ((ill = ill_lookup_on_ifindex_global_instance(ill_index, B_FALSE, - NULL, NULL, NULL, NULL)) == NULL)) { + ((ill = ill_lookup_on_ifindex_global_instance(ill_index, + B_FALSE)) == NULL)) { dlcosmk2dbg(("dlcosmk_process:invalid ill index %u\n", ill_index)); atomic_add_64(&dlcosmk_data->ipackets, 1); diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c index 4d31da6396..e76c181d92 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c +++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c @@ -445,10 +445,9 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) pkt.direction = callout_pos; /* set packet direction */ /* The ill_index could be 0 when called from forwarding (read) path */ - if (ill_idx > 0) { - ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, - NULL, NULL, NULL, NULL); - } + if (ill_idx > 0) + ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE); + if (ill != NULL) { /* * Since all IPP actions in an IPMP group are performed diff --git a/usr/src/uts/common/ktli/t_kutil.c b/usr/src/uts/common/ktli/t_kutil.c index cfd153d873..ab762403fd 100644 --- a/usr/src/uts/common/ktli/t_kutil.c +++ b/usr/src/uts/common/ktli/t_kutil.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,8 +36,6 @@ * contributors. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Contains the following utility functions: * tli_send: @@ -230,7 +228,7 @@ t_kadvise(TIUSER *tiptr, uchar_t *addr, int addr_len) bzero(ipid, sizeof (*ipid)); ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; - ipid->ipid_ire_type = IRE_CACHE; + ipid->ipid_ire_type = 0; ipid->ipid_addr_offset = sizeof (ipid_t); ipid->ipid_addr_length = addr_len; diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h index 3e4307f25e..9c004b74b1 100644 --- a/usr/src/uts/common/net/route.h +++ b/usr/src/uts/common/net/route.h @@ -130,7 +130,8 @@ struct rtentry { #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ #define RTF_MULTIRT 0x10000 /* multiroute */ #define RTF_SETSRC 0x20000 /* set default outgoing src address */ - +#define RTF_INDIRECT 0x40000 /* gateway not directly reachable */ +#define RTF_KERNEL 0x80000 /* created by kernel; can't delete */ /* * OLD statistics not used by the kernel. The kernel uses <inet/mib2.h>. diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index fc2c750ba7..c1166fc34f 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -888,6 +888,7 @@ struct sockaddr_in6 { */ #define IP_PKTINFO 0x1a /* specify src address and/or index */ #define IP_RECVPKTINFO 0x1a /* recv dest/matched addr and index */ +#define IP_DONTFRAG 0x1b /* don't fragment packets */ #if !defined(_XPG4_2) || defined(__EXTENSIONS__) /* diff --git a/usr/src/uts/common/netinet/ip_mroute.h b/usr/src/uts/common/netinet/ip_mroute.h index 8a658a0fca..b1dde41b1f 100644 --- a/usr/src/uts/common/netinet/ip_mroute.h +++ b/usr/src/uts/common/netinet/ip_mroute.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,17 +18,16 @@ * * CDDL HEADER END */ + /* - * Copyright 1991, 1997-1999, 2001, 2003 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ #ifndef _NETINET_IP_MROUTE_H #define _NETINET_IP_MROUTE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -188,6 +186,7 @@ struct vif { uint_t v_refcnt; uchar_t v_marks; kmutex_t v_lock; + ilm_t *v_ilm; /* allmulti join */ }; /* diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 722c793b79..1fa1c9425b 100644 --- a/usr/src/uts/common/os/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -93,9 +93,6 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum) #endif ASSERT(dp); - TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START, - "ip_cksum_start:%p (%X)", mp, sum); - if (mp->b_cont == NULL) { /* * May be fast-path, only one mblk. 
@@ -277,9 +274,6 @@ slow1: mlen = mp->b_wptr - (uchar_t *)w; } - TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START, - "ip_cksum_start:%p (%X)", mp, sum) - mp = mp->b_cont; if (mlen > 0 && pmlen == -1) { /* diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 76ce1af025..22bdc86e03 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -8474,9 +8474,7 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { if (flags != NULL) { - *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM | - HCK_PARTIALCKSUM | HCK_FULLCKSUM | - HCK_FULLCKSUM_OK); + *flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS; if ((*flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) { if (value != NULL) diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 9542a15a8e..ed80269fbc 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -395,7 +395,8 @@ typedef struct dld_capab_poll_s { /* * Currently supported flags for LSO. 
*/ -#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */ +#define DLD_LSO_BASIC_TCP_IPV4 0x01 /* TCP LSO over IPv4 capability */ +#define DLD_LSO_BASIC_TCP_IPV6 0x02 /* TCP LSO over IPv6 capability */ typedef struct dld_capab_lso_s { uint_t lso_flags; /* capability flags */ diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 8b0681e2d8..6b3a5801d7 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -593,10 +593,6 @@ union DL_qos_types { /* dl_data is dl_capab_id_t */ #define DL_CAPAB_HCKSUM 0x01 /* Checksum offload */ /* dl_data is dl_capab_hcksum_t */ -#define DL_CAPAB_IPSEC_AH 0x02 /* IPsec AH acceleration */ - /* dl_data is dl_capab_ipsec_t */ -#define DL_CAPAB_IPSEC_ESP 0x03 /* IPsec ESP acceleration */ - /* dl_data is dl_capab_ipsec_t */ #define DL_CAPAB_MDT 0x04 /* Multidata Transmit capability */ /* dl_data is dl_capab_mdt_t */ #define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */ @@ -611,45 +607,8 @@ typedef struct { } dl_capability_sub_t; /* - * Definitions and structures needed for DL_CONTROL_REQ and DL_CONTROL_ACK - * primitives. - * Extensible message to send down control information to the DLS provider. - * The response is a DL_CONTROL_ACK or DL_ERROR_ACK. - * - * Different types of control operations will define different format for the - * key and data fields. ADD requires key and data fields; if the <type, key> - * matches an already existing entry a DL_ERROR_ACK will be returned. DELETE - * requires a key field; if the <type, key> does not exist, a DL_ERROR_ACK - * will be returned. FLUSH requires neither a key nor data; it - * unconditionally removes all entries for the specified type. GET requires a - * key field; the get operation returns the data for the <type, key>. If - * <type, key> doesn't exist a DL_ERROR_ACK is returned. UPDATE requires key - * and data fields; if <type, key> doesn't exist a DL_ERROR_ACK is returned. 
- */ - -/* - * Control operations - */ -#define DL_CO_ADD 0x01 /* Add new entry matching for <type,key> */ -#define DL_CO_DELETE 0x02 /* Delete the entry matching <type,key> */ -#define DL_CO_FLUSH 0x03 /* Purge all entries of <type> */ -#define DL_CO_GET 0x04 /* Get the data for the <type,key> */ -#define DL_CO_UPDATE 0x05 /* Update the data for <type,key> */ -#define DL_CO_SET 0x06 /* Add or update as appropriate */ - -/* - * Control types (dl_type field of dl_control_req_t and dl_control_ack_t) - */ -#define DL_CT_IPSEC_AH 0x01 /* AH; key=spi,dest_addr; */ - /* data=keying material */ -#define DL_CT_IPSEC_ESP 0x02 /* ESP; key=spi,des_taddr; */ - /* data=keying material */ - -/* * Module ID token to be included in new sub-capability structures. - * Existing sub-capabilities lacking an identification token, e.g. IPSEC - * hardware acceleration, need to be encapsulated within the ID sub- - * capability. Access to this structure must be done through + * Access to this structure must be done through * dlcapab{set,check}qid(). 
*/ typedef struct { diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h index c307ed7575..e0b7e1e1e7 100644 --- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h +++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h @@ -31,24 +31,11 @@ extern "C" { #endif #include <sys/ib/mgt/ibcm/ibcm_impl.h> -#include <sys/modhash.h> #include <sys/ib/clients/ibd/ibd.h> -#include <sys/strsun.h> -#include <sys/socket.h> -#include <sys/stat.h> /* for S_IFCHR */ #include <inet/ip2mac.h> #include <inet/ip6.h> -/* - * IPoIB addr lookup completion function - */ -typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status); - #define IBCM_ARP_MAX_IFNAME_LEN 24 -#define IBCM_ARP_XMIT_COUNT 6 -#define IBCM_ARP_XMIT_INTERVAL 1000 /* timeout in milliseconds */ -#define IBCM_ARP_TIMEOUT \ - ((IBCM_ARP_XMIT_COUNT + 1) * IBCM_ARP_XMIT_INTERVAL) #define IBCM_H2N_GID(gid) \ { \ @@ -68,9 +55,7 @@ typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status); * Path record wait queue node definition */ typedef struct ibcm_arp_prwqn { - ibcm_arp_pr_comp_func_t func; /* user callback function */ - void *arg; /* callback function arg */ - timeout_id_t timeout_id; + struct ibcm_arp_streams_s *ib_str; uint8_t flags; ibt_ip_addr_t usrc_addr; /* user supplied src address */ ibt_ip_addr_t dst_addr; /* user supplied dest address */ @@ -89,15 +74,11 @@ typedef struct ibcm_arp_prwqn { typedef struct ibcm_arp_streams_s { kmutex_t lock; kcondvar_t cv; - queue_t *arpqueue; - vnode_t *arp_vp; int status; boolean_t done; ibcm_arp_prwqn_t *wqnp; } ibcm_arp_streams_t; -/* GID to IP-Addr and Ip-Addr to GID look-up functions. 
*/ - #define IBCM_ARP_IBD_INSTANCES 4 typedef struct ibcm_arp_ip_s { diff --git a/usr/src/uts/common/sys/iphada.h b/usr/src/uts/common/sys/iphada.h deleted file mode 100644 index 9d1a6e28e8..0000000000 --- a/usr/src/uts/common/sys/iphada.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_IPHADA_H -#define _SYS_IPHADA_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#define DA_ICV_MAX_LEN 128 /* max ICV length [bytes] */ - -/* - * iphada.h header for IP Hardware Acceleration Data Attributes - * - * This is a contract private interface for use by the Sun - * Hardware Accelerated Ethernet driver ONLY. 
- */ -typedef struct da_ipsec { - int da_type; /* M_CTL message ident */ - int da_flag; - uint32_t da_icv_len; /* da_icv length in bytes */ - uchar_t da_icv[DA_ICV_MAX_LEN]; /* ICV for AH or ESP+auth */ -} da_ipsec_t; - -#define IPHADA_M_CTL 0xA1D53DE5u - -/* - * IPSec algorithms capabilities (cip_data in dl_capab_ipsec_t) - */ -typedef struct { - t_uscalar_t alg_type; - t_uscalar_t alg_prim; /* algorithm primitive */ - t_uscalar_t alg_thruput; /* approx throughput metric in Mb/s */ - t_uscalar_t alg_flag; /* flags */ - t_uscalar_t alg_minbits; /* minimum key len in bits */ - t_uscalar_t alg_maxbits; /* maximum key len in bits */ - t_uscalar_t alg_incrbits; /* key len increment in bits */ -} dl_capab_ipsec_alg_t; - -/* - * IPSec sub-capability (follows dl_capability_sub_t) - */ -typedef struct { - t_uscalar_t cip_version; /* interface version */ - t_uscalar_t cip_nciphers; /* number ciphers supported */ - dl_capab_ipsec_alg_t cip_data[1]; /* data */ -} dl_capab_ipsec_t; - -/* - * Algorithm types (alg_type field of dl_capab_ipsec_alg_t) - */ -#define DL_CAPAB_IPSEC_ALG_AUTH 0x01 /* authentication alg. */ -#define DL_CAPAB_IPSEC_ALG_ENCR 0x02 /* encryption alg. */ - -/* alg_prim ciphers */ -#define DL_CAPAB_IPSEC_ENCR_DES 0x02 -#define DL_CAPAB_IPSEC_ENCR_3DES 0x03 -#define DL_CAPAB_IPSEC_ENCR_BLOWFISH 0x07 -#define DL_CAPAB_IPSEC_ENCR_NULL 0x0b /* no encryption */ -#define DL_CAPAB_IPSEC_ENCR_AES 0x0c - -/* alg_prim authentications */ -#define DL_CAPAB_IPSEC_AUTH_NONE 0x00 /* no authentication */ -#define DL_CAPAB_IPSEC_AUTH_MD5HMAC 0x02 -#define DL_CAPAB_IPSEC_AUTH_SHA1HMAC 0x03 - -/* alg_flag values */ -#define DL_CAPAB_ALG_ENABLE 0x01 /* enable this algorithm */ - -/* - * For DL_CT_IPSEC_AH and DL_CT_IPSEC_ESP, the optional dl_key data - * that follows the dl_control_req_t or dl_control_ack_t will be the IPsec - * SPI (Security Parameters Index) value and the destination address. - * This is defined as being unique per protocol. 
- */ - -#define DL_CTL_IPSEC_ADDR_LEN 16 /* IP addr length in bytes */ - -typedef struct dl_ct_ipsec_key { - uint32_t dl_key_spi; /* Security Parameters Index value */ - uchar_t dl_key_dest_addr[DL_CTL_IPSEC_ADDR_LEN]; /* dest IP address */ - uint32_t dl_key_addr_family; /* family of dest IP address */ - /* (AF_INET or AF_INET6) */ -} dl_ct_ipsec_key_t; - -#define DL_CT_IPSEC_MAX_KEY_LEN 512 /* max key length in bytes */ - -/* - * Possible flags for sadb_sa_flags. - */ -#define DL_CT_IPSEC_INBOUND 0x01 /* SA can be used for inbound pkts */ -#define DL_CT_IPSEC_OUTBOUND 0x02 /* SA can be used for outbound pkts */ - -/* - * minimal SADB entry content - * fields are defined as per RFC 2367 and <net/pfkeyv2.h> - * This defines the content and format of the dl_data portion of - * the dl_control_req_t or dl_control_ack_t. - */ -typedef struct dl_ct_ipsec { - uint8_t sadb_sa_auth; /* Authentication algorithm */ - uint8_t sadb_sa_encrypt; /* Encryption algorithm */ - uint32_t sadb_sa_flags; /* SA flags. */ - uint16_t sadb_key_len_a; /* auth key length in bytes */ - uint16_t sadb_key_bits_a; /* auth key length in bits */ - uint16_t sadb_key_data_a[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */ - uint16_t sadb_key_len_e; /* encr key length in bytes */ - uint16_t sadb_key_bits_e; /* encr key length in bits */ - uint16_t sadb_key_data_e[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */ -} dl_ct_ipsec_t; - - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_IPHADA_H */ diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index cac046d675..f3b8397681 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_PATTR_H #define _SYS_PATTR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -92,6 +90,9 @@ typedef struct pattr_hcksum_s { /* check the attached h/w computed */ /* checksum value to determine if */ /* checksum was bad */ + +#define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ + HCK_FULLCKSUM | HCK_FULLCKSUM_OK) /* * Extended hardware offloading flags that also use hcksum_flags */ diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h index eb71063bc7..bd94d4982e 100644 --- a/usr/src/uts/common/sys/softmac_impl.h +++ b/usr/src/uts/common/sys/softmac_impl.h @@ -301,7 +301,9 @@ typedef struct softmac_upper_s { uint32_t su_bound : 1, /* SL */ su_active : 1, /* SL */ - su_direct : 1; /* SL */ + su_direct : 1, /* SL */ + su_is_arp : 1, + su_pad_to_32:28; /* * Used for fastpath data path. diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index a2d808f647..de0f18bd4d 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -44,21 +44,19 @@ typedef struct squeue_s squeue_t; (mp)->b_prev = (mblk_t *)(arg); \ } -#define GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp - #define SQ_FILL 0x0001 #define SQ_NODRAIN 0x0002 #define SQ_PROCESS 0x0004 -#define SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) { \ - sqp->sq_enter(sqp, head, tail, cnt, flag, tag); \ +#define SQUEUE_ENTER(sqp, head, tail, cnt, ira, flag, tag) { \ + sqp->sq_enter(sqp, head, tail, cnt, ira, flag, tag); \ } -#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) { \ +#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, ira, flag, tag) { \ ASSERT(mp->b_next == NULL); \ ASSERT(mp->b_prev == NULL); \ SET_SQUEUE(mp, proc, arg); \ - SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag); \ + SQUEUE_ENTER(sqp, mp, mp, 1, ira, flag, tag); \ } /* @@ -77,12 +75,13 @@ typedef enum { SQPRIVATE_MAX } sqprivate_t; +struct ip_recv_attr_s; extern void squeue_init(void); extern squeue_t 
*squeue_create(clock_t, pri_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, - uint32_t, int, uint8_t); + uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); struct conn_s; diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index bd934cc0b3..22550886eb 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -79,9 +79,9 @@ typedef struct squeue_set_s { processorid_t sqs_cpuid; } squeue_set_t; -typedef void (*sqproc_t)(void *, mblk_t *, void *); +typedef void (*sqproc_t)(void *, mblk_t *, void *, struct ip_recv_attr_s *); typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t, - int, uint8_t); + struct ip_recv_attr_s *, int, uint8_t); typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t); extern void squeue_worker_wakeup(squeue_t *); diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index b9c96a8345..7a3b4e3448 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -404,9 +404,6 @@ typedef struct bcache { #define STRUIO_IP 0x04 /* IP checksum stored in db_struioun */ #define STRUIO_ZC 0x08 /* mblk eligible for zero-copy */ #define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */ -#define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */ -#define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */ -#define STRUIO_CONNECT 0x80 /* conn did a connect */ /* * Message flags. These are interpreted by the stream head. 
@@ -418,8 +415,7 @@ typedef struct bcache { /* UNUSED 0x08 was MSGNOGET (can be recycled) */ #define MSGMARKNEXT 0x10 /* Private: first byte of next msg marked */ #define MSGNOTMARKNEXT 0x20 /* Private: ... not marked */ -#define MSGHASREF 0x40 /* Private: message has reference to owner */ -#define MSGWAITSYNC 0x80 /* Private: waiting for sync squeue enter */ +#define MSGWAITSYNC 0x40 /* Private: waiting for sync squeue enter */ /* * Streams message types. diff --git a/usr/src/uts/common/sys/tsol/tnet.h b/usr/src/uts/common/sys/tsol/tnet.h index 221f4c775a..0da65ae5ca 100644 --- a/usr/src/uts/common/sys/tsol/tnet.h +++ b/usr/src/uts/common/sys/tsol/tnet.h @@ -46,35 +46,30 @@ extern "C" { extern int tsol_tnrh_chk(tsol_tpent_t *, bslabel_t *, int); extern tsol_tnrhc_t *find_rhc(const void *, uchar_t, boolean_t); -extern int tsol_check_dest(const cred_t *, const void *, uchar_t, uint_t, - cred_t **); -extern int tsol_compute_label(const cred_t *, ipaddr_t, uchar_t *, - ip_stack_t *); -extern int tsol_compute_label_v6(const cred_t *, const in6_addr_t *, uchar_t *, - ip_stack_t *); -extern int tsol_check_label(const cred_t *, mblk_t **, uint_t, - ip_stack_t *, pid_t); -extern int tsol_check_label_v6(const cred_t *, mblk_t **, uint_t, - ip_stack_t *, pid_t); +extern int tsol_check_dest(const ts_label_t *, const void *, uchar_t, + uint_t, boolean_t, ts_label_t **); +extern int tsol_compute_label_v4(const ts_label_t *, zoneid_t, ipaddr_t, + uchar_t *, ip_stack_t *); +extern int tsol_compute_label_v6(const ts_label_t *, zoneid_t, + const in6_addr_t *, uchar_t *, ip_stack_t *); +extern int tsol_check_label_v4(const ts_label_t *, zoneid_t, mblk_t **, + uint_t, boolean_t, ip_stack_t *, ts_label_t **); +extern int tsol_check_label_v6(const ts_label_t *, zoneid_t, mblk_t **, + uint_t, boolean_t, ip_stack_t *, ts_label_t **); extern int tsol_prepend_option(uchar_t *, ipha_t *, int); extern int tsol_prepend_option_v6(uchar_t *, ip6_t *, int); extern int tsol_remove_secopt(ipha_t 
*, int); extern int tsol_remove_secopt_v6(ip6_t *, int); -extern int tsol_update_sticky(ip6_pkt_t *, uint_t *, const uchar_t *); -extern int tsol_update_options(uchar_t **, uint_t *, uint_t *, - const uchar_t *); -extern boolean_t tsol_option_set(uchar_t **, uint_t *, uint_t, const uchar_t *, - uint_t); extern tsol_ire_gw_secattr_t *ire_gw_secattr_alloc(int); extern void ire_gw_secattr_free(tsol_ire_gw_secattr_t *); -extern boolean_t tsol_can_reply_error(const mblk_t *); +extern boolean_t tsol_can_reply_error(const mblk_t *, ip_recv_attr_t *); extern boolean_t tsol_receive_local(const mblk_t *, const void *, uchar_t, - boolean_t, const conn_t *); -extern boolean_t tsol_can_accept_raw(mblk_t *, boolean_t); -extern boolean_t tsol_get_pkt_label(mblk_t *, int); -extern zoneid_t tsol_packet_to_zoneid(const mblk_t *); + ip_recv_attr_t *, const conn_t *); +extern boolean_t tsol_can_accept_raw(mblk_t *, ip_recv_attr_t *, boolean_t); +extern boolean_t tsol_get_pkt_label(mblk_t *, int, ip_recv_attr_t *); +extern zoneid_t tsol_attr_to_zoneid(const ip_recv_attr_t *); extern boolean_t tsol_get_option_v4(mblk_t *, tsol_ip_label_t *, uint8_t **); extern boolean_t tsol_get_option_v6(mblk_t *, tsol_ip_label_t *, uint8_t **); @@ -83,8 +78,8 @@ extern boolean_t tsol_find_secopt_v6(const uchar_t *, uint_t, uchar_t **, extern int tsol_ire_match_gwattr(ire_t *, const ts_label_t *); extern int tsol_rtsa_init(rt_msghdr_t *, tsol_rtsecattr_t *, caddr_t); -extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *, tsol_gcgrp_t *); -extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *); +extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *); +extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *, const ip_recv_attr_t *); extern uint32_t tsol_pmtu_adjust(mblk_t *, uint32_t, int, int); extern mlp_type_t tsol_mlp_addr_type(zoneid_t, uchar_t, const void *, diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index f1ceb0257e..6b20559ef4 100644 --- 
a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -371,7 +371,6 @@ DRV_KMODS += pppt DRV_KMODS += ncall nsctl sdbc nskern sv DRV_KMODS += ii rdc rdcsrv rdcstub DRV_KMODS += iptun -DRV_KMODS += iptunq # # Don't build some of these for OpenSolaris, since they will be diff --git a/usr/src/uts/intel/arp/Makefile b/usr/src/uts/intel/arp/Makefile index aff11806da..9b91950434 100644 --- a/usr/src/uts/intel/arp/Makefile +++ b/usr/src/uts/intel/arp/Makefile @@ -21,11 +21,9 @@ # # uts/intel/arp/Makefile # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the arp driver kernel module. # # intel implementation architecture dependent @@ -68,7 +66,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) # # depends on ip # -LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti +LDFLAGS += -dy -Ndrv/ip # # For now, disable these lint checks; maintainers should endeavor diff --git a/usr/src/uts/intel/arp/arp.global-objs.debug64 b/usr/src/uts/intel/arp/arp.global-objs.debug64 index 7f826ea213..f936276753 100644 --- a/usr/src/uts/intel/arp/arp.global-objs.debug64 +++ b/usr/src/uts/intel/arp/arp.global-objs.debug64 @@ -23,15 +23,6 @@ # Use is subject to license terms. 
# -ar_cmd_tbl -ar_m_tbl -arp_mod_info -arp_no_defense -arpinfo -arprinit -arpwinit -arp_param_arr -arp_netinfo cb_inet_devops fsw inet_dev_info diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index 6cd415a78f..3837728d4c 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -509,7 +509,6 @@ fcnname/**/_info: \ MODULE(ipsecah,drv); WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero); WSTUB(ipsecah, sadb_acquire, nomod_zero); - WSTUB(ipsecah, sadb_ill_download, nomod_zero); WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero); WSTUB(ipsecah, sadb_alg_update, nomod_zero); WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero); @@ -1294,8 +1293,6 @@ fcnname/**/_info: \ STUB(iptun, iptun_create, nomod_einval); STUB(iptun, iptun_delete, nomod_einval); STUB(iptun, iptun_set_policy, nomod_void) ; - STUB(iptun, iptun_set_g_q, nomod_einval); - STUB(iptun, iptun_clear_g_q, nomod_void); END_MODULE(iptun); #endif diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 6009f5b006..07e9aaedde 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -23,19 +23,24 @@ # Use is subject to license terms. 
# +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,74 +103,45 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor ipcl_conn_hash_size -ipcl_debug_level ipcl_iptun_fanout_size ipcl_raw_fanout_size 
ipcl_udp_fanout_size @@ -174,24 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -ipsechw_debug -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -199,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -230,6 +192,8 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache @@ -238,6 +202,7 @@ rawip_conn_cache recvq_call recvq_loop_cnt req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -260,25 +225,23 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit -sendq_collision -sendq_empty -sendq_loop_cnt sin6_null sin_null skip_sctp_cksum -sock_tcp_downcalls -sock_rts_downcalls sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls sock_udp_downcalls sqset_global_list sqset_global_size @@ -300,12 +263,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj 
+tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -321,13 +282,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -352,4 +311,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 1706a82aa7..526e907ab5 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -23,19 +23,24 @@ # Use is subject to license terms. # +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,69 +103,41 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus 
ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor @@ -173,23 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -197,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -228,12 +192,15 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache radix_node_cache rawip_conn_cache req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -256,21 +223,22 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size 
sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit sin6_null sin_null -sock_tcp_downcalls -sock_rts_downcalls sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls sock_udp_downcalls sqset_global_list sqset_global_size @@ -292,12 +260,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -313,13 +279,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -344,4 +308,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 7aa463978d..873557cbd6 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -205,7 +205,7 @@ DRV_KMODS += aggr arp audio bl bofi clone cn conskbd consms cpuid DRV_KMODS += crypto cryptoadm devinfo dump DRV_KMODS += dtrace fasttrap fbt lockstat profile sdt systrace dcpc DRV_KMODS += fssnap icmp icmp6 ip ip6 ipnet ipsecah -DRV_KMODS += ipsecesp iptun iptunq iwscn keysock kmdb kstat ksyms llc1 +DRV_KMODS += ipsecesp iptun iwscn keysock kmdb kstat ksyms llc1 DRV_KMODS += lofi DRV_KMODS += log logindmux kssl mm nca physmem pm poll pool DRV_KMODS += pseudo ptc ptm pts ptsl ramdisk random rsm rts sad diff --git a/usr/src/uts/sparc/arp/Makefile b/usr/src/uts/sparc/arp/Makefile index 21c26c762e..6d1610da66 100644 --- a/usr/src/uts/sparc/arp/Makefile +++ b/usr/src/uts/sparc/arp/Makefile @@ -20,11 +20,9 @@ # # # uts/sparc/arp/Makefile -# Copyright 2008 Sun Microsystems, 
Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the arp driver kernel module. # # sparc architecture dependent @@ -72,7 +70,7 @@ CFLAGS += $(CCVERBOSE) # # depends on ip # -LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti +LDFLAGS += -dy -Ndrv/ip # # For now, disable these lint checks; maintainers should endeavor diff --git a/usr/src/uts/sparc/arp/arp.global-objs.debug64 b/usr/src/uts/sparc/arp/arp.global-objs.debug64 index 7f826ea213..f936276753 100644 --- a/usr/src/uts/sparc/arp/arp.global-objs.debug64 +++ b/usr/src/uts/sparc/arp/arp.global-objs.debug64 @@ -23,15 +23,6 @@ # Use is subject to license terms. # -ar_cmd_tbl -ar_m_tbl -arp_mod_info -arp_no_defense -arpinfo -arprinit -arpwinit -arp_param_arr -arp_netinfo cb_inet_devops fsw inet_dev_info diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 8df87d813d..07e9aaedde 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -23,19 +23,24 @@ # Use is subject to license terms. 
# +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,74 +103,45 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor ipcl_conn_hash_size -ipcl_debug_level ipcl_iptun_fanout_size ipcl_raw_fanout_size 
ipcl_udp_fanout_size @@ -174,24 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -ipsechw_debug -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -199,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -230,6 +192,8 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache @@ -238,6 +202,7 @@ rawip_conn_cache recvq_call recvq_loop_cnt req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -260,19 +225,17 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit -sendq_collision -sendq_empty -sendq_loop_cnt sin6_null sin_null skip_sctp_cksum @@ -300,12 +263,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -321,13 +282,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput 
tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -352,4 +311,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 3df973b8f9..526e907ab5 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -23,19 +23,24 @@ # Use is subject to license terms. # +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,69 +103,41 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa 
ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor @@ -173,23 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -197,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -228,12 +192,15 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache radix_node_cache rawip_conn_cache req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -256,12 +223,13 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit @@ -292,12 +260,10 @@ 
tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -313,13 +279,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -344,4 +308,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index 18eba0bdfa..24058b72e4 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -397,7 +397,6 @@ stubs_base: MODULE(ipsecah,drv); WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero); WSTUB(ipsecah, sadb_acquire, nomod_zero); - WSTUB(ipsecah, sadb_ill_download, nomod_zero); WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero); WSTUB(ipsecah, sadb_alg_update, nomod_zero); WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero); @@ -1218,8 +1217,6 @@ stubs_base: STUB(iptun, iptun_create, nomod_einval); STUB(iptun, iptun_delete, nomod_einval); STUB(iptun, iptun_set_policy, nomod_einval); - STUB(iptun, iptun_set_g_q, nomod_einval); - STUB(iptun, iptun_clear_g_q, nomod_void); END_MODULE(iptun); #endif |