Diffstat (limited to 'usr/src/uts/common/inet')
50 files changed, 9123 insertions, 11880 deletions
diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index 0bca52e9ae..4351c91666 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -28,6 +28,7 @@ #define _INET_ARP_H #include <sys/types.h> +#include <net/if.h> #ifdef __cplusplus extern "C" { @@ -64,6 +65,8 @@ extern "C" { */ #define AR_ARP_CLOSING (AR_IOCTL + 16) #define AR_ARP_EXTEND (AR_IOCTL + 17) +#define AR_IPMP_ACTIVATE (AR_IOCTL + 18) +#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19) /* Both ace_flags and area_flags; must also modify arp.c in mdb */ #define ACE_F_PERMANENT 0x0001 @@ -182,6 +185,14 @@ typedef struct ar_mapping_add_s { /* the mask&proto_addr */ } arma_t; +/* Structure used to notify ARP of changes to IPMP group topology */ +typedef struct ar_ipmp_event_s { + uint32_t arie_cmd; + uint32_t arie_name_offset; + uint32_t arie_name_length; + char arie_grifname[LIFNAMSIZ]; +} arie_t; + /* Structure used to notify clients of interesting conditions. */ typedef struct ar_client_notify_s { uint32_t arcn_cmd; diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c index 815dfd19d3..06c499ced9 100644 --- a/usr/src/uts/common/inet/arp/arp.c +++ b/usr/src/uts/common/inet/arp/arp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -85,6 +85,30 @@ * talking to a given peer, then it doesn't matter if we have the right mapping * for that peer. It would be possible to send queries on aging entries that * are active, but this isn't done. + * + * IPMP Notes + * ---------- + * + * ARP is aware of IPMP. In particular, IP notifies ARP about all "active" + * (able to transmit data packets) interfaces in a given group via + * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined + * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver, + * enable ARP to track all the arl_t's that are in the same group and thus + * ensure that ACEs are shared across each group and the arl_t that ARP + * chooses to transmit on for a given ACE is optimal. + * + * ARP relies on IP for hardware address updates. In particular, if the + * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will + * bring the interface down and back up -- and as part of bringing it back + * up, will send messages to ARP that allow it to update the affected arl's + * with new hardware addresses. + * + * N.B.: One side-effect of this approach is that when an interface fails and + * then starts to repair, it will temporarily populate the ARP cache with + * addresses that are owned by it rather than the group's arl_t. To address + * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE), + * but as the issue appears to be only cosmetic (redundant entries in the ARP + * cache during interace repair), we've kept things simple for now. */ /* @@ -134,6 +158,12 @@ typedef struct { #define ARH_FIXED_LEN 8 /* + * Macro used when creating ACEs to determine the arl that should own it. + */ +#define OWNING_ARL(arl) \ + ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl) + +/* * MAC-specific intelligence. 
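The arie_t added to arp.h above is the payload for the new AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE commands that IP sends down the ARP stream when an underlying interface becomes able (or unable) to transmit for its group. A minimal sketch of how such a message could be built follows; the helper name and the M_PROTO construction are assumptions (the real sender lives in IP and is not part of these hunks), but the first three fields mirror the generic name-offset convention that the other AR_* commands use, which is what lets ar_ll_lookup_from_mp() find the named arl.

/*
 * Illustrative only: build an AR_IPMP_ACTIVATE/AR_IPMP_DEACTIVATE message
 * for the underlying interface `ifname' in group `grifname'.  The helper
 * name and construction are assumptions, not code from this change.
 */
static mblk_t *
arie_msg_alloc(uint32_t cmd, const char *ifname, const char *grifname)
{
	mblk_t	*mp;
	arie_t	*arie;
	size_t	namelen = strlen(ifname) + 1;

	if ((mp = allocb(sizeof (arie_t) + namelen, BPRI_MED)) == NULL)
		return (NULL);

	mp->b_datap->db_type = M_PROTO;
	mp->b_wptr = mp->b_rptr + sizeof (arie_t) + namelen;

	arie = (arie_t *)mp->b_rptr;
	arie->arie_cmd = cmd;
	arie->arie_name_offset = sizeof (arie_t);	/* name follows */
	arie->arie_name_length = (uint32_t)namelen;
	(void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
	bcopy(ifname, mp->b_rptr + sizeof (arie_t), namelen);

	return (mp);
}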
Shouldn't be needed, but the DL_INFO_ACK * doesn't quite do it for us. */ @@ -154,7 +184,7 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, uint32_t hw_addr_len, uchar_t *proto_addr, uint32_t proto_addr_len, uchar_t *proto_mask, uchar_t *proto_extract_mask, uint32_t hw_extract_start, - uint32_t flags); + uchar_t *sender_addr, uint32_t flags); static void ar_ce_delete(ace_t *ace); static void ar_ce_delete_per_arl(ace_t *ace, void *arg); static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto, @@ -167,6 +197,8 @@ static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn()); static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, uint32_t proto_addr_length); +static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, + uchar_t *proto_addr, uint32_t proto_addr_length); static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length); static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), @@ -187,6 +219,8 @@ static int ar_interface_up(queue_t *q, mblk_t *mp); static int ar_interface_down(queue_t *q, mblk_t *mp); static int ar_interface_on(queue_t *q, mblk_t *mp); static int ar_interface_off(queue_t *q, mblk_t *mp); +static int ar_ipmp_activate(queue_t *q, mblk_t *mp); +static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp); static void ar_ll_cleanup_arl_queue(queue_t *q); static void ar_ll_down(arl_t *arl); static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name); @@ -208,7 +242,7 @@ static int ar_param_set(queue_t *q, mblk_t *mp, char *value, static void ar_query_delete(ace_t *ace, void *ar); static void ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len); -static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace); +static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace); static void ar_rput(queue_t *q, mblk_t *mp_orig); static void ar_rput_dlpi(queue_t *q, mblk_t *mp); static void ar_set_address(ace_t *ace, uchar_t *addrpos, @@ -344,6 +378,10 @@ static arct_t ar_cmd_tbl[] = { ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" }, { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t), ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" }, + { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" }, + { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t), + ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" }, { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int), ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" }, { ar_nd_ioctl, ND_GET, 1, @@ -358,6 +396,65 @@ static arct_t ar_cmd_tbl[] = { }; /* + * Lookup and return an arl appropriate for sending packets with either source + * hardware address `hw_addr' or source protocol address `ip_addr', in that + * order. If neither was specified or neither match, return any arl in the + * same group as `arl'. + */ +static arl_t * +ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen, + uchar_t *ip_addr) +{ + arlphy_t *ap; + ace_t *src_ace; + arl_t *xmit_arl = NULL; + arp_stack_t *as = ARL_TO_ARPSTACK(arl); + + ASSERT(arl->arl_flags & ARL_F_IPMP); + + if (hw_addr != NULL && hw_addrlen != 0) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) { + /* + * There may be arls with the same HW address that are + * not in our IPMP group; we don't want those. 
+ */ + if (xmit_arl->arl_ipmp_arl != arl) + continue; + + ap = xmit_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen && + bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0) + break; + } + + DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen); + } + + if (xmit_arl == NULL && ip_addr != NULL) { + src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr, + IP_ADDR_LEN); + if (src_ace != NULL) + xmit_arl = src_ace->ace_xmit_arl; + + DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *, + xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN); + } + + if (xmit_arl == NULL) { + xmit_arl = as->as_arl_head; + for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) + if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl) + break; + + DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl); + } + + return (xmit_arl); +} + +/* * ARP Cache Entry creation routine. * Cache entries are allocated within timer messages and inserted into * the global hash list based on protocol and protocol address. @@ -365,7 +462,8 @@ static arct_t ar_cmd_tbl[] = { static int ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint_t hw_extract_start, uint_t flags) + uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr, + uint_t flags) { static ace_t ace_null; ace_t *ace; @@ -373,17 +471,35 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, uchar_t *dst; mblk_t *mp; arp_stack_t *as = ARL_TO_ARPSTACK(arl); + arl_t *xmit_arl; arlphy_t *ap; if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) return (EINVAL); - if ((ap = arl->arl_phy) == NULL) + if (proto_addr == NULL || proto_addr_len == 0 || + (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) return (EINVAL); if (flags & ACE_F_MYADDR) flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; + /* + * Latch a transmit arl for this ace. + */ + if (arl->arl_flags & ARL_F_IPMP) { + ASSERT(proto == IP_ARP_PROTO_TYPE); + xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len, + sender_addr); + } else { + xmit_arl = arl; + } + + if (xmit_arl == NULL || xmit_arl->arl_phy == NULL) + return (EINVAL); + + ap = xmit_arl->arl_phy; + if (!hw_addr && hw_addr_len == 0) { if (flags == ACE_F_PERMANENT) { /* Not publish */ /* 224.0.0.0 to zero length address */ @@ -398,9 +514,6 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, flags |= ACE_F_RESOLVED; } - if (proto_addr == NULL || proto_addr_len == 0 || - (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) - return (EINVAL); /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */ if (hw_addr_len != 0 && hw_addr == NULL) return (EINVAL); @@ -432,6 +545,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, ace->ace_proto = proto; ace->ace_mp = mp; ace->ace_arl = arl; + ace->ace_xmit_arl = xmit_arl; dst = (uchar_t *)&ace[1]; @@ -510,12 +624,73 @@ ar_ce_delete(ace_t *ace) static void ar_ce_delete_per_arl(ace_t *ace, void *arl) { - if (ace->ace_arl == arl) { + if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) { ace->ace_flags &= ~ACE_F_PERMANENT; ar_ce_delete(ace); } } +/* + * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes + * `ace' if it was using `arl_arg' as its output interface. 
+ */ +static void +ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg) +{ + arl_t *arl = arl_arg; + + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + + if (ace->ace_arl == arl) { + ASSERT(ace->ace_xmit_arl == arl); + /* + * This ACE is tied to the arl leaving the group (e.g., an + * ACE_F_PERMANENT for a test address) and is not used by the + * group, so we can leave it be. + */ + return; + } + + if (ace->ace_xmit_arl != arl) + return; + + ASSERT(ace->ace_arl == arl->arl_ipmp_arl); + + /* + * IP should've already sent us messages asking us to move any + * ACE_F_MYADDR entries to another arl, but there are two exceptions: + * + * 1. The group was misconfigured with interfaces that have duplicate + * hardware addresses, but in.mpathd was unable to offline those + * duplicate interfaces. + * + * 2. The messages from IP were lost or never created (e.g. due to + * memory pressure). + * + * We handle the first case by just quietly deleting the ACE. Since + * the second case cannot be distinguished from a more serious bug in + * the IPMP framework, we ASSERT() that this can't happen on DEBUG + * systems, but quietly delete the ACE on production systems (the + * deleted ACE will render the IP address unreachable). + */ + if (ace->ace_flags & ACE_F_MYADDR) { + arlphy_t *ap = arl->arl_phy; + uint_t hw_addrlen = ap->ap_hw_addrlen; + + ASSERT(hw_addrlen == ace->ace_hw_addr_length && + bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0); + } + + /* + * NOTE: it's possible this arl got selected as the ace_xmit_arl when + * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for + * an IPMP IP interface. But it's still OK for us to delete such an + * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it + * and we'll pick another arl then. + */ + ar_ce_delete(ace); +} + /* Cache entry hash routine, based on protocol and protocol address. */ static ace_t ** ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr, @@ -559,7 +734,8 @@ ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, return (NULL); ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); for (; ace; ace = ace->ace_next) { - if (ace->ace_arl == arl && + if ((ace->ace_arl == arl || + ace->ace_arl == arl->arl_ipmp_arl) && ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; @@ -632,13 +808,6 @@ ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, /* * Look for a permanent entry for proto_addr across all interfaces. - * This is used for sending ARP requests out. Requests may come from - * IP on le0 with the source address of le1 and we need to send out - * the request on le1 so that ARP does not think that somebody else - * is using its PERMANENT address. If le0 and le1 are sitting on - * the same wire, the same IP -> ethernet mapping might exist on - * both the interfaces. But we should look for the permanent - * mapping to avoid arp interpreting it as a duplicate. 
*/ static ace_t * ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, @@ -653,8 +822,8 @@ ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, if (ace->ace_proto_addr_length == proto_addr_length && ace->ace_proto == proto) { int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; + uchar_t *ace_addr = ace->ace_proto_addr; + uchar_t *mask = ace->ace_proto_mask; /* * Note that the ace_proto_mask is applied to the @@ -703,12 +872,8 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) * 1. Resolution of unresolved entries and update of resolved entries. * 2. Detection of nodes with our own IP address (duplicates). * - * This is complicated by ill groups. We don't currently have knowledge of ill - * groups, so we can't distinguish between a packet that comes in on one of the - * arls that's part of the group versus one that's on an unrelated arl. Thus, - * we take a conservative approach. If the arls match, then we update resolved - * and unresolved entries alike. If they don't match, then we update only - * unresolved entries. + * If the resolving ARL is in the same group as a matching ACE's ARL, then + * update the ACE. Otherwise, make no updates. * * For all entries, we first check to see if this is a duplicate (probable * loopback) message. If so, then just ignore it. @@ -741,7 +906,7 @@ ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) static int ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, - uint32_t hlen, const uchar_t *src_paddr, uint32_t plen) + uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp) { ace_t *ace; ace_t *ace_next; @@ -778,31 +943,35 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, if (i1 >= 0) continue; + *ace_arlp = ace->ace_arl; + /* - * If both IP addr and hardware address match what we already - * have, then this is a broadcast packet emitted by one of our - * interfaces, reflected by the switch and received on another - * interface. We return AR_LOOPBACK. + * If the IP address is ours, and the hardware address matches + * one of our own arls, then this is a broadcast packet + * emitted by one of our interfaces, reflected by the switch + * and received on another interface. We return AR_LOOPBACK. */ - if ((ace->ace_flags & ACE_F_MYADDR) && - hlen == ace->ace_hw_addr_length && - bcmp(ace->ace_hw_addr, src_haddr, - ace->ace_hw_addr_length) == 0) { - return (AR_LOOPBACK); + if (ace->ace_flags & ACE_F_MYADDR) { + arl_t *hw_arl = as->as_arl_head; + arlphy_t *ap; + + for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) { + ap = hw_arl->arl_phy; + if (ap != NULL && ap->ap_hw_addrlen == hlen && + bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0) + return (AR_LOOPBACK); + } } /* * If the entry is unverified, then we've just verified that * someone else already owns this address, because this is a * message with the same protocol address but different - * hardware address. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply any conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * hardware address. NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_FAILEDs if arl is in an IPMP group. 
*/ if ((ace->ace_flags & ACE_F_UNVERIFIED) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ar_ce_delete(ace); return (AR_FAILED); } @@ -814,30 +983,29 @@ ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, * that, if we're currently in initial announcement mode, we * switch back to the lazier defense mode. Knowing that * there's at least one duplicate out there, we ought not - * blindly announce. Conflicts received via an interface which - * doesn't own the conflict address are not actioned. Multiple - * interfaces on the same segment imply the conflict will also - * be seen via the correct interface, so we can ignore anything - * not matching the arl from the ace. + * blindly announce. NOTE: the ace_xmit_arl check ensures we + * don't send duplicate AR_BOGONs if arl is in an IPMP group. */ if ((ace->ace_flags & ACE_F_AUTHORITY) && - arl == ace->ace_arl) { + arl == ace->ace_xmit_arl) { ace->ace_xmit_count = 0; return (AR_BOGON); } /* - * Limit updating across other ills to unresolved - * entries only. We don't want to inadvertently update - * published entries. + * Only update this ACE if it's on the same network -- i.e., + * it's for our ARL or another ARL in the same IPMP group. */ - if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) { + if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) { if (ar_ce_resolve(ace, src_haddr, hlen)) retv = AR_CHANGED; else if (retv == AR_NOTFOUND) retv = AR_MERGED; } } + + if (retv == AR_NOTFOUND) + *ace_arlp = NULL; return (retv); } @@ -917,7 +1085,7 @@ static void ar_delete_notify(const ace_t *ace) { const arl_t *arl = ace->ace_arl; - const arlphy_t *ap = arl->arl_phy; + const arlphy_t *ap = ace->ace_xmit_arl->arl_phy; mblk_t *mp; size_t len; arh_t *arh; @@ -945,7 +1113,7 @@ ar_close(queue_t *q) { ar_t *ar = (ar_t *)q->q_ptr; char name[LIFNAMSIZ]; - arl_t *arl; + arl_t *arl, *xarl; arl_t **arlp; cred_t *cr; arc_t *arc; @@ -999,6 +1167,21 @@ ar_close(queue_t *q) while (arl->arl_state != ARL_S_DOWN) qwait(arl->arl_rq); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * Though rude, someone could force the IPMP arl + * closed without removing the underlying interfaces. + * In that case, force the ARLs out of the group. 
+ */ + xarl = as->as_arl_head; + for (; xarl != NULL; xarl = xarl->arl_next) { + if (xarl->arl_ipmp_arl != arl || xarl == arl) + continue; + ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl); + xarl->arl_ipmp_arl = NULL; + } + } + ar_ll_clear_defaults(arl); /* * If this is the control stream for an arl, delete anything @@ -1417,9 +1600,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) area_t *area; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; arl_t *arl; mblk_t *mp = mp_orig; @@ -1494,6 +1677,7 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) proto_mask, NULL, (uint32_t)0, + NULL, aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); if (err != 0) { DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, @@ -1502,7 +1686,13 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } if (aflags & ACE_F_PUBLISH) { - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap; + + ace = ar_ce_lookup(arl, area->area_proto, proto_addr, + proto_addr_len); + ASSERT(ace != NULL); + + ap = ace->ace_xmit_arl->arl_phy; if (hw_addr == NULL || hw_addr_len == 0) { hw_addr = ap->ap_hw_addr; @@ -1519,10 +1709,6 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) ap->ap_hw_addrlen = hw_addr_len; } - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, - proto_addr_len); - ASSERT(ace != NULL); - if (ace->ace_flags & ACE_F_FAST) { ace->ace_xmit_count = as->as_fastprobe_count; ace->ace_xmit_interval = as->as_fastprobe_delay; @@ -1555,9 +1741,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) */ DTRACE_PROBE2(eadd_probe, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, NULL, NULL, - proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, + hw_addr, NULL, NULL, proto_addr, NULL, as); ace->ace_xmit_count--; ace->ace_xmit_interval = (ace->ace_flags & ACE_F_FAST) ? @@ -1573,9 +1759,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) } else { DTRACE_PROBE2(eadd_announce, ace_t *, ace, area_t *, area); - ar_xmit(arl, ARP_REQUEST, area->area_proto, - proto_addr_len, hw_addr, proto_addr, - ap->ap_arp_addr, proto_addr, NULL, as); + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, + area->area_proto, proto_addr_len, hw_addr, + proto_addr, ap->ap_arp_addr, proto_addr, NULL, as); ace->ace_last_bcast = ddi_get_lbolt(); /* @@ -1583,9 +1769,9 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) * entry; we believe we're the authority for this * entry. In that case, and if we're not just doing * one-off defense of the address, we send more than - * one copy, so that if this is an IPMP failover, we'll - * still have a good chance of updating everyone even - * when there's a packet loss or two. + * one copy, so we'll still have a good chance of + * updating everyone even when there's a packet loss + * or two. */ if ((aflags & ACE_F_AUTHORITY) && !(aflags & ACE_F_DEFEND) && @@ -1667,7 +1853,6 @@ static int ar_entry_query(queue_t *q, mblk_t *mp_orig) { ace_t *ace; - ace_t *src_ace = NULL; areq_t *areq; arl_t *arl; int err; @@ -1782,20 +1967,12 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = ENXIO; goto err_ret; } - if (arl->arl_phy == NULL) { - /* Can't get help if we don't know how. */ - DTRACE_PROBE2(query_no_phy, ace_t *, ace, - areq_t *, areq); - mpp[0] = NULL; - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); } else { /* No ace yet. Make one now. (This is the common case.) 
*/ - if (areq->areq_xmit_count == 0 || arl->arl_phy == NULL) { - DTRACE_PROBE2(query_phy, arl_t *, arl, areq_t *, areq); + if (areq->areq_xmit_count == 0) { + DTRACE_PROBE2(query_template, arl_t *, arl, + areq_t *, areq); mp->b_prev = NULL; err = ENXIO; goto err_ret; @@ -1814,9 +1991,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) err = EINVAL; goto err_ret; } - err = ar_ce_create(arl, areq->areq_proto, NULL, 0, + err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0, proto_addr, proto_addr_len, NULL, - NULL, (uint32_t)0, + NULL, (uint32_t)0, sender_addr, areq->areq_flags); if (err != 0) { DTRACE_PROBE3(query_create_failed, arl_t *, arl, @@ -1835,49 +2012,13 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) goto err_ret; } ace->ace_query_mp = mp; - /* - * We don't have group information here. But if the sender - * address belongs to a different arl, we might as well - * search the other arl for a resolved ACE. If we find one, - * we resolve it rather than sending out a ARP request. - */ - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(query_source_missing, arl_t *, arl, - areq_t *, areq, ace_t *, ace); - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - /* - * ar_query_reply has already freed the mp. - * Return EINPROGRESS, so that caller won't attempt - * to free the 'mp' again. - */ - return (EINPROGRESS); - } - if (src_ace->ace_arl != ace->ace_arl) { - ace_t *dst_ace; - - /* - * Check for a resolved entry in the src_ace->ace_arl. - */ - dst_ace = ar_ce_lookup_entry(src_ace->ace_arl, - areq->areq_proto, proto_addr, proto_addr_len); - - if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) { - DTRACE_PROBE3(query_other_arl, arl_t *, arl, - areq_t *, areq, ace_t *, dst_ace); - (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr, - dst_ace->ace_hw_addr_length); - return (EINPROGRESS); - } - } } - ms = ar_query_xmit(as, ace, src_ace); + ms = ar_query_xmit(as, ace); if (ms == 0) { /* Immediate reply requested. */ ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); } else { - mi_timer(arl->arl_wq, ace->ace_mp, ms); + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms); } return (EINPROGRESS); err_ret: @@ -2073,6 +2214,80 @@ done: } /* + * Given an arie_t `mp', find the arl_t's that it names and return them + * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE. + */ +static boolean_t +ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp) +{ + arie_t *arie = (arie_t *)mp->b_rptr; + + *arlp = ar_ll_lookup_from_mp(as, mp); + if (*arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp); + return (B_FALSE); + } + + arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; + *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname); + if (*ipmp_arlp == NULL) { + DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp); + return (B_FALSE); + } + + DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp); + return (B_TRUE); +} + +/* + * Bind an arl_t to an IPMP group arl_t. 
+ */ +static int +ar_ipmp_activate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (arl->arl_ipmp_arl != NULL) { + DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl); + return (EALREADY); + } + + DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl); + arl->arl_ipmp_arl = ipmp_arl; + return (0); +} + +/* + * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so + * that it is no longer part of the group. + */ +static int +ar_ipmp_deactivate(queue_t *q, mblk_t *mp) +{ + arl_t *arl, *ipmp_arl; + arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; + + if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) + return (EINVAL); + + if (ipmp_arl != arl->arl_ipmp_arl) { + DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *, + ipmp_arl); + return (EINVAL); + } + + DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *, + arl->arl_ipmp_arl); + ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); + arl->arl_ipmp_arl = NULL; + return (0); +} + +/* * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages. */ /* ARGSUSED */ @@ -2199,6 +2414,11 @@ ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp) if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL) return; + if (dlia->dl_mac_type == SUNW_DL_IPMP) { + arl->arl_flags |= ARL_F_IPMP; + arl->arl_ipmp_arl = arl; + } + arl->arl_provider_style = dlia->dl_provider_style; arl->arl_rq = ar->ar_rq; arl->arl_wq = ar->ar_wq; @@ -2261,7 +2481,7 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; dl_unitdata_req_t *dlur; uchar_t *up; - arlphy_t *ap; + arlphy_t *ap; ASSERT(arl != NULL); @@ -2270,6 +2490,14 @@ ar_ll_set_defaults(arl_t *arl, mblk_t *mp) */ ar_ll_clear_defaults(arl); + if (arl->arl_flags & ARL_F_IPMP) { + /* + * If this is an IPMP arl_t, we have nothing to do, + * since we will never transmit or receive. + */ + return; + } + ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP); if (ap == NULL) goto bad; @@ -2470,12 +2698,12 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) mblk_t *mp = mp_orig; ace_t *ace; uchar_t *hw_addr; - uint32_t hw_addr_len; + uint32_t hw_addr_len; uchar_t *proto_addr; - uint32_t proto_addr_len; + uint32_t proto_addr_len; uchar_t *proto_mask; uchar_t *proto_extract_mask; - uint32_t hw_extract_start; + uint32_t hw_extract_start; arl_t *arl; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -2524,6 +2752,7 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) proto_mask, proto_extract_mask, hw_extract_start, + NULL, arma->arma_flags | ACE_F_MAPPING)); } @@ -2857,12 +3086,12 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len) { mblk_t *areq_mp; - arl_t *arl = ace->ace_arl; mblk_t *mp; mblk_t *xmit_mp; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); + queue_t *arl_wq = ace->ace_arl->arl_wq; + arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl); ip_stack_t *ipst = as->as_netstack->netstack_ip; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; /* * On error or completion for a query, we need to shut down the timer. @@ -2870,7 +3099,8 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, * Duplicate Address Detection, or it will never finish that phase. */ if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY))) - mi_timer(arl->arl_wq, ace->ace_mp, -1L); + mi_timer(arl_wq, ace->ace_mp, -1L); + /* Establish the return value appropriate. 
*/ if (ret_val == 0) { if (!ACE_RESOLVED(ace) || ap == NULL) @@ -2973,25 +3203,24 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, */ ar_ce_delete(ace); } else { - mi_timer(arl->arl_wq, ace->ace_mp, - as->as_cleanup_interval); + mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval); } } } /* * Returns number of milliseconds after which we should either rexmit or abort. - * Return of zero means we should abort. src_ace is the ace corresponding - * to the source address in the areq sent by IP. + * Return of zero means we should abort. */ static clock_t -ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) +ar_query_xmit(arp_stack_t *as, ace_t *ace) { areq_t *areq; mblk_t *mp; uchar_t *proto_addr; uchar_t *sender_addr; - arl_t *src_arl; + ace_t *src_ace; + arl_t *xmit_arl = ace->ace_xmit_arl; mp = ace->ace_query_mp; /* @@ -3016,18 +3245,15 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) areq->areq_sender_addr_length); /* - * Get the source h/w address for the sender addr. With interface - * groups, IP sends us source address belonging to a different - * interface. + * Get the ace for the sender address, so that we can verify that + * we have one and that DAD has completed. */ + src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr, + areq->areq_sender_addr_length); if (src_ace == NULL) { - src_ace = ar_ce_lookup_permanent(as, areq->areq_proto, - sender_addr, areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(xmit_no_source, ace_t *, ace, - areq_t *, areq, uchar_t *, sender_addr); - return (0); - } + DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq, + uchar_t *, sender_addr); + return (0); } /* @@ -3044,18 +3270,12 @@ ar_query_xmit(arp_stack_t *as, ace_t *ace, ace_t *src_ace) return (areq->areq_xmit_interval); } - /* - * Transmit on src_arl. We should transmit on src_arl. Otherwise - * the switch will send back a copy on other interfaces of the - * same group and as we could be using somebody else's source - * address + hardware address, ARP will treat this as a bogon. - */ - src_arl = src_ace->ace_arl; DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, areq_t *, areq); - ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto, - areq->areq_sender_addr_length, src_arl->arl_phy->ap_hw_addr, - sender_addr, src_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); + + ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto, + areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr, + sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); src_ace->ace_last_bcast = ddi_get_lbolt(); return (areq->areq_xmit_interval); } @@ -3066,6 +3286,7 @@ ar_rput(queue_t *q, mblk_t *mp) { arh_t *arh; arl_t *arl; + arl_t *client_arl; ace_t *dst_ace; uchar_t *dst_paddr; int err; @@ -3079,6 +3300,8 @@ ar_rput(queue_t *q, mblk_t *mp) uchar_t *src_paddr; uchar_t *dst_haddr; boolean_t is_probe; + boolean_t is_unicast = B_FALSE; + dl_unitdata_ind_t *dlindp; int i; arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; @@ -3135,9 +3358,10 @@ ar_rput(queue_t *q, mblk_t *mp) return; case M_PCPROTO: case M_PROTO: + dlindp = (dl_unitdata_ind_t *)mp->b_rptr; if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && - ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive == - DL_UNITDATA_IND) { + dlindp->dl_primitive == DL_UNITDATA_IND) { + is_unicast = (dlindp->dl_group_address == 0); arl = ((ar_t *)q->q_ptr)->ar_arl; if (arl != NULL && arl->arl_phy != NULL) { /* Real messages from the wire! 
*/ @@ -3261,19 +3485,24 @@ ar_rput(queue_t *q, mblk_t *mp) * RFC 826: first check if the <protocol, sender protocol address> is * in the cache, if there is a sender protocol address. Note that this * step also handles resolutions based on source. + * + * Note that IP expects that each notification it receives will be + * tied to the ill it received it on. Thus, we must talk to it over + * the arl tied to the resolved IP address (if any), hence client_arl. */ if (is_probe) err = AR_NOTFOUND; else err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, - plen); + plen, &client_arl); + switch (err) { case AR_BOGON: - ar_client_notify(arl, mp1, AR_CN_BOGON); + ar_client_notify(client_arl, mp1, AR_CN_BOGON); mp1 = NULL; break; case AR_FAILED: - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); mp1 = NULL; break; case AR_LOOPBACK: @@ -3293,7 +3522,9 @@ ar_rput(queue_t *q, mblk_t *mp) * Now look up the destination address. By RFC 826, we ignore the * packet at this step if the target isn't one of our addresses. This * is true even if the target is something we're trying to resolve and - * the packet is a response. + * the packet is a response. To avoid duplicate responses, we also + * ignore the packet if it was multicast/broadcast to an arl that's in + * an IPMP group but was not the designated xmit_arl for the ACE. * * Note that in order to do this correctly, we need to know when to * notify IP of a change implied by the source address of the ARP @@ -3304,6 +3535,7 @@ ar_rput(queue_t *q, mblk_t *mp) */ dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || + (dst_ace->ace_xmit_arl != arl && !is_unicast) || !(dst_ace->ace_flags & ACE_F_PUBLISH)) { /* * Let the client know if the source mapping has changed, even @@ -3311,7 +3543,7 @@ ar_rput(queue_t *q, mblk_t *mp) * client. */ if (err == AR_CHANGED) - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); else freemsg(mp1); freeb(mp); @@ -3341,6 +3573,7 @@ ar_rput(queue_t *q, mblk_t *mp) "arp_rput_end: q %p (%S)", q, "reflection"); return; } + /* * Conflicts seen via the wrong interface may be bogus. * Multiple interfaces on the same segment imply any conflict @@ -3378,12 +3611,21 @@ ar_rput(queue_t *q, mblk_t *mp) * the src_paddr field before sending it to IP. The same is * required for probes, where src_paddr will be INADDR_ANY. */ - if (is_probe || op == ARP_RESPONSE) { + if (is_probe) { + /* + * In this case, client_arl will be invalid (e.g., + * since probes don't have a valid sender address). + * But dst_ace has the appropriate arl. 
+ */ bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED); + ar_ce_delete(dst_ace); + } else if (op == ARP_RESPONSE) { + bcopy(dst_paddr, src_paddr, plen); + ar_client_notify(client_arl, mp1, AR_CN_FAILED); ar_ce_delete(dst_ace); } else if (err == AR_CHANGED) { - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); } else { DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, arh_t *, arh, ace_t *, dst_ace); @@ -3431,19 +3673,19 @@ ar_rput(queue_t *q, mblk_t *mp) dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, src_haddr, src_paddr, dstaddr, as); if (!is_probe && err == AR_NOTFOUND && - ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen, - NULL, NULL, 0, 0) == 0) { + ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen, + src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) { ace_t *ace; ace = ar_ce_lookup(arl, proto, src_paddr, plen); ASSERT(ace != NULL); - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, as->as_cleanup_interval); } } if (err == AR_CHANGED) { freeb(mp); - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "reqchange"); } else { @@ -3459,7 +3701,7 @@ ar_ce_restart_dad(ace_t *ace, void *arl_arg) arl_t *arl = arl_arg; arp_stack_t *as = ARL_TO_ARPSTACK(arl); - if ((ace->ace_arl == arl) && + if ((ace->ace_xmit_arl == arl) && (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { /* @@ -4060,9 +4302,9 @@ ar_wput(queue_t *q, mblk_t *mp) static boolean_t arp_say_ready(ace_t *ace) { - mblk_t *mp; + mblk_t *mp; arl_t *arl = ace->ace_arl; - arlphy_t *ap = arl->arl_phy; + arlphy_t *ap = ace->ace_xmit_arl->arl_phy; arh_t *arh; uchar_t *cp; @@ -4107,7 +4349,7 @@ ace_reschedule(ace_t *ace, void *arg) ace_t **acemax; ace_t *atemp; - if (ace->ace_arl != art->art_arl) + if (ace->ace_xmit_arl != art->art_arl) return; /* * Only published entries that are ready for announcement are eligible. 
@@ -4179,7 +4421,6 @@ static void ar_wsrv(queue_t *q) { ace_t *ace; - arl_t *arl; arlphy_t *ap; mblk_t *mp; clock_t ms; @@ -4196,8 +4437,7 @@ ar_wsrv(queue_t *q) ace = (ace_t *)mp->b_rptr; if (ace->ace_flags & ACE_F_DYING) continue; - arl = ace->ace_arl; - ap = arl->arl_phy; + ap = ace->ace_xmit_arl->arl_phy; if (ace->ace_flags & ACE_F_UNVERIFIED) { ASSERT(ace->ace_flags & ACE_F_PUBLISH); ASSERT(ace->ace_query_mp == NULL); @@ -4216,7 +4456,7 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_probe, ace_t *, ace); ace->ace_xmit_count--; - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, NULL, NULL, @@ -4247,7 +4487,7 @@ ar_wsrv(queue_t *q) now - ap->ap_defend_start > SEC_TO_TICK(as->as_defend_period)) { ap->ap_defend_start = now; - arl_reschedule(arl); + arl_reschedule(ace->ace_xmit_arl); } /* * Finish the job that we started in @@ -4288,12 +4528,12 @@ ar_wsrv(queue_t *q) DTRACE_PROBE1(timer_defend, ace_t *, ace); } - ar_xmit(arl, ARP_REQUEST, + ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, ace->ace_proto_addr, - ap->ap_arp_addr, + ace->ace_xmit_arl->arl_phy->ap_arp_addr, ace->ace_proto_addr, NULL, as); ace->ace_last_bcast = now; if (ace->ace_xmit_count == 0) @@ -4316,7 +4556,8 @@ ar_wsrv(queue_t *q) ndp_lookup_ipaddr(*(ipaddr_t *) ace->ace_proto_addr, as->as_netstack)) { ace->ace_flags |= ACE_F_OLD; - mi_timer(arl->arl_wq, ace->ace_mp, + mi_timer(ace->ace_arl->arl_wq, + ace->ace_mp, as->as_cleanup_interval); } else { ar_delete_notify(ace); @@ -4333,7 +4574,7 @@ ar_wsrv(queue_t *q) * we complete the operation with a failure indication. * Otherwise, we restart the timer. */ - ms = ar_query_xmit(as, ace, NULL); + ms = ar_query_xmit(as, ace); if (ms == 0) ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); else @@ -4360,6 +4601,8 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, mblk_t *mp; arlphy_t *ap = arl->arl_phy; + ASSERT(!(arl->arl_flags & ARL_F_IPMP)); + if (ap == NULL) { DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl); return; diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h index a2564d5602..f16fdc97a0 100644 --- a/usr/src/uts/common/inet/arp_impl.h +++ b/usr/src/uts/common/inet/arp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +67,7 @@ typedef struct arl_s { uint_t arl_closing : 1; /* stream is closing */ uint32_t arl_index; /* instance number */ struct arlphy_s *arl_phy; /* physical info, if any */ + struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */ } arl_t; /* @@ -75,7 +76,7 @@ typedef struct arl_s { */ #define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as) -/* ARL physical info structure for a link level device */ +/* ARL physical info structure, one per physical link level device */ typedef struct arlphy_s { uint32_t ap_arp_hw_type; /* hardware type */ uchar_t *ap_arp_addr; /* multicast address to use */ @@ -110,6 +111,7 @@ typedef struct ace_s { clock_t ace_last_bcast; /* last broadcast Response */ clock_t ace_xmit_interval; int ace_xmit_count; + arl_t *ace_xmit_arl; /* xmit on this arl */ } ace_t; #define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \ @@ -216,6 +218,7 @@ struct arp_stack { typedef struct arp_stack arp_stack_t; #define ARL_F_NOARP 0x01 +#define ARL_F_IPMP 0x02 #define ARL_S_DOWN 0x00 #define ARL_S_PENDING 0x01 diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub.c b/usr/src/uts/common/inet/dlpistub/dlpistub.c new file mode 100644 index 0000000000..961876ac47 --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.c @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DLPI stub driver; currently supports VNI and IPMP stub devices. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/stat.h> +#include <sys/strsun.h> +#include <sys/stropts.h> +#include <sys/types.h> +#include <sys/id_space.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/mkdev.h> +#include <sys/sdt.h> + +#include "dlpistub_impl.h" + +static id_space_t *ds_minors; +static dev_info_t *ds_dip; + +/* + * DL_INFO_ACK template. 
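Taken together, the new arl_ipmp_arl and ace_xmit_arl fields separate "which interface owns this cache entry" from "which interface transmits for it". The check below is only a hedged reading of the invariants these hunks appear to establish (it is not code from the change): a group-owned ACE hangs off the IPMP arl_t and transmits on an active underlying arl_t, while every other ACE transmits on its own arl_t.

/*
 * Illustrative consistency check; a reading of the hunks above,
 * not part of the change.
 */
static void
ace_check_ipmp_linkage(const ace_t *ace)
{
	const arl_t *arl = ace->ace_arl;
	const arl_t *xmit_arl = ace->ace_xmit_arl;

	if (arl->arl_flags & ARL_F_IPMP) {
		/*
		 * Shared by the group; transmitted on a member chosen
		 * by ar_ipmp_lookup_xmit_arl().
		 */
		ASSERT(xmit_arl != arl);
		ASSERT(xmit_arl->arl_ipmp_arl == arl);
	} else {
		/*
		 * Ordinary ACE, including test-address ACEs created
		 * directly on an underlying interface.
		 */
		ASSERT(xmit_arl == arl);
	}
}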
+ */ +static dl_info_ack_t ds_infoack = { + DL_INFO_ACK, /* dl_primitive */ + 0, /* dl_max_sdu */ + 0, /* dl_min_sdu */ + 0, /* dl_addr_length */ + 0, /* dl_mac_type */ + 0, /* dl_reserved */ + 0, /* dl_current_state */ + 0, /* dl_sap_length */ + DL_CLDLS, /* dl_service_mode */ + 0, /* dl_qos_length */ + 0, /* dl_qos_offset */ + 0, /* dl_qos_range_length */ + 0, /* dl_qos_range_offset */ + DL_STYLE2, /* dl_provider_style */ + 0, /* dl_addr_offset */ + DL_VERSION_2, /* dl_version */ + 0, /* dl_brdcst_addr_length */ + 0, /* dl_brdcst_addr_offset */ + 0 /* dl_growth */ +}; + +static int +ds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "vni", S_IFCHR, DS_MINOR_VNI, + DDI_PSEUDO, 0) == DDI_FAILURE || + ddi_create_minor_node(dip, "ipmpstub", S_IFCHR, DS_MINOR_IPMP, + DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + cmn_err(CE_NOTE, "ds_attach: cannot create minor nodes"); + return (DDI_FAILURE); + } + + ds_dip = dip; + ds_minors = id_space_create("ds_minors", DS_MINOR_START, MAXMIN32); + return (DDI_SUCCESS); +} + +static int +ds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + id_space_destroy(ds_minors); + ds_minors = NULL; + ASSERT(dip == ds_dip); + ddi_remove_minor_node(dip, NULL); + ds_dip = NULL; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +ds_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + + switch (infocmd) { + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2DEVINFO: + if (ds_dip != NULL) { + *result = ds_dip; + error = DDI_SUCCESS; + } + break; + } + return (error); +} + +/* ARGSUSED */ +static int +ds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int type; + dlpistub_t *dsp; + + if (sflag == CLONEOPEN || sflag == MODOPEN) + return (EINVAL); + + if (q->q_ptr != NULL) + return (0); + + switch (getminor(*devp)) { + case DS_MINOR_VNI: + type = SUNW_DL_VNI; + break; + case DS_MINOR_IPMP: + type = SUNW_DL_IPMP; + break; + default: + return (ENXIO); + } + + dsp = kmem_zalloc(sizeof (dlpistub_t), KM_SLEEP); + dsp->ds_type = type; + dsp->ds_minor = (minor_t)id_alloc(ds_minors); + dsp->ds_state = DL_UNATTACHED; + *devp = makedevice(getmajor(*devp), dsp->ds_minor); + q->q_ptr = WR(q)->q_ptr = dsp; + qprocson(q); + + return (0); +} + +/* ARGSUSED */ +static int +ds_close(queue_t *q, int flag, cred_t *credp) +{ + dlpistub_t *dsp = q->q_ptr; + + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + + id_free(ds_minors, dsp->ds_minor); + kmem_free(dsp, sizeof (dlpistub_t)); + + return (0); +} + +static int +ds_badprim(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_BADPRIM, 0); + return (0); +} + +static int +ds_outstate(queue_t *q, mblk_t *mp, t_scalar_t prim) +{ + dlerrorack(q, mp, prim, DL_OUTSTATE, 0); + return (0); +} + +static int +ds_wput(queue_t *q, mblk_t *mp) +{ + union DL_primitives *dlp; + dl_info_ack_t *dlip; + dlpistub_t *dsp = q->q_ptr; + t_scalar_t prim; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_scalar_t)) { + dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); + return (0); + } + + dlp = (void *)mp->b_rptr; + prim = dlp->dl_primitive; + switch (prim) { + case DL_ATTACH_REQ: + if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNATTACHED) + return 
(ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_ATTACH_REQ); + break; + + case DL_BIND_REQ: + if (MBLKL(mp) < DL_BIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_IDLE; + dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); + break; + + case DL_INFO_REQ: + if (MBLKL(mp) < DL_INFO_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + mp = mexchange(q, mp, sizeof (dl_info_ack_t), + M_PCPROTO, DL_INFO_ACK); + if (mp != NULL) { + dlip = (void *)mp->b_rptr; + *dlip = ds_infoack; + dlip->dl_mac_type = dsp->ds_type; + dlip->dl_current_state = dsp->ds_state; + qreply(q, mp); + } + break; + + case DL_PHYS_ADDR_REQ: + if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + dlphysaddrack(q, mp, NULL, 0); + break; + + case DL_UNBIND_REQ: + if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_IDLE) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNBOUND; + dlokack(q, mp, DL_UNBIND_REQ); + break; + + case DL_DETACH_REQ: + if (MBLKL(mp) < DL_DETACH_REQ_SIZE) + return (ds_badprim(q, mp, prim)); + + if (dsp->ds_state != DL_UNBOUND) + return (ds_outstate(q, mp, prim)); + + dsp->ds_state = DL_UNATTACHED; + dlokack(q, mp, DL_DETACH_REQ); + break; + + case DL_UNITDATA_REQ: + DTRACE_PROBE2(dlpistub__data, dlpistub_t *, dsp, + mblk_t *, mp); + freemsg(mp); + break; + + default: + dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); + } + break; + + case M_IOCTL: + miocnak(q, mp, 0, EINVAL); + break; + + case M_FLUSH: + *mp->b_rptr &= ~FLUSHW; + if (*mp->b_rptr & FLUSHR) + qreply(q, mp); + else + freemsg(mp); + break; + default: + freemsg(mp); + break; + } + + return (0); +} + +static struct module_info ds_minfo = { + DS_IDNUM, /* mi_idnum */ + "dlpistub", /* mi_idname */ + 0, /* mi_minpsz */ + INFPSZ, /* mi_maxpsz */ + 0, /* mi_hiwat */ + 0, /* mi_lowat */ +}; + +static struct qinit ds_rinit = { + NULL, /* qi_putp */ + NULL, /* qi_srvp */ + ds_open, /* qi_qopen */ + ds_close, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct qinit ds_winit = { + ds_wput, /* qi_putp */ + NULL, /* qi_srvp */ + NULL, /* qi_qopen */ + NULL, /* qi_qclose */ + NULL, /* qi_qadmin */ + &ds_minfo, /* qi_minfo */ +}; + +static struct streamtab ds_info = { + &ds_rinit, /* st_rdinit */ + &ds_winit /* st_wrinit */ +}; + +DDI_DEFINE_STREAM_OPS(ds_ops, nulldev, nulldev, ds_attach, ds_detach, + nodev, ds_devinfo, D_MP|D_MTPERMOD, &ds_info, ddi_quiesce_not_supported); + +static struct modldrv modldrv = { + &mod_driverops, + "DLPI stub driver", + &ds_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/vni/vni.conf b/usr/src/uts/common/inet/dlpistub/dlpistub.conf index d79915e01c..72264ca466 100644 --- a/usr/src/uts/common/inet/vni/vni.conf +++ b/usr/src/uts/common/inet/dlpistub/dlpistub.conf @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). 
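On the wire side there is nothing to dlpistub: it only has to behave as a correct connectionless style-2 DLPI provider so that IP and ARP can plumb the IPMP meta-interface over it. The userland sketch below exercises the attach/bind/info handshake the stub implements; it assumes the stub's "ipmpstub" minor node is reachable through libdlpi under the name "ipmpstub0" (how /dev is populated is outside these hunks).

/*
 * Sketch: open PPA 0 of the stub as a style-2 provider, bind an arbitrary
 * SAP (the stub acks any SAP), and confirm DL_INFO_ACK reports the IPMP
 * mac type.  The "ipmpstub0" name is an assumption about /dev layout.
 */
#include <stdio.h>
#include <libdlpi.h>
#include <sys/dlpi.h>

int
main(void)
{
	dlpi_handle_t	dh;
	dlpi_info_t	dlinfo;
	uint_t		sap;
	int		err;

	if ((err = dlpi_open("ipmpstub0", &dh, 0)) != DLPI_SUCCESS) {
		(void) fprintf(stderr, "open: %s\n", dlpi_strerror(err));
		return (1);
	}

	if ((err = dlpi_bind(dh, 0x0800, &sap)) != DLPI_SUCCESS ||
	    (err = dlpi_info(dh, &dlinfo, 0)) != DLPI_SUCCESS) {
		(void) fprintf(stderr, "bind/info: %s\n", dlpi_strerror(err));
		dlpi_close(dh);
		return (1);
	}

	(void) printf("mac type %u (SUNW_DL_IPMP = %u), phys addr len %u\n",
	    dlinfo.di_mactype, (uint_t)SUNW_DL_IPMP, dlinfo.di_physaddrlen);

	dlpi_close(dh);
	return (0);
}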
+# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # - -#ident "%Z%%M% %I% %E% SMI" -# -name="vni" parent="pseudo" instance=0; +name="dlpistub" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h new file mode 100644 index 0000000000..ece15320ee --- /dev/null +++ b/usr/src/uts/common/inet/dlpistub/dlpistub_impl.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_DLPISTUB_IMPL_H +#define _INET_DLPISTUB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +typedef struct dlpistub { + int ds_type; /* DLPI MAC type */ + t_uscalar_t ds_state; /* DLPI state */ + minor_t ds_minor; /* corresponding minor */ +} dlpistub_t; + +#define DS_IDNUM 0x2a84 + +enum { DS_MINOR_VNI = 1, DS_MINOR_IPMP, DS_MINOR_START }; + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_DLPISTUB_IMPL_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 323c8fd0de..41595280cb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -56,6 +56,7 @@ extern "C" { #include <net/route.h> #include <sys/systm.h> #include <sys/multidata.h> +#include <sys/list.h> #include <net/radix.h> #include <sys/modhash.h> @@ -565,15 +566,21 @@ typedef struct ipha_s { #define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */ #define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */ +struct ill_s; + +typedef boolean_t ip_v6intfid_func_t(struct ill_s *, in6_addr_t *); +typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + in6_addr_t *); +typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, + ipaddr_t *); + /* IP Mac info structure */ typedef struct ip_m_s { - t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ - int ip_m_type; /* From <net/if_types.h> */ - boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, ipaddr_t *); - boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *, - uint32_t *, in6_addr_t *); - boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *); + t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ + int ip_m_type; /* From <net/if_types.h> */ + ip_v4mapinfo_func_t *ip_m_v4mapinfo; + ip_v6mapinfo_func_t *ip_m_v6mapinfo; + ip_v6intfid_func_t *ip_m_v6intfid; } ip_m_t; /* @@ -583,18 +590,22 @@ typedef struct ip_m_s { * layer multicast address range. * b. map from IPv6 multicast address range (ff00::/8) to the link * layer multicast address range. - * c. derive the default IPv6 interface identifier from the link layer - * address. + * c. derive the default IPv6 interface identifier from the interface. + * d. derive the default IPv6 destination interface identifier from + * the interface (point-to-point only). */ #define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ (((ip_m)->ip_m_v4mapinfo != NULL) && \ (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) -#define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \ - (((ip_m)->ip_m_v6intfid != NULL) && \ - (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr)) #define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ (((ip_m)->ip_m_v6mapinfo != NULL) && \ (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) +#define MEDIA_V6INTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6intfid != NULL) && \ + (*(ip_m)->ip_m_v6intfid)(ill, v6ptr)) +#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \ + (((ip_m)->ip_m_v6destintfid != NULL) && \ + (*(ip_m)->ip_m_v6destintfid)(ill, v6ptr)) /* Router entry types */ #define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */ @@ -621,18 +632,12 @@ typedef struct ip_m_s { * the bucket should delete this IRE from this bucket. */ #define IRE_MARK_CONDEMNED 0x0001 + /* - * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the - * broadcast packets received on that interface. This is marked only - * on broadcast ires. Employed by IPMP, where we have multiple NICs on the - * same subnet receiving the same broadcast packet. - */ -#define IRE_MARK_NORECV 0x0002 -/* - * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need - * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP. + * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It + * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN. 
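With the ip_m_t callbacks above now taking the ill itself, each media type derives the IPv6 interface ID from whatever per-interface state it needs instead of a bare (length, address) pair. For Ethernet-style media the underlying computation is still the RFC 4291 modified EUI-64; the toy helper below shows just that math (it is not the ip_ether_v6intfid() code in ip_if.c, which works from the ill).

#include <sys/types.h>
#include <netinet/in.h>

/*
 * Toy illustration of the modified EUI-64 derivation an ip_m_v6intfid
 * callback performs for a 6-byte MAC address: fill the low 8 bytes
 * (the interface ID) of `v6addr'.
 */
static void
mac_to_eui64(const uint8_t *mac, in6_addr_t *v6addr)
{
	uint8_t *ifid = &v6addr->s6_addr[8];

	ifid[0] = mac[0] ^ 0x02;	/* flip the universal/local bit */
	ifid[1] = mac[1];
	ifid[2] = mac[2];
	ifid[3] = 0xff;			/* insert 0xfffe */
	ifid[4] = 0xfe;
	ifid[5] = mac[3];
	ifid[6] = mac[4];
	ifid[7] = mac[5];
}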
*/ -#define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */ +#define IRE_MARK_TESTHIDDEN 0x0004 /* * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing @@ -788,45 +793,18 @@ typedef struct mrec_s { * ilm records the state of multicast memberships with the driver and is * maintained per interface. * - * Notes : - * - * 1) There is no direct link between a given ilg and ilm. If the - * application has joined a group G with ifindex I, we will have - * an ilg with ilg_v6group and ilg_ill. There will be a corresponding - * ilm with ilm_ill/ilm_v6addr recording the multicast membership. - * To delete the membership, - * - * a) Search for ilg matching on G and I with ilg_v6group - * and ilg_ill. Delete ilg_ill. - * b) Search the corresponding ilm matching on G and I with - * ilm_v6addr and ilm_ill. Delete ilm. - * - * In IPv4, the only difference is, we look using ipifs instead of - * ills. - * - * 2) With IP multipathing, we want to keep receiving even after the - * interface has failed. We do this by moving multicast memberships - * to a new_ill within the group. This is achieved by sending - * DL_DISABMULTI_REQS on ilg_ill/ilm_ill and sending DL_ENABMULTIREQS - * on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we - * need to be able to delete memberships which will still come down - * with the ifindex of the old ill which is what the application - * knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track - * of where we joined initially so that we can lookup even after we - * moved the membership. It is also used for moving back the membership - * when the old ill has been repaired. This is done by looking up for - * ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only - * ilms actually move from old ill to new ill. ilgs don't move (just - * the ilg_ill is changed when it moves) as it just records the state - * of the application that has joined a group G where as ilm records - * the state joined with the driver. Thus when we send DL_XXXMULTI_REQs - * we also need to keep the ilm in the right ill. - * - * In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move - * implicitly as we use only ipifs in IPv4. Thus, one can always lookup - * a given ilm/ilg even after it fails without the support of - * orig_ifindex. We move ilms still to record the driver state as - * mentioned above. + * There is no direct link between a given ilg and ilm. If the + * application has joined a group G with ifindex I, we will have + * an ilg with ilg_v6group and ilg_ill. There will be a corresponding + * ilm with ilm_ill/ilm_v6addr recording the multicast membership. + * To delete the membership: + * + * a) Search for ilg matching on G and I with ilg_v6group + * and ilg_ill. Delete ilg_ill. + * b) Search the corresponding ilm matching on G and I with + * ilm_v6addr and ilm_ill. Delete ilm. + * + * For IPv4 the only difference is that we look using ipifs, not ills. 
*/ /* @@ -839,7 +817,6 @@ typedef struct ilg_s { in6_addr_t ilg_v6group; struct ipif_s *ilg_ipif; /* Logical interface we are member on */ struct ill_s *ilg_ill; /* Used by IPv6 */ - int ilg_orig_ifindex; /* Interface originally joined on */ uint_t ilg_flags; mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilg_filter; @@ -866,9 +843,7 @@ typedef struct ilm_s { struct ilm_s *ilm_next; /* Linked list for each ill */ uint_t ilm_state; /* state of the membership */ struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ - int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */ uint_t ilm_flags; - boolean_t ilm_is_new; /* new ilm */ boolean_t ilm_notify_driver; /* Need to notify the driver */ zoneid_t ilm_zoneid; int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ @@ -881,28 +856,11 @@ typedef struct ilm_s { #define ilm_addr V4_PART_OF_V6(ilm_v6addr) -/* - * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to - * zero. In addition it needs to block new walkers while it is unlinking ilm's - * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice. - */ -#define ILM_WALKER_HOLD(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - ill->ill_ilm_walker_cnt++; \ - mutex_exit(&(ill)->ill_lock); \ -} - -/* - * ilm_walker_cleanup releases ill_lock - */ -#define ILM_WALKER_RELE(ill) { \ - mutex_enter(&(ill)->ill_lock); \ - (ill)->ill_ilm_walker_cnt--; \ - if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \ - ilm_walker_cleanup(ill); \ - else \ - mutex_exit(&(ill)->ill_lock); \ -} +typedef struct ilm_walker { + struct ill_s *ilw_ill; /* associated ill */ + struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */ + struct ill_s *ilw_walk_ill; /* current ill being walked */ +} ilm_walker_t; /* * Soft reference to an IPsec SA. @@ -1047,11 +1005,8 @@ typedef struct conn_s conn_t; * ipc_acking_unbind conn_acking_unbind * ipc_pad_to_bit_31 conn_pad_to_bit_31 * - * ipc_nofailover_ill conn_nofailover_ill - * * ipc_proto conn_proto * ipc_incoming_ill conn_incoming_ill - * ipc_outgoing_pill conn_outgoing_pill * ipc_pending_ill conn_pending_ill * ipc_unbind_mp conn_unbind_mp * ipc_ilg conn_ilg @@ -1061,8 +1016,6 @@ typedef struct conn_s conn_t; * ipc_refcv conn_refcv * ipc_multicast_ipif conn_multicast_ipif * ipc_multicast_ill conn_multicast_ill - * ipc_orig_bound_ifindex conn_orig_bound_ifindex - * ipc_orig_multicast_ifindex conn_orig_multicast_ifindex * ipc_drain_next conn_drain_next * ipc_drain_prev conn_drain_prev * ipc_idl conn_idl @@ -1263,7 +1216,6 @@ typedef struct th_hash_s { /* The following are ipif_state_flags */ #define IPIF_CONDEMNED 0x1 /* The ipif is being removed */ #define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */ -#define IPIF_MOVING 0x8 /* The ipif is being moved */ #define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */ #define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */ @@ -1273,7 +1225,6 @@ typedef struct ipif_s { struct ill_s *ipif_ill; /* Back pointer to our ill */ int ipif_id; /* Logical unit number */ uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */ - uint_t ipif_saved_mtu; /* Save of mtu during ipif_move() */ in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */ in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */ in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. 
*/ @@ -1306,17 +1257,15 @@ typedef struct ipif_s { uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */ /* Exclusive bit fields, protected by ipsq_t */ unsigned int - ipif_multicast_up : 1, /* We have joined the allhosts group */ - ipif_replace_zero : 1, /* Replacement for zero */ + ipif_multicast_up : 1, /* ipif_multicast_up() successful */ ipif_was_up : 1, /* ipif was up before */ ipif_addr_ready : 1, /* DAD is done */ - ipif_was_dup : 1, /* DAD had failed */ + + ipif_joined_allhosts : 1, /* allhosts joined */ ipif_pad_to_31 : 27; - int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */ uint_t ipif_seqid; /* unique index across all ills */ - uint_t ipif_orig_ipifid; /* ipif_id before SLIFFAILOVER */ uint_t ipif_state_flags; /* See IPIF_* flag defs above */ uint_t ipif_refcnt; /* active consistent reader cnt */ @@ -1328,6 +1277,16 @@ typedef struct ipif_s { zoneid_t ipif_zoneid; /* zone ID number */ timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ boolean_t ipif_trace_disable; /* True when alloc fails */ + /* + * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware + * information this ipif is associated with via ARP/NDP. We can use + * an ill pointer (rather than an index) because only ills that are + * part of a group will be pointed to, and an ill cannot disappear + * while it's in a group. + */ + struct ill_s *ipif_bound_ill; + struct ipif_s *ipif_bound_next; /* bound ipif chain */ + boolean_t ipif_bound; /* B_TRUE if we successfully bound */ } ipif_t; /* @@ -1405,8 +1364,6 @@ typedef struct ipif_s { * * bit fields ill_lock ill_lock * - * ipif_orig_ifindex ipsq None - * ipif_orig_ipifid ipsq None * ipif_seqid ipsq Write once * * ipif_state_flags ill_lock ill_lock @@ -1414,6 +1371,10 @@ typedef struct ipif_s { * ipif_ire_cnt ill_lock ill_lock * ipif_ilm_cnt ill_lock ill_lock * ipif_saved_ire_cnt + * + * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock + * ipif_bound_next ipsq ipsq + * ipif_bound ipsq ipsq */ #define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1)) @@ -1457,103 +1418,154 @@ typedef struct ipif_s { #define IPI2MODE(ipi) ((ipi)->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT) /* - * The IP-MT design revolves around the serialization object ipsq_t. - * It is associated with an IPMP group. If IPMP is not enabled, there is - * 1 ipsq_t per phyint. Eg. an ipsq_t would cover both hme0's IPv4 stream - * - * ipsq_lock protects - * ipsq_reentry_cnt, ipsq_writer, ipsq_xopq_mphead, ipsq_xopq_mptail, - * ipsq_mphead, ipsq_mptail, ipsq_split - * - * ipsq_pending_ipif, ipsq_current_ipif, ipsq_pending_mp, ipsq_flags, - * ipsq_waitfor - * - * The fields in the last line above below are set mostly by a writer thread - * But there is an exception in the last call to ipif_ill_refrele_tail which - * could also race with a conn close which could be cleaning up the - * fields. So we choose to protect using ipsq_lock instead of depending on - * the property of the writer. - * ill_g_lock protects - * ipsq_refs, ipsq_phyint_list - */ -typedef struct ipsq_s { - kmutex_t ipsq_lock; - int ipsq_reentry_cnt; - kthread_t *ipsq_writer; /* current owner (thread id) */ - int ipsq_flags; - mblk_t *ipsq_xopq_mphead; /* list of excl ops mostly ioctls */ - mblk_t *ipsq_xopq_mptail; - mblk_t *ipsq_mphead; /* msgs on ipsq linked thru b_next */ - mblk_t *ipsq_mptail; /* msgs on ipsq linked thru b_next */ - int ipsq_current_ioctl; /* current ioctl, or 0 if no ioctl */ - boolean_t ipsq_current_done; /* is the current op done? 
*/ - ipif_t *ipsq_current_ipif; /* ipif associated with current op */ - ipif_t *ipsq_pending_ipif; /* ipif associated w. ipsq_pending_mp */ - mblk_t *ipsq_pending_mp; /* current ioctl mp while waiting for */ - /* response from another module */ - struct ipsq_s *ipsq_next; /* list of all syncq's (ipsq_g_list) */ - uint_t ipsq_refs; /* Number of phyints on this ipsq */ - struct phyint *ipsq_phyint_list; /* List of phyints on this ipsq */ - boolean_t ipsq_split; /* ipsq may need to be split */ - int ipsq_waitfor; /* Values encoded below */ - char ipsq_name[LIFNAMSIZ+1]; /* same as phyint_groupname */ - ip_stack_t *ipsq_ipst; /* Does not have a netstack_hold */ - + * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ) + * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ + * ensures that no other threads can become "writer" on any IPSQs sharing that + * IPSQ's xop until the writer thread is done. + * + * Each phyint points to one IPSQ that remains fixed over the phyint's life. + * Each IPSQ points to one xop that can change over the IPSQ's life. If a + * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's + * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its + * IPSQ will refer to the "group" xop, which is shorthand for the xop of the + * IPSQ of the IPMP meta-interface's phyint. Thus, all phyints that are part + * of the same IPMP group will have their IPSQs point to the group xop, and + * thus becoming "writer" on any phyint in the group will prevent any other + * writer on any other phyint in the group. All IPSQs sharing the same xop + * are chained together through ipsq_next (in the degenerate common case, + * ipsq_next simply refers to itself). Note that the group xop is guaranteed + * to exist at least as long as there are members in the group, since the IPMP + * meta-interface can only be destroyed if the group is empty. + * + * Incoming exclusive operation requests are enqueued on the IPSQ they arrived + * on rather than the xop. This makes switching xops (as would happen when a + * phyint leaves an IPMP group) simple, because after the phyint leaves the + * group, any operations enqueued on its IPSQ can be safely processed with + * respect to its new xop, and any operations enqueued on the IPSQs of its + * former group can be processed with respect to their existing group xop. + * Even so, switching xops is a subtle dance; see ipsq_dq() for details. + * + * An IPSQ's "own" xop is embedded within the IPSQ itself since they have + * identical lifetimes, and because doing so simplifies pointer management. + * While each phyint and IPSQ point to each other, it is not possible to free + * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ + * when the phyint is being freed. Thus, ipsq_phyint is set to NULL when the + * phyint is freed, and the IPSQ free is later done in ipsq_exit().
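+ *
+ * As a rough sketch (argument lists elided; the actual entry points live in
+ * ip_if.c), an exclusive operation therefore has this shape: try to enter
+ * the IPSQ as a NEW_OP, bail out if the request was queued (it will be
+ * redispatched once the current writer exits), do the work as writer, and
+ * then exit:
+ *
+ *	if ((ipsq = ipsq_try_enter(..., NEW_OP, ...)) == NULL)
+ *		return;
+ *	ASSERT(IAM_WRITER_IPSQ(ipsq));
+ *	... modify ill/ipif/phyint state here ...
+ *	ipsq_exit(ipsq);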
+ * + * ipsq_t synchronization: read write + * + * ipsq_xopq_mphead ipx_lock ipx_lock + * ipsq_xopq_mptail ipx_lock ipx_lock + * ipsq_xop_switch_mp ipsq_lock ipsq_lock + * ipsq_phyint write once write once + * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock + * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq + * ipsq_swxop ipsq ipsq + * ipsq_ownxop see ipxop_t see ipxop_t + * ipsq_ipst write once write once + * + * ipxop_t synchronization: read write + * + * ipx_writer ipx_lock ipx_lock + * ipx_xop_queued ipx_lock ipx_lock + * ipx_mphead ipx_lock ipx_lock + * ipx_mptail ipx_lock ipx_lock + * ipx_ipsq write once write once + * ips_ipsq_queued ipx_lock ipx_lock + * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock + * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock + * ipx_current_done ipsq ipsq + * ipx_current_ioctl ipsq ipsq + * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock + * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock + * ipx_forced ipsq ipsq + * ipx_depth ipsq ipsq + * ipx_stack ipsq ipsq + */ +typedef struct ipxop_s { + kmutex_t ipx_lock; /* see above */ + kthread_t *ipx_writer; /* current owner */ + mblk_t *ipx_mphead; /* messages tied to this op */ + mblk_t *ipx_mptail; + struct ipsq_s *ipx_ipsq; /* associated ipsq */ + boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */ + int ipx_waitfor; /* waiting; values encoded below */ + int ipx_reentry_cnt; + boolean_t ipx_current_done; /* is the current operation done? */ + int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */ + ipif_t *ipx_current_ipif; /* ipif for current op */ + ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */ + mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */ + boolean_t ipx_forced; /* debugging aid */ #ifdef DEBUG - int ipsq_depth; /* debugging aid */ -#define IPSQ_STACK_DEPTH 15 - pc_t ipsq_stack[IPSQ_STACK_DEPTH]; /* debugging aid */ + int ipx_depth; /* debugging aid */ +#define IPX_STACK_DEPTH 15 + pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */ #endif -} ipsq_t; +} ipxop_t; -/* ipsq_flags */ -#define IPSQ_GROUP 0x1 /* This ipsq belongs to an IPMP group */ +typedef struct ipsq_s { + kmutex_t ipsq_lock; /* see above */ + mblk_t *ipsq_switch_mp; /* op to handle right after switch */ + mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */ + mblk_t *ipsq_xopq_mptail; + struct phyint *ipsq_phyint; /* associated phyint */ + struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */ + struct ipxop_s *ipsq_xop; /* current xop synchronization info */ + struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */ + struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */ + ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */ +} ipsq_t; /* - * ipsq_waitfor: - * - * IPIF_DOWN 1 ipif_down waiting for refcnts to drop - * ILL_DOWN 2 ill_down waiting for refcnts to drop - * IPIF_FREE 3 ipif_free waiting for refcnts to drop - * ILL_FREE 4 ill unplumb waiting for refcnts to drop - * ILL_MOVE_OK 5 failover waiting for refcnts to drop + * ipx_waitfor values: */ +enum { + IPIF_DOWN = 1, /* ipif_down() waiting for refcnts to drop */ + ILL_DOWN, /* ill_down() waiting for refcnts to drop */ + IPIF_FREE, /* ipif_free() waiting for refcnts to drop */ + ILL_FREE /* ill unplumb waiting for refcnts to drop */ +}; -enum { IPIF_DOWN = 1, ILL_DOWN, IPIF_FREE, ILL_FREE, ILL_MOVE_OK }; +/* Operation types for ipsq_try_enter() */ +#define CUR_OP 0 /* request writer within current operation */ +#define NEW_OP 1 /* request 
writer for a new operation */ +#define SWITCH_OP 2 /* request writer once IPSQ XOP switches */ -/* Flags passed to ipsq_try_enter */ -#define CUR_OP 0 /* Current ioctl continuing again */ -#define NEW_OP 1 /* New ioctl starting afresh */ +/* + * Kstats tracked on each IPMP meta-interface. Order here must match + * ipmp_kstats[] in ip/ipmp.c. + */ +enum { + IPMP_KSTAT_OBYTES, IPMP_KSTAT_OBYTES64, IPMP_KSTAT_RBYTES, + IPMP_KSTAT_RBYTES64, IPMP_KSTAT_OPACKETS, IPMP_KSTAT_OPACKETS64, + IPMP_KSTAT_OERRORS, IPMP_KSTAT_IPACKETS, IPMP_KSTAT_IPACKETS64, + IPMP_KSTAT_IERRORS, IPMP_KSTAT_MULTIRCV, IPMP_KSTAT_MULTIXMT, + IPMP_KSTAT_BRDCSTRCV, IPMP_KSTAT_BRDCSTXMT, IPMP_KSTAT_LINK_UP, + IPMP_KSTAT_MAX /* keep last */ +}; /* * phyint represents state that is common to both IPv4 and IPv6 interfaces. * There is a separate ill_t representing IPv4 and IPv6 which has a * backpointer to the phyint structure for accessing common state. - * - * NOTE : It just stores the group name as there is only one name for - * IPv4 and IPv6 i.e it is a underlying link property. Actually - * IPv4 and IPv6 ill are grouped together when their phyints have - * the same name. */ typedef struct phyint { struct ill_s *phyint_illv4; struct ill_s *phyint_illv6; - uint_t phyint_ifindex; /* SIOCLSLIFINDEX */ - char *phyint_groupname; /* SIOCSLIFGROUPNAME */ - uint_t phyint_groupname_len; + uint_t phyint_ifindex; /* SIOCSLIFINDEX */ uint64_t phyint_flags; avl_node_t phyint_avl_by_index; /* avl tree by index */ avl_node_t phyint_avl_by_name; /* avl tree by name */ kmutex_t phyint_lock; struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */ - struct phyint *phyint_ipsq_next; /* phyint list on this ipsq */ - /* Once Clearview IPMP is added the follow two fields can be removed */ - uint_t phyint_group_ifindex; /* index assigned to group */ - uint_t phyint_hook_ifindex; /* index used with neti/hook */ + struct ipmp_grp_s *phyint_grp; /* associated IPMP group */ + char phyint_name[LIFNAMSIZ]; /* physical interface name */ + uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */ } phyint_t; #define CACHE_ALIGN_SIZE 64 - #define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\ CACHE_ALIGN_SIZE) struct _phyint_list_s_ { @@ -1568,34 +1580,6 @@ typedef union phyint_list_u { #define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index #define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name -/* - * ILL groups. We group ills, - * - * - if the ills have the same group name. (New way) - * - * ill_group locking notes: - * - * illgrp_lock protects ill_grp_ill_schednext. - * - * ill_g_lock protects ill_grp_next, illgrp_ill, illgrp_ill_count. - * Holding ill_g_lock freezes the memberships of ills in IPMP groups. - * It also freezes the global list of ills and all ipifs in all ills. - * - * To remove an ipif from the linked list of ipifs of that ill ipif_free_tail - * holds both ill_g_lock, and ill_lock. Similarly to remove an ill from the - * global list of ills, ill_glist_delete() holds ill_g_lock as writer. - * This simplifies things for ipif_select_source, illgrp_scheduler etc. - * that need to walk the members of an illgrp. They just hold ill_g_lock - * as reader to do the walk. 
- * - */ -typedef struct ill_group { - kmutex_t illgrp_lock; - struct ill_group *illgrp_next; /* Next ill_group */ - struct ill_s *illgrp_ill_schednext; /* Next ill to be scheduled */ - struct ill_s *illgrp_ill; /* First ill in the group */ - int illgrp_ill_count; -} ill_group_t; /* * Fragmentation hash bucket @@ -1792,6 +1776,108 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; #define IS_LOOPBACK(ill) \ ((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK) +/* Is this an IPMP meta-interface ILL? */ +#define IS_IPMP(ill) \ + ((ill)->ill_phyint->phyint_flags & PHYI_IPMP) + +/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */ +#define IS_UNDER_IPMP(ill) \ + ((ill)->ill_grp != NULL && !IS_IPMP(ill)) + +/* Is ill1 in the same illgrp as ill2? */ +#define IS_IN_SAME_ILLGRP(ill1, ill2) \ + ((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp)) + +/* Is ill1 on the same LAN as ill2? */ +#define IS_ON_SAME_LAN(ill1, ill2) \ + ((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2)) + +#define ILL_OTHER(ill) \ + ((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 : \ + (ill)->ill_phyint->phyint_illv6) + +/* + * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6). + * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is + * guaranteed to persist while there are interfaces of that type in the group. + * In general, most fields are accessed outside of the IPSQ (e.g., in the + * datapath), and thus use locks in addition to the IPSQ for protection. + * + * synchronization: read write + * + * ig_if ipsq or ill_g_lock ipsq and ill_g_lock + * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock + * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_ipmp_ill write once write once + * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock + * ig_arpent ipsq ipsq + * ig_mtu ipsq ipsq + */ +typedef struct ipmp_illgrp_s { + list_t ig_if; /* list of all interfaces */ + list_t ig_actif; /* list of active interfaces */ + uint_t ig_nactif; /* number of active interfaces */ + struct ill_s *ig_next_ill; /* next active interface to use */ + struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */ + struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */ + list_t ig_arpent; /* list of ARP entries */ + uint_t ig_mtu; /* ig_ipmp_ill->ill_max_mtu */ +} ipmp_illgrp_t; + +/* + * IPMP group state structure -- one per IPMP group. Created when the + * IPMP meta-interface is plumbed; it is guaranteed to persist while there + * are interfaces in it. 
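+ * (As a sketch of the lifecycle, using routines declared later in this
+ * header: plumbing the IPMP meta-interface creates the group via
+ * ipmp_grp_create() and ties it to its phyint through gr_phyint; only once
+ * the last member interface is gone can ipmp_grp_destroy() tear it down.)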
+ * + * ipmp_grp_t synchronization: read write + * + * gr_name ipmp_lock ipmp_lock + * gr_ifname write once write once + * gr_mactype ipmp_lock ipmp_lock + * gr_phyint write once write once + * gr_nif ipmp_lock ipmp_lock + * gr_nactif ipsq ipsq + * gr_v4 ipmp_lock ipmp_lock + * gr_v6 ipmp_lock ipmp_lock + * gr_nv4 ipmp_lock ipmp_lock + * gr_nv6 ipmp_lock ipmp_lock + * gr_pendv4 ipmp_lock ipmp_lock + * gr_pendv6 ipmp_lock ipmp_lock + * gr_linkdownmp ipsq ipsq + * gr_ksp ipmp_lock ipmp_lock + * gr_kstats0 atomic atomic + */ +typedef struct ipmp_grp_s { + char gr_name[LIFGRNAMSIZ]; /* group name */ + char gr_ifname[LIFNAMSIZ]; /* interface name */ + t_uscalar_t gr_mactype; /* DLPI mactype of group */ + phyint_t *gr_phyint; /* IPMP group phyint */ + uint_t gr_nif; /* number of interfaces in group */ + uint_t gr_nactif; /* number of active interfaces */ + ipmp_illgrp_t *gr_v4; /* V4 group information */ + ipmp_illgrp_t *gr_v6; /* V6 group information */ + uint_t gr_nv4; /* number of ills in V4 group */ + uint_t gr_nv6; /* number of ills in V6 group */ + uint_t gr_pendv4; /* number of pending ills in V4 group */ + uint_t gr_pendv6; /* number of pending ills in V6 group */ + mblk_t *gr_linkdownmp; /* message used to bring link down */ + kstat_t *gr_ksp; /* group kstat pointer */ + uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */ +} ipmp_grp_t; + +/* + * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep + * ARP up-to-date as the active set of interfaces in the group changes. + */ +typedef struct ipmp_arpent_s { + mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */ + ipaddr_t ia_ipaddr; /* IP address for this entry */ + boolean_t ia_proxyarp; /* proxy ARP entry? */ + boolean_t ia_notified; /* ARP notified about this entry? */ + list_node_t ia_node; /* next ARP entry in list */ +} ipmp_arpent_t; + /* * IP Lower level Structure. * Instance data structure in ip_open when there is a device below us. @@ -1851,6 +1937,7 @@ typedef struct ill_s { mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */ mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */ mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */ + mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */ mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */ #define ill_last_mp_to_free ill_phys_addr_mp @@ -1867,21 +1954,19 @@ typedef struct ill_s { ill_dlpi_style_set : 1, ill_ifname_pending : 1, - ill_move_in_progress : 1, /* FAILOVER/FAILBACK in progress */ ill_join_allmulti : 1, ill_logical_down : 1, - ill_is_6to4tun : 1, /* Interface is a 6to4 tunnel */ + ill_promisc_on_phys : 1, /* phys interface in promisc mode */ ill_dl_up : 1, ill_up_ipifs : 1, - ill_note_link : 1, /* supports link-up notification */ + ill_capab_reneg : 1, /* capability renegotiation to be done */ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ ill_need_recover_multicast : 1, - - ill_pad_to_bit_31 : 16; + ill_pad_to_bit_31 : 17; /* Following bit fields protected by ill_lock */ uint_t @@ -1891,10 +1976,8 @@ typedef struct ill_s { ill_arp_closing : 1, ill_arp_bringup_pending : 1, - ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */ ill_arp_extend : 1, /* ARP has DAD extensions */ - - ill_pad_bit_31 : 25; + ill_pad_bit_31 : 26; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. 
@@ -1931,6 +2014,7 @@ typedef struct ill_s { */ uint8_t ill_max_hops; /* Maximum hops for any logical interface */ uint_t ill_max_mtu; /* Maximum MTU for any logical interface */ + uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */ uint32_t ill_reachable_time; /* Value for ND algorithm in msec */ uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */ uint_t ill_max_buf; /* Max # of req to buffer for ND */ @@ -1953,13 +2037,9 @@ typedef struct ill_s { * of the ipif. */ mblk_t *ill_arp_on_mp; - /* Peer ill of an IPMP move operation */ - struct ill_s *ill_move_peer; phyint_t *ill_phyint; uint64_t ill_flags; - ill_group_t *ill_group; - struct ill_s *ill_group_next; kmutex_t ill_lock; /* Please see table below */ /* @@ -2005,6 +2085,18 @@ typedef struct ill_s { void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ + /* + * IPMP fields. + */ + ipmp_illgrp_t *ill_grp; /* IPMP group information */ + list_node_t ill_actnode; /* next active ill in group */ + list_node_t ill_grpnode; /* next ill in group */ + ipif_t *ill_src_ipif; /* source address selection rotor */ + ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */ + boolean_t ill_nom_cast; /* nominated for mcast/bcast */ + uint_t ill_bound_cnt; /* # of data addresses bound to ill */ + ipif_t *ill_bound_ipif; /* ipif chain bound to ill */ + timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */ } ill_t; /* @@ -2088,6 +2180,7 @@ typedef struct ill_s { * * ill_max_mtu * + * ill_user_mtu ipsq + ill_lock ill_lock * ill_reachable_time ipsq + ill_lock ill_lock * ill_reachable_retrans_time ipsq + ill_lock ill_lock * ill_max_buf ipsq + ill_lock ill_lock @@ -2102,12 +2195,9 @@ typedef struct ill_s { * ill_arp_down_mp ipsq ipsq * ill_arp_del_mapping_mp ipsq ipsq * ill_arp_on_mp ipsq ipsq - * ill_move_peer ipsq ipsq * * ill_phyint ipsq, ill_g_lock, ill_lock Any of them * ill_flags ill_lock ill_lock - * ill_group ipsq, ill_g_lock, ill_lock Any of them - * ill_group_next ipsq, ill_g_lock, ill_lock Any of them * ill_nd_lla_mp ipsq + down ill only when ill is up * ill_nd_lla ipsq + down ill only when ill is up * ill_nd_lla_len ipsq + down ill only when ill is up @@ -2122,11 +2212,26 @@ typedef struct ill_s { * ill_ilm_walker_cnt ill_lock ill_lock * ill_nce_cnt ill_lock ill_lock * ill_ilm_cnt ill_lock ill_lock + * ill_src_ipif ill_g_lock ill_g_lock * ill_trace ill_lock ill_lock * ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock * ill_dhcpinit atomics atomics * ill_flownotify_mh write once write once * ill_capab_pending_cnt ipsq ipsq + * + * ill_bound_cnt ipsq ipsq + * ill_bound_ipif ipsq ipsq + * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock + * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock + * ill_src_ipif ill_g_lock ill_g_lock + * ill_move_ipif ipsq ipsq + * ill_nom_cast ipsq ipsq OR advisory + * ill_refresh_tid ill_lock ill_lock + * ill_grp (for IPMP ill) write once write once + * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock + * + * NOTE: It's OK to make heuristic decisions on an underlying interface + * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value. 
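+ *
+ * For example (sketch only; error handling and refcounting details are
+ * abbreviated), a caller that wants the IPMP meta-interface backing an
+ * underlying ill might do:
+ *
+ *	if (IS_UNDER_IPMP(ill)) {
+ *		ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
+ *		if (ipmp_ill != NULL) {
+ *			... use ipmp_ill ...
+ *			ill_refrele(ipmp_ill);
+ *		}
+ *	}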
*/ /* @@ -2167,7 +2272,7 @@ enum { IF_CMD = 1, LIF_CMD, TUN_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD }; #define IPI_MODOK 0x2 /* Permitted on mod instance of IP */ #define IPI_WR 0x4 /* Need to grab writer access */ #define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */ -#define IPI_REPL 0x10 /* valid for replacement ipif created in MOVE */ +/* unused 0x10 */ #define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */ #define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */ @@ -2176,17 +2281,6 @@ extern ip_ioctl_cmd_t ip_misc_ioctl_table[]; extern int ip_ndx_ioctl_count; extern int ip_misc_ioctl_count; -#define ILL_CLEAR_MOVE(ill) { \ - ill_t *peer_ill; \ - \ - peer_ill = (ill)->ill_move_peer; \ - ASSERT(peer_ill != NULL); \ - (ill)->ill_move_in_progress = B_FALSE; \ - peer_ill->ill_move_in_progress = B_FALSE; \ - (ill)->ill_move_peer = NULL; \ - peer_ill->ill_move_peer = NULL; \ -} - /* Passed down by ARP to IP during I_PLINK/I_PUNLINK */ typedef struct ipmx_s { char ipmx_name[LIFNAMSIZ]; /* if name */ @@ -2799,19 +2893,11 @@ typedef struct ip_pktinfo { (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \ IAM_WRITER_IPIF(ipif)) -/* - * These macros are used by critical set ioctls and failover ioctls to - * mark the ipif appropriately before starting the operation and to clear the - * marks after completing the operation. - */ -#define IPIF_UNMARK_MOVING(ipif) \ - (ipif)->ipif_state_flags &= ~IPIF_MOVING & ~IPIF_CHANGING; - #define ILL_UNMARK_CHANGING(ill) \ (ill)->ill_state_flags &= ~ILL_CHANGING; /* Macros used to assert that this thread is a writer */ -#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_writer == curthread) +#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread) #define IAM_WRITER_ILL(ill) IAM_WRITER_IPSQ((ill)->ill_phyint->phyint_ipsq) #define IAM_WRITER_IPIF(ipif) IAM_WRITER_ILL((ipif)->ipif_ill) @@ -2837,9 +2923,9 @@ typedef struct ip_pktinfo { #define RELEASE_ILL_LOCKS(ill_1, ill_2) \ { \ if (ill_1 != NULL) \ - mutex_exit(&(ill_1)->ill_lock); \ + mutex_exit(&(ill_1)->ill_lock); \ if (ill_2 != NULL && ill_2 != ill_1) \ - mutex_exit(&(ill_2)->ill_lock); \ + mutex_exit(&(ill_2)->ill_lock); \ } /* Get the other protocol instance ill */ @@ -2847,14 +2933,9 @@ typedef struct ip_pktinfo { ((ill)->ill_isv6 ? 
(ill)->ill_phyint->phyint_illv4 : \ (ill)->ill_phyint->phyint_illv6) -#define MATCH_V4_ONLY 0x1 -#define MATCH_V6_ONLY 0x2 -#define MATCH_ILL_ONLY 0x4 - /* ioctl command info: Ioctl properties extracted and stored in here */ typedef struct cmd_info_s { - char ci_groupname[LIFNAMSIZ + 1]; /* SIOCSLIFGROUPNAME */ ipif_t *ci_ipif; /* ipif associated with [l]ifreq ioctl's */ sin_t *ci_sin; /* the sin struct passed down */ sin6_t *ci_sin6; /* the sin6_t struct passed down */ @@ -2990,10 +3071,8 @@ extern struct module_info ip_mod_info; ((ipst)->ips_ip6_loopback_out_event.he_interested) /* - * Hooks marcos used inside of ip + * Hooks macros used inside of ip */ -#define IPHA_VHL ipha_version_and_hdr_length - #define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ \ if ((_hook).he_interested) { \ @@ -3002,21 +3081,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) || (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv4_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3026,10 +3092,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3046,21 +3110,8 @@ extern struct module_info ip_mod_info; _NOTE(CONSTCOND) \ ASSERT((_ilp != NULL) || (_olp != NULL)); \ \ - _NOTE(CONSTCOND) \ - if ((_ilp != NULL) && \ - (((ill_t *)(_ilp))->ill_phyint != NULL)) \ - info.hpe_ifp = (phy_if_t)((ill_t *) \ - (_ilp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ifp = 0; \ - \ - _NOTE(CONSTCOND) \ - if ((_olp != NULL) && \ - (((ill_t *)(_olp))->ill_phyint != NULL)) \ - info.hpe_ofp = (phy_if_t)((ill_t *) \ - (_olp))->ill_phyint->phyint_hook_ifindex; \ - else \ - info.hpe_ofp = 0; \ + FW_SET_ILL_INDEX(info.hpe_ifp, (ill_t *)_ilp); \ + FW_SET_ILL_INDEX(info.hpe_ofp, (ill_t *)_olp); \ info.hpe_protocol = ipst->ips_ipv6_net_data; \ info.hpe_hdr = _iph; \ info.hpe_mp = &(_fm); \ @@ -3070,10 +3121,8 @@ extern struct module_info ip_mod_info; _event, (hook_data_t)&info) != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ + freemsg(_fm); \ + _fm = NULL; \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3082,6 +3131,17 @@ extern struct module_info ip_mod_info; } \ } +#define FW_SET_ILL_INDEX(fp, ill) \ + _NOTE(CONSTCOND) \ + if ((ill) == NULL || (ill)->ill_phyint == NULL) { \ + (fp) = 0; \ + _NOTE(CONSTCOND) \ + } else if (IS_UNDER_IPMP(ill)) { \ + (fp) = ipmp_ill_get_ipmp_ifindex(ill); \ + } else { \ + (fp) = (ill)->ill_phyint->phyint_ifindex; \ + } + /* * Network byte order macros */ @@ -3146,16 +3206,15 @@ struct ipsec_out_s; struct mac_header_info_s; -extern boolean_t ip_assign_ifindex(uint_t *, ip_stack_t *); extern void ill_frag_timer(void *); extern ill_t 
*ill_first(int, int, ill_walk_context_t *, ip_stack_t *); extern ill_t *ill_next(ill_walk_context_t *, ill_t *); extern void ill_frag_timer_start(ill_t *); extern void ill_nic_event_dispatch(ill_t *, lif_if_t, nic_event_t, nic_event_data_t, size_t); -extern void ill_nic_event_plumb(ill_t *, boolean_t); extern mblk_t *ip_carve_mp(mblk_t **, ssize_t); extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); +extern mblk_t *ip_dlnotify_alloc(uint_t, uint_t); extern char *ip_dot_addr(ipaddr_t, char *); extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); @@ -3239,8 +3298,49 @@ extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); extern struct qinit iprinitv6; extern struct qinit ipwinitv6; -extern void conn_drain_insert(conn_t *connp); -extern int conn_ipsec_length(conn_t *connp); +extern void ipmp_init(ip_stack_t *); +extern void ipmp_destroy(ip_stack_t *); +extern ipmp_grp_t *ipmp_grp_create(const char *, phyint_t *); +extern void ipmp_grp_destroy(ipmp_grp_t *); +extern void ipmp_grp_info(const ipmp_grp_t *, lifgroupinfo_t *); +extern int ipmp_grp_rename(ipmp_grp_t *, const char *); +extern ipmp_grp_t *ipmp_grp_lookup(const char *, ip_stack_t *); +extern int ipmp_grp_vet_phyint(ipmp_grp_t *, phyint_t *); +extern ipmp_illgrp_t *ipmp_illgrp_create(ill_t *); +extern void ipmp_illgrp_destroy(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *); +extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *); +extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *); +extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *); +extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *); +extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *, + boolean_t); +extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *); +extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *); +extern void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); +extern ill_t *ipmp_illgrp_find_ill(ipmp_illgrp_t *, uchar_t *, uint_t); +extern void ipmp_illgrp_link_grp(ipmp_illgrp_t *, ipmp_grp_t *); +extern int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *); +extern uint_t ipmp_ill_get_ipmp_ifindex(const ill_t *); +extern void ipmp_ill_join_illgrp(ill_t *, ipmp_illgrp_t *); +extern void ipmp_ill_leave_illgrp(ill_t *); +extern ill_t *ipmp_ill_hold_ipmp_ill(ill_t *); +extern boolean_t ipmp_ill_is_active(ill_t *); +extern void ipmp_ill_refresh_active(ill_t *); +extern void ipmp_phyint_join_grp(phyint_t *, ipmp_grp_t *); +extern void ipmp_phyint_leave_grp(phyint_t *); +extern void ipmp_phyint_refresh_active(phyint_t *); +extern ill_t *ipmp_ipif_bound_ill(const ipif_t *); +extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); +extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); +extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); + +extern void conn_drain_insert(conn_t *connp); +extern int conn_ipsec_length(conn_t *connp); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); @@ -3274,9 +3374,6 @@ extern int ip_srcid_report(queue_t *, mblk_t *, caddr_t, cred_t *); extern uint8_t ipoptp_next(ipoptp_t *); extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *); extern int ip_opt_get_user(const ipha_t *, 
uchar_t *); -extern ill_t *ip_grab_attach_ill(ill_t *, mblk_t *, int, boolean_t, - ip_stack_t *); -extern ire_t *conn_set_outgoing_ill(conn_t *, ire_t *, ill_t **); extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); @@ -3295,7 +3392,6 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t); extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *); extern void conn_ioctl_cleanup(conn_t *); extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *); -extern ill_t *ip_newroute_get_dst_ill(ill_t *); struct multidata_s; struct pdesc_s; @@ -3314,9 +3410,6 @@ extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, uint_t); extern void ip_unbind(conn_t *connp); -extern phyint_t *phyint_lookup_group(char *, boolean_t, ip_stack_t *); -extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *); - extern void tnet_init(void); extern void tnet_fini(void); @@ -3434,6 +3527,8 @@ typedef struct ipobs_cb { * ihd_ifindex Interface index that the packet was received/sent over. * For local packets, this is the index of the interface * associated with the local destination address. + * ihd_grifindex IPMP group interface index (zero unless ihd_ifindex + * is an IPMP underlying interface). * ihd_stack Netstack the packet is from. */ typedef struct ipobs_hook_data { @@ -3443,6 +3538,7 @@ typedef struct ipobs_hook_data { ipobs_hook_type_t ihd_htype; uint16_t ihd_ipver; uint64_t ihd_ifindex; + uint64_t ihd_grifindex; netstack_t *ihd_stack; } ipobs_hook_data_t; diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 3f967ea183..d484831a3c 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -1892,7 +1892,6 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) * case MRT_VERSION: * case MRT_ASSERT: * case IP_SEC_OPT: - * case IP_DONTFAILOVER_IF: * case IP_NEXTHOP: */ default: @@ -2481,7 +2480,6 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case MRT_VERSION: case MRT_ASSERT: case IP_SEC_OPT: - case IP_DONTFAILOVER_IF: case IP_NEXTHOP: /* * "soft" error (negative) @@ -3014,9 +3012,7 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, case IPV6_PATHMTU: return (EINVAL); - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at IP level */ diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 4f15801dfb..24ba9d689c 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -138,9 +138,6 @@ opdes_t icmp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, @@ -222,12 +219,6 @@ opdes_t icmp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index 091509c71e..681f198aa7 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -46,7 +46,7 @@ #include <sys/cmn_err.h> #include <sys/atomic.h> #include <sys/zone.h> - +#include <sys/callb.h> #include <sys/param.h> #include <sys/socket.h> #include <inet/ipclassifier.h> @@ -83,7 +83,7 @@ static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); - +static void mcast_signal_restart_thread(ip_stack_t *ipst); /* * Macros used to do timer len conversions. Timer values are always @@ -122,7 +122,7 @@ static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); * The first multicast join will trigger the igmp timers / mld timers * The unit for next is milliseconds. */ -void +static void igmp_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -207,7 +207,7 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) * mld_start_timers: * The unit for next is milliseconds. */ -void +static void mld_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; @@ -306,7 +306,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) uint32_t group; uint_t next; ipif_t *ipif; - ip_stack_t *ipst; + ip_stack_t *ipst; + ilm_walker_t ilw; ASSERT(ill != NULL); ASSERT(!ill->ill_isv6); @@ -401,8 +402,7 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) "igmp_input: we are only " "member src 0x%x ipif_local 0x%x", (int)ntohl(src), - (int) - ntohl(ipif->ipif_lcl_addr)); + (int)ntohl(ipif->ipif_lcl_addr)); } mutex_exit(&ill->ill_lock); return (mp); @@ -440,23 +440,20 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) } /* - * If we belong to the group being reported, and - * we are a 'Delaying member' in the RFC terminology, - * stop our timer for that group and 'clear flag' i.e. - * mark as IGMP_OTHERMEMBER. Do this for all logical - * interfaces on the given physical interface. + * If our ill has ILMs that belong to the group being + * reported, and we are a 'Delaying Member' in the RFC + * terminology, stop our timer for that group and 'clear + * flag' i.e. mark as IGMP_OTHERMEMBER. 
*/ - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - ilm = ilm_lookup_ipif(ipif, group); - if (ilm != NULL) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_addr == group) { ++ipst->ips_igmpstat.igps_rcv_ourreports; ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - } /* for */ - mutex_exit(&ill->ill_lock); + } + ilm_walker_finish(&ilw); break; case IGMP_V3_MEMBERSHIP_REPORT: @@ -485,6 +482,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) int timer; uint_t next, current; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; @@ -583,11 +581,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * the maximum timeout. */ next = (unsigned)INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all @@ -610,6 +609,7 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -623,6 +623,7 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; + ilm_walker_t ilw; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ @@ -693,8 +694,9 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || @@ -749,6 +751,7 @@ group_query: ilm->ilm_timer += current; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -819,13 +822,22 @@ igmp_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer igmp_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * We need to restart the IGMP timers, but we can't do it here + * since we're inside the IPSQ and thus igmp_start_timers() -> + * untimeout() (inside the IPSQ, waiting for a running timeout + * to finish) could deadlock with igmp_timeout_handler() -> + * ipsq_enter() (running the timeout, waiting to get inside + * the IPSQ). We also can't just delay it until after we + * ipsq_exit() since we could be inside more than one IPSQ and + * thus still have the other IPSQs pinned after we exit -- and + * igmp_start_timers() may be trying to enter one of those. + * Instead, signal a dedicated thread that will do it for us. */ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -897,13 +909,14 @@ mld_joingroup(ilm_t *ilm) mutex_exit(&ill->ill_lock); /* - * To avoid deadlock, we defer mld_start_timers() to - * ipsq_exit(). See the comment in ipsq_exit() for details. + * Signal another thread to restart the timers. See the + * comment in igmp_joingroup() for details. 
*/ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -1073,8 +1086,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a delayed start of the timer as - * we're currently in the sq). + * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { @@ -1085,6 +1098,7 @@ send_to_in: ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1161,8 +1175,8 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, start it (need to do a deferred start of the timer as - * we're currently in the sq). + * running, signal a thread to restart it -- see the comment in + * igmp_joingroup() for details. */ rp = mcast_merge_rtx(ilm, rp, flist); ASSERT(ilm->ilm_rtx.rtx_cnt > 0); @@ -1174,6 +1188,7 @@ send_to_in: MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_mld_timer_lock); + mcast_signal_restart_thread(ipst); } mutex_exit(&ill->ill_lock); @@ -1397,12 +1412,10 @@ per_ilm_rtxtimer: * * igmp_input() receives igmp queries and responds to the queries * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers(). - * Later the igmp_timer fires, the timeout handler igmp_timerout_handler() + * Later the igmp_timer fires, the timeout handler igmp_timeout_handler() * performs the action exclusively after entering each ill's ipsq as writer. - * The actual igmp timeout handler needs to run in the ipsq since it has to - * access the ilm's and we don't want another exclusive operation like - * say an IPMP failover to be simultaneously moving the ilms from one ill to - * another. + * (The need to enter the IPSQ is largely historical but there are still some + * fields like ilm_filter that rely on it.) * * The igmp_slowtimeo() function is called thru another timer. 
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id @@ -1420,7 +1433,6 @@ igmp_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); - ipst->ips_igmp_timer_thread = curthread; ipst->ips_igmp_timer_scheduled_last = 0; ipst->ips_igmp_time_to_next = 0; mutex_exit(&ipst->ips_igmp_timer_lock); @@ -1452,7 +1464,6 @@ igmp_timeout_handler(void *arg) mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); ipst->ips_igmp_timeout_id = 0; - ipst->ips_igmp_timer_thread = NULL; mutex_exit(&ipst->ips_igmp_timer_lock); if (global_next != INFINITY) @@ -1663,7 +1674,6 @@ mld_timeout_handler(void *arg) ASSERT(arg != NULL); mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); - ipst->ips_mld_timer_thread = curthread; ipst->ips_mld_timer_scheduled_last = 0; ipst->ips_mld_time_to_next = 0; mutex_exit(&ipst->ips_mld_timer_lock); @@ -1695,7 +1705,6 @@ mld_timeout_handler(void *arg) mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); ipst->ips_mld_timeout_id = 0; - ipst->ips_mld_timer_thread = NULL; mutex_exit(&ipst->ips_mld_timer_lock); if (global_next != INFINITY) @@ -1871,7 +1880,7 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) int hdrlen = sizeof (ipha_t) + RTRALERT_LEN; size_t size = hdrlen + sizeof (igmpa_t); ipif_t *ipif = ilm->ilm_ipif; - ill_t *ill = ipif->ipif_ill; /* Will be the "lower" ill */ + ill_t *ill = ipif->ipif_ill; mblk_t *first_mp; ipsec_out_t *io; zoneid_t zoneid; @@ -1887,14 +1896,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) * not get forwarded on other interfaces or looped back, we * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop * to B_FALSE. - * - * We also need to make sure that this does not get load balanced - * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if - * here. If it gets load balanced, switches supporting igmp snooping - * will send the packet that it receives for this multicast group - * to the interface that we are sending on. As we have joined the - * multicast group on this ill, by sending the packet out on this - * ill, we receive all the packets back on this ill. */ first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); if (first_mp == NULL) @@ -1909,7 +1910,6 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES) @@ -1995,6 +1995,8 @@ igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; + ASSERT(IAM_WRITER_IPIF(ipif)); + /* if there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2022,6 +2024,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (grphdra_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. 
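+ * (That is, unless there is room for more than a single source address.)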
+ */ + if (srcspace <= sizeof (ipaddr_t)) + continue; + srcsperpkt = srcspace / sizeof (ipaddr_t); /* * Increment size and numrec, because we will @@ -2082,7 +2092,6 @@ nextpkt: io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) @@ -2188,6 +2197,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) uint_t next; int mldlen; ip_stack_t *ipst = ill->ill_ipst; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); @@ -2294,7 +2304,6 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) return; } - /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer @@ -2303,8 +2312,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ - mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, @@ -2313,7 +2322,7 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); break; } case MLD_LISTENER_REDUCTION: @@ -2343,6 +2352,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) int timer; uint_t next, current; in6_addr_t *v6group; + ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); @@ -2397,10 +2407,12 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) * maximum timeout. */ next = INFINITY; - mutex_enter(&ill->ill_lock); + ilm = ilm_walker_start(&ilw, ill); + mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || @@ -2430,6 +2442,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) } } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); return (next); } @@ -2446,6 +2459,7 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; + ilm_walker_t ilw; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); @@ -2518,8 +2532,9 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) } else { /* group or group/source specific query */ + ilm = ilm_walker_start(&ilw, ill); mutex_enter(&ill->ill_lock); - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || @@ -2574,6 +2589,7 @@ group_query: break; } mutex_exit(&ill->ill_lock); + ilm_walker_finish(&ilw); } return (next); @@ -2591,9 +2607,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); - ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */ + ill_t *ill = ilm->ilm_ill; ipif_t *ipif; - ip6i_t *ip6i; /* * We need to place a router alert option in this packet. 
The length @@ -2605,30 +2620,14 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) ASSERT(ill->ill_isv6); - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and - * ip_newroute_ipif_v6 knows how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t) + router_alert_length; + size += router_alert_length; mp = allocb(size, BPRI_HI); if (mp == NULL) return; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&ip6i[1]; + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (struct ip6_hbh *)&ip6h[1]; ip6router = (struct ip6_opt_router *)&ip6hbh[1]; /* @@ -2698,7 +2697,6 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) in6_addr_t *srcarray; ip6_t *ip6h; ip6_hbh_t *ip6hbh; - ip6i_t *ip6i; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; ipif_t *ipif; @@ -2707,6 +2705,8 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) mrec_t *next_reclist = reclist; boolean_t morepkts; + ASSERT(IAM_WRITER_ILL(ill)); + /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; @@ -2743,6 +2743,14 @@ nextpkt: int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (mld2mar_t)); + + /* + * Skip if there's not even enough room in + * a single packet to send something useful. + */ + if (srcspace <= sizeof (in6_addr_t)) + continue; + srcsperpkt = srcspace / sizeof (in6_addr_t); /* * Increment icmpsize and size, because we will @@ -2787,30 +2795,13 @@ nextpkt: size += rsize; } - /* - * We need to make sure that this packet does not get load balanced. - * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and - * ip_newroute_ipif_v6 know how to handle such packets. - * If it gets load balanced, switches supporting MLD snooping - * (in the future) will send the packet that it receives for this - * multicast group to the interface that we are sending on. As we have - * joined the multicast group on this ill, by sending the packet out - * on this ill, we receive all the packets back on this ill. - */ - size += sizeof (ip6i_t); mp = allocb(size, BPRI_HI); if (mp == NULL) goto free_reclist; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - - ip6h = (ip6_t *)&(ip6i[1]); + ip6h = (ip6_t *)mp->b_rptr; ip6hbh = (ip6_hbh_t *)&(ip6h[1]); ip6router = (struct ip6_opt_router *)&(ip6hbh[1]); mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen); @@ -3102,3 +3093,64 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) return (rtnmrec); } + +/* + * Convenience routine to signal the restart-timer thread. 
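+ * Callers simply set IP_MRT_RUN under ips_mrt_lock and wake the per-stack
+ * thread, which then picks up any deferred IGMP/MLD timer values and calls
+ * igmp_start_timers()/mld_start_timers() outside the IPSQ.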
+ */ +static void +mcast_signal_restart_thread(ip_stack_t *ipst) +{ + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_RUN; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); +} + +/* + * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for + * the story behind this unfortunate thread. + */ +void +mcast_restart_timers_thread(ip_stack_t *ipst) +{ + int next; + char name[64]; + callb_cpr_t cprinfo; + + (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d", + ipst->ips_netstack->netstack_stackid); + CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name); + + for (;;) { + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock); + CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock); + } + if (ipst->ips_mrt_flags & IP_MRT_STOP) + break; + ipst->ips_mrt_flags &= ~IP_MRT_RUN; + mutex_exit(&ipst->ips_mrt_lock); + + mutex_enter(&ipst->ips_igmp_timer_lock); + next = ipst->ips_igmp_deferred_next; + ipst->ips_igmp_deferred_next = INFINITY; + mutex_exit(&ipst->ips_igmp_timer_lock); + + if (next != INFINITY) + igmp_start_timers(next, ipst); + + mutex_enter(&ipst->ips_mld_timer_lock); + next = ipst->ips_mld_deferred_next; + ipst->ips_mld_deferred_next = INFINITY; + mutex_exit(&ipst->ips_mld_timer_lock); + if (next != INFINITY) + mld_start_timers(next, ipst); + } + + ipst->ips_mrt_flags |= IP_MRT_DONE; + cv_signal(&ipst->ips_mrt_done_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */ + thread_exit(); +} diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 1d0bcf37de..dd87a09974 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -170,11 +170,14 @@ typedef struct listptr_s listptr_t; */ typedef struct iproutedata_s { uint_t ird_idx; + uint_t ird_flags; /* see below */ listptr_t ird_route; /* ipRouteEntryTable */ listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ listptr_t ird_attrs; /* ipRouteAttributeTable */ } iproutedata_t; +#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */ + /* * Cluster specific hooks. These should be NULL when booted as a non-cluster */ @@ -228,31 +231,27 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any * MT level protection given by STREAMS. IP uses a combination of its own * internal serialization mechanism and standard Solaris locking techniques. - * The internal serialization is per phyint (no IPMP) or per IPMP group. - * This is used to serialize plumbing operations, IPMP operations, certain - * multicast operations, most set ioctls, igmp/mld timers etc. + * The internal serialization is per phyint. This is used to serialize + * plumbing operations, certain multicast operations, most set ioctls, + * igmp/mld timers etc. * * Plumbing is a long sequence of operations involving message * exchanges between IP, ARP and device drivers. Many set ioctls are typically * involved in plumbing operations. A natural model is to serialize these * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in * parallel without any interference. 
But various set ioctls on hme0 are best - * serialized. However if the system uses IPMP, the operations are easier if - * they are serialized on a per IPMP group basis since IPMP operations - * happen across ill's of a group. Thus the lowest common denominator is to - * serialize most set ioctls, multicast join/leave operations, IPMP operations - * igmp/mld timer operations, and processing of DLPI control messages received - * from drivers on a per IPMP group basis. If the system does not employ - * IPMP the serialization is on a per phyint basis. This serialization is - * provided by the ipsq_t and primitives operating on this. Details can - * be found in ip_if.c above the core primitives operating on ipsq_t. + * serialized, along with multicast join/leave operations, igmp/mld timer + * operations, and processing of DLPI control messages received from drivers + * on a per phyint basis. This serialization is provided by the ipsq_t and + * primitives operating on this. Details can be found in ip_if.c above the + * core primitives operating on ipsq_t. * * Lookups of an ipif or ill by a thread return a refheld ipif / ill. * Simiarly lookup of an ire by a thread also returns a refheld ire. * In addition ipif's and ill's referenced by the ire are also indirectly * refheld. Thus no ipif or ill can vanish nor can critical parameters like * the ipif's address or netmask change as long as an ipif is refheld - * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the + * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only * 1 such exclusive operation proceeds at any time on the ipif. It then * deletes all ires associated with this ipif, and waits for all refcnts @@ -281,33 +280,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * - ill_g_lock: This is a global reader/writer lock. Protects the following * * The AVL tree based global multi list of all ills. * * The linked list of all ipifs of an ill - * * The <ill-ipsq> mapping - * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next - * * The illgroup list threaded by ill_group_next. + * * The <ipsq-xop> mapping * * <ill-phyint> association * Insertion/deletion of an ill in the system, insertion/deletion of an ipif - * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion - * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill - * will all have to hold the ill_g_lock as writer for the actual duration - * of the insertion/deletion/change. More details about the <ill-ipsq> mapping - * may be found in the IPMP section. + * into an ill, changing the <ipsq-xop> mapping of an ill, changing the + * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as + * writer for the actual duration of the insertion/deletion/change. * * - ill_lock: This is a per ill mutex. - * It protects some members of the ill and is documented below. - * It also protects the <ill-ipsq> mapping - * It also protects the illgroup list threaded by ill_group_next. + * It protects some members of the ill_t struct; see ip.h for details. * It also protects the <ill-phyint> assoc. * It also protects the list of ipifs hanging off the ill. * * - ipsq_lock: This is a per ipsq_t mutex lock. - * This protects all the other members of the ipsq struct except - * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock + * This protects some members of the ipsq_t struct; see ip.h for details. 
+ * It also protects the <ipsq-ipxop> mapping * - * - illgrp_lock: This is a per ill_group mutex lock. - * The only thing it protects is the illgrp_ill_schednext member of ill_group - * which dictates which is the next ill in an ill_group that is to be chosen - * for sending outgoing packets, through creation of an IRE_CACHE that - * references this ill. + * - ipx_lock: This is a per ipxop_t mutex lock. + * This protects some members of the ipxop_t struct; see ip.h for details. * * - phyint_lock: This is a per phyint mutex lock. Protects just the * phyint_flags @@ -335,27 +325,24 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * Note, it is only necessary to take this lock if the ill_usesrc_grp_next * field is changing state i.e from NULL to non-NULL or vice-versa. For * example, it is not necessary to take this lock in the initial portion - * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and - * ip_sioctl_flags since the these operations are executed exclusively and - * that ensures that the "usesrc group state" cannot change. The "usesrc - * group state" change can happen only in the latter part of - * ip_sioctl_slifusesrc and in ill_delete. + * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these + * operations are executed exclusively and that ensures that the "usesrc + * group state" cannot change. The "usesrc group state" change can happen + * only in the latter part of ip_sioctl_slifusesrc and in ill_delete. * - * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> assocications. + * Changing <ill-phyint>, <ipsq-xop> assocications: * * To change the <ill-phyint> association, the ill_g_lock must be held * as writer, and the ill_locks of both the v4 and v6 instance of the ill * must be held. * - * To change the <ill-ipsq> association the ill_g_lock must be held as writer - * and the ill_lock of the ill in question must be held. - * - * To change the <ill-illgroup> association the ill_g_lock must be held as - * writer and the ill_lock of the ill in question must be held. + * To change the <ipsq-xop> association, the ill_g_lock must be held as + * writer, the ipsq_lock must be held, and one must be writer on the ipsq. + * This is only done when ills are added or removed from IPMP groups. * * To add or delete an ipif from the list of ipifs hanging off the ill, * ill_g_lock (writer) and ill_lock must be held and the thread must be - * a writer on the associated ipsq,. + * a writer on the associated ipsq. * * To add or delete an ill to the system, the ill_g_lock must be held as * writer and the thread must be a writer on the associated ipsq. @@ -367,8 +354,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * * Some lock hierarchy scenarios are listed below. * - * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ill_g_lock -> illgrp_lock -> ill_lock + * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock * ill_g_lock -> ill_lock(s) -> phyint_lock * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock * ill_g_lock -> ip_addr_avail_lock @@ -587,8 +573,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which is IRE_LOCAL is - * associated. + * the same ill as the ill with which is IRE_LOCAL is associated. 
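To make the association rules above concrete, a minimal sketch of the documented order for changing an <ill-phyint> association, assuming the global lock is reachable as ipst->ips_ill_g_lock and that phyint_illv4/phyint_illv6 name the v4 and v6 ill instances (those identifiers are assumptions; only the lock ordering comes from the rules stated above):

	/*
	 * Sketch: hold ill_g_lock as writer, then the ill_lock of both the
	 * v4 and v6 instances, for the duration of the change.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (phyi->phyint_illv4 != NULL)
		mutex_enter(&phyi->phyint_illv4->ill_lock);
	if (phyi->phyint_illv6 != NULL)
		mutex_enter(&phyi->phyint_illv6->ill_lock);

	/* ... rebind the <ill-phyint> association here ... */

	if (phyi->phyint_illv6 != NULL)
		mutex_exit(&phyi->phyint_illv6->ill_lock);
	if (phyi->phyint_illv4 != NULL)
		mutex_exit(&phyi->phyint_illv4->ill_lock);
	rw_exit(&ipst->ips_ill_g_lock);

This follows the ill_g_lock -> ill_lock(s) ordering listed in the lock hierarchy scenarios.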
* * Multiple zones can share a common broadcast address; typically all zones * share the 255.255.255.255 address. Incoming as well as locally originated @@ -695,8 +680,8 @@ static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, ip_stack_t *); -static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, - uint16_t *); +static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *, + uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); @@ -723,9 +708,9 @@ static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); -static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, +static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); @@ -775,8 +760,6 @@ static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); static int ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); @@ -946,8 +929,6 @@ static ipndp_t lcl_ndp_arr[] = { { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, "ip_cgtp_filter" }, #define IPNDP_IPMP_HOOK_OFFSET 10 - { ip_param_generic_get, ipmp_hook_emulation_set, NULL, - "ipmp_hook_emulation" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, "ip_debug" }, }; @@ -984,20 +965,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, - /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_addr, NULL }, /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL }, /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), - IPI_MODOK | IPI_GET_CMD | IPI_REPL, + IPI_MODOK | IPI_GET_CMD, IF_CMD, ip_sioctl_get_flags, NULL }, /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1009,31 +989,28 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_mtu, NULL }, - /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_mtu, NULL }, /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 024 */ { SIOCSIFBRDADDR, sizeof (struct 
ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_brdaddr, NULL }, /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL }, /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), - IPI_GET_CMD | IPI_REPL, - IF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL }, /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, IF_CMD, ip_sioctl_metric, NULL }, /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* See 166-168 below for extended SIOC*XARP ioctls */ - /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, + /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, - /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, + /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD, ARP_CMD, ip_sioctl_arp, NULL }, - /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, + /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, ARP_CMD, ip_sioctl_arp, NULL }, /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1098,21 +1075,19 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, + /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_ifnum, NULL }, - /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_muxid, NULL }, /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL }, /* Both if and lif variants share same func */ - /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, + /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD, IF_CMD, ip_sioctl_get_lifindex, NULL }, /* Both if and lif variants share same func */ /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - IF_CMD, ip_sioctl_slifindex, NULL }, + IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL }, /* copyin size cannot be coded for SIOCGIFCONF */ /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, @@ -1136,28 +1111,25 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_removeif, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif, ip_sioctl_removeif_restart }, /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), - IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, + IPI_GET_CMD | IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addif, NULL }, #define SIOCLIFADDR_NDX 112 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_addr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL }, /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_dstaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL }, 
/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), - IPI_GET_CMD | IPI_MODOK | IPI_REPL, + IPI_GET_CMD | IPI_MODOK, LIF_CMD, ip_sioctl_get_flags, NULL }, /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1167,58 +1139,48 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_get_lifconf, NULL }, /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_mtu, NULL }, - /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, + /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD, LIF_CMD, ip_sioctl_get_mtu, NULL }, /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_brdaddr, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL }, /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_brdaddr, NULL }, /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_netmask, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL }, /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_metric, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL }, /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_metric, NULL }, /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, + IPI_PRIV | IPI_WR | IPI_MODOK, LIF_CMD, ip_sioctl_slifname, ip_sioctl_slifname_restart }, - /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, + /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifnum, NULL }, /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_muxid, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL }, /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_muxid, NULL }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL }, /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifindex, 0 }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 }, /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifindex, 0 }, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 }, /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_token, NULL }, /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_token, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL }, /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_subnet, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL }, /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_lnkinfo, NULL }, /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, /* 141 */ { SIOCLIFDELND, sizeof (struct 
lifreq), IPI_PRIV, LIF_CMD, ip_siocdelndp_v6, NULL }, /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, @@ -1231,8 +1193,8 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MISC_CMD, ip_sioctl_tonlink, NULL }, /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, MISC_CMD, ip_sioctl_tmysite, NULL }, - /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, - TUN_CMD, ip_sioctl_tunparam, NULL }, + /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0, + TUN_CMD, ip_sioctl_tunparam, NULL }, /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, @@ -1243,29 +1205,24 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, - /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, - /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_move, ip_sioctl_move }, + /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, + + /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD | + IPI_WR, LIF_CMD, ip_sioctl_get_binding, NULL }, /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_groupname, NULL }, - /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_oindex, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, + /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), + IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, - /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, - LIF_CMD, ip_sioctl_slifoindex, NULL }, + /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* These are handled in ip_sioctl_copyin_setup itself */ /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, @@ -1277,22 +1234,20 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, ip_sioctl_get_lifconf, NULL }, - /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, - /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, + /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, XARP_CMD, ip_sioctl_arp, NULL }, - /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, + /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, XARP_CMD, ip_sioctl_arp, NULL }, /* SIOCPOPSOCKFS is not handled by IP */ /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), - IPI_GET_CMD | IPI_REPL, - LIF_CMD, ip_sioctl_get_lifzone, NULL }, + IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), - IPI_PRIV | IPI_WR | IPI_REPL, - LIF_CMD, ip_sioctl_slifzone, + IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, ip_sioctl_slifzone_restart }, /* 172-174 are SCTP ioctls and not handled by IP */ /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, @@ -1315,8 +1270,7 @@ 
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, - ip_sioctl_set_ipmpfailback, NULL }, + /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* SIOCSENABLESDP is handled by SDP */ /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, @@ -1326,7 +1280,7 @@ int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), - IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, + IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL }, { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, TUN_CMD, ip_sioctl_tunparam, NULL }, { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, @@ -1336,11 +1290,11 @@ ip_ioctl_cmd_t ip_misc_ioctl_table[] = { { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, { IP_IOCTL, 0, 0, 0, NULL, NULL }, - { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, - { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, + { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl} }; @@ -1629,8 +1583,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, ipif_t *ipif; mblk_t *first_mp; ipsec_in_t *ii; - ire_t *src_ire; - boolean_t onlink; timestruc_t now; uint32_t ill_index; ip_stack_t *ipst; @@ -2014,59 +1966,6 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for detecting - * NIC failures are ECHO packets. We turn-off load spreading - * by setting ipsec_in_attach_if to B_TRUE, which is copied - * to ipsec_out_attach_if by ipsec_in_to_out called later in this - * function. This is in turn handled by ip_wput and ip_newroute - * to make sure that the packet goes out on the interface it came - * in on. If we don't turnoff load spreading, the packets might get - * dropped if there are no non-FAILED/INACTIVE interfaces for it - * to go out and in.mpathd would wrongly detect a failure or - * mis-detect a NIC failure for link failure. As load spreading - * can happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We turn off load spreading only on echo packets that came from - * on-link hosts. If the interface route has been deleted, this will - * not be enforced as we can't do much. For off-link hosts, as the - * default routes in IPv4 does not typically have an ire_ipif - * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. - * Moreover, expecting a default route through this interface may - * not be correct. We use ipha_dst because of the swap above. - */ - onlink = B_FALSE; - if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { - /* - * First, we need to make sure that it is not one of our - * local addresses. 
If we set onlink when it is one of - * our local addresses, we will end up creating IRE_CACHES - * for one of our local addresses. Then, we will never - * accept packets for them afterwards. - */ - src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); - ipif_refrele(ipif); - if (src_ire != NULL) { - onlink = B_TRUE; - ire_refrele(src_ire); - } - } else { - ire_refrele(src_ire); - } - } if (!mctl_present) { /* * This packet should go out the same way as it @@ -2085,20 +1984,7 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, /* This is not a secure packet */ ii->ipsec_in_secure = B_FALSE; - if (onlink) { - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - } first_mp->b_cont = mp; - } else if (onlink) { - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_attach_if = B_TRUE; - ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } else { ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ @@ -3733,7 +3619,6 @@ ipif_dup_recovery(void *arg) ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; mblk_t *arp_del_mp; - area_t *area; ip_stack_t *ipst = ill->ill_ipst; ipif->ipif_recovery_id = 0; @@ -3744,12 +3629,13 @@ ipif_dup_recovery(void *arg) */ if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || (ipif->ipif_flags & IPIF_POINTOPOINT) || - (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & (IPIF_CONDEMNED))) { /* No reason to try to bring this address back. 
*/ return; } - if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) + /* ACE_F_UNVERIFIED restarts DAD */ + if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) goto alloc_fail; if (ipif->ipif_arp_del_mp == NULL) { @@ -3758,10 +3644,6 @@ ipif_dup_recovery(void *arg) ipif->ipif_arp_del_mp = arp_del_mp; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); return; @@ -3873,6 +3755,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) EINPROGRESS) { ipif->ipif_addr_ready = 1; (void) ipif_up_done(ipif); + ASSERT(ill->ill_move_ipif == NULL); } continue; } @@ -3893,6 +3776,7 @@ ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) ill->ill_net_type == IRE_IF_RESOLVER && !(ipif->ipif_state_flags & IPIF_CONDEMNED) && ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } @@ -4196,8 +4080,9 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, { mblk_t *mp; ip_pktinfo_t *pinfo; - ipha_t *ipha; + ipha_t *ipha; struct ether_header *pether; + boolean_t ipmp_ill_held = B_FALSE; mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); if (mp == NULL) { @@ -4205,12 +4090,53 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, return (data_mp); } - ipha = (ipha_t *)data_mp->b_rptr; + ipha = (ipha_t *)data_mp->b_rptr; pinfo = (ip_pktinfo_t *)mp->b_rptr; bzero(pinfo, sizeof (ip_pktinfo_t)); pinfo->ip_pkt_flags = (uchar_t)flags; pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ + pether = (struct ether_header *)((char *)ipha + - sizeof (struct ether_header)); + + /* + * Make sure the interface is an ethernet type, since this option + * is currently supported only on this type of interface. Also make + * sure we are pointing correctly above db_base. + */ + if ((flags & IPF_RECVSLLA) && + ((uchar_t *)pether >= data_mp->b_datap->db_base) && + (ill->ill_type == IFT_ETHER) && + (ill->ill_net_type == IRE_IF_RESOLVER)) { + pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; + bcopy(pether->ether_shost.ether_addr_octet, + pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); + } else { + /* + * Clear the bit. Indicate to upper layer that IP is not + * sending this ancillary info. + */ + pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; + } + + /* + * If `ill' is in an IPMP group, use the IPMP ill to determine + * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that + * IPF_RECVADDR support on test addresses is not needed.) + * + * Note that `ill' may already be an IPMP ill if e.g. we're + * processing a packet looped back to an IPMP data address + * (since those IRE_LOCALs are tied to IPMP ills). 
+ */ + if (IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) { + ip1dbg(("ip_add_info: cannot hold IPMP ill.\n")); + freemsg(mp); + return (data_mp); + } + ipmp_ill_held = B_TRUE; + } + if (flags & (IPF_RECVIF | IPF_RECVADDR)) pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; if (flags & IPF_RECVADDR) { @@ -4239,7 +4165,7 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (ire == NULL) { /* * packet must have come on a different @@ -4276,29 +4202,8 @@ ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, } } - pether = (struct ether_header *)((char *)ipha - - sizeof (struct ether_header)); - /* - * Make sure the interface is an ethernet type, since this option - * is currently supported only on this type of interface. Also make - * sure we are pointing correctly above db_base. - */ - - if ((flags & IPF_RECVSLLA) && - ((uchar_t *)pether >= data_mp->b_datap->db_base) && - (ill->ill_type == IFT_ETHER) && - (ill->ill_net_type == IRE_IF_RESOLVER)) { - - pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; - bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, - (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); - } else { - /* - * Clear the bit. Indicate to upper layer that IP is not - * sending this ancillary info. - */ - pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; - } + if (ipmp_ill_held) + ill_refrele(ill); mp->b_datap->db_type = M_CTL; mp->b_wptr += sizeof (ip_pktinfo_t); @@ -4946,8 +4851,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, } } - if (dst_ire != NULL && - dst_ire->ire_type == IRE_LOCAL && + if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL && dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { /* * If the IRE belongs to a different zone, look for a matching @@ -4983,7 +4887,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * Pick a source address so that a proper inbound * load spreading would happen. */ - ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; + ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; ipif_t *src_ipif = NULL; ire_t *ipif_ire; @@ -4998,10 +4902,10 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * found above so that upper layers know that the * destination address is a broadcast address. * - * 2) If this is part of a group, select a better - * source address so that better inbound load - * balancing happens. Do the same if the ipif - * is DEPRECATED. + * 2) If the ipif is DEPRECATED, select a better + * source address. Similarly, if the ipif is on + * the IPMP meta-interface, pick a source address + * at random to improve inbound load spreading. 
* * 3) If the outgoing interface is part of a usesrc * group, then try selecting a source address from @@ -5011,9 +4915,9 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, dst_ire->ire_zoneid != ALL_ZONES) || (!(dst_ire->ire_flags & RTF_SETSRC)) && (!(dst_ire->ire_type & IRE_BROADCAST) && - ((dst_ill->ill_group != NULL) || + (IS_IPMP(ire_ill) || (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)))) { + (ire_ill->ill_usesrc_ifindex != 0)))) { /* * If the destination is reachable via a * given gateway, the selected source address @@ -5035,7 +4939,7 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, */ ipaddr_t saddr = dst_ire->ire_ipif->ipif_src_addr; - src_ipif = ipif_select_source(dst_ill, + src_ipif = ipif_select_source(ire_ill, saddr, zoneid); if (src_ipif != NULL) { if (IS_VNI(src_ipif->ipif_ill)) { @@ -5478,14 +5382,6 @@ ip_modclose(ill_t *ill) (void) ill_frag_timeout(ill, 0); /* - * If MOVE was in progress, clear the - * move_in_progress fields also. - */ - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } - - /* * Call ill_delete to bring down the ipifs, ilms and ill on * this ill. Then wait for the refcnts to drop to zero. * ill_is_freeable checks whether the ill is really quiescent. @@ -5510,7 +5406,7 @@ ip_modclose(ill_t *ill) */ netstack_hold(ipst->ips_netstack); - /* qprocsoff is called in ill_delete_tail */ + /* qprocsoff is done via ill_delete_tail */ ill_delete_tail(ill); ASSERT(ill->ill_ipst == NULL); @@ -5755,6 +5651,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) ipst->ips_capab_taskq_quit = B_TRUE; cv_signal(&ipst->ips_capab_taskq_cv); mutex_exit(&ipst->ips_capab_taskq_lock); + + mutex_enter(&ipst->ips_mrt_lock); + ipst->ips_mrt_flags |= IP_MRT_STOP; + cv_signal(&ipst->ips_mrt_cv); + mutex_exit(&ipst->ips_mrt_lock); } /* @@ -5766,6 +5667,9 @@ ip_stack_fini(netstackid_t stackid, void *arg) ip_stack_t *ipst = (ip_stack_t *)arg; int ret; +#ifdef NS_DEBUG + printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); +#endif /* * At this point, all of the notifications that the events and * protocols are going away have been run, meaning that we can @@ -5779,9 +5683,14 @@ ip_stack_fini(netstackid_t stackid, void *arg) cv_destroy(&ipst->ips_capab_taskq_cv); list_destroy(&ipst->ips_capab_taskq_list); -#ifdef NS_DEBUG - printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); -#endif + mutex_enter(&ipst->ips_mrt_lock); + while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) + cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); + mutex_destroy(&ipst->ips_mrt_lock); + cv_destroy(&ipst->ips_mrt_cv); + cv_destroy(&ipst->ips_mrt_done_cv); + + ipmp_destroy(ipst); rw_destroy(&ipst->ips_srcid_lock); ip_kstat_fini(stackid, ipst->ips_ip_mibkp); @@ -6038,10 +5947,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) "ip_cgtp_filter") == 0); ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = (caddr_t)&ipst->ips_ip_cgtp_filter; - ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name, - "ipmp_hook_emulation") == 0); - ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data = - (caddr_t)&ipst->ips_ipmp_hook_emulation; (void) ip_param_register(&ipst->ips_ip_g_nd, ipst->ips_param_arr, A_CNT(lcl_param_arr), @@ -6053,8 +5958,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipst->ips_ip6_kstat = ip6_kstat_init(stackid, &ipst->ips_ip6_statistics); - ipst->ips_ipmp_enable_failback = B_TRUE; - ipst->ips_ip_src_id = 1; rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, 
NULL); @@ -6062,6 +5965,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ip_net_init(ipst, ns); ipv4_hook_init(ipst); ipv6_hook_init(ipst); + ipmp_init(ipst); /* * Create the taskq dispatcher thread and initialize related stuff. @@ -6073,6 +5977,15 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), offsetof(mblk_t, b_next)); + /* + * Create the mcast_restart_timers_thread() worker thread. + */ + mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); + ipst->ips_mrt_thread = thread_create(NULL, 0, + mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); + major = mod_name_to_major(INET_NAME); (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); @@ -6109,6 +6022,24 @@ ip_dlpi_alloc(size_t len, t_uscalar_t prim) } /* + * Allocate and initialize a DLPI notification. (May be called as writer.) + */ +mblk_t * +ip_dlnotify_alloc(uint_t notification, uint_t data) +{ + dl_notify_ind_t *notifyp; + mblk_t *mp; + + if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL) + return (NULL); + + notifyp = (dl_notify_ind_t *)mp->b_rptr; + notifyp->dl_notification = notification; + notifyp->dl_data = data; + return (mp); +} + +/* * Debug formatting routine. Returns a character string representation of the * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. @@ -7753,71 +7684,30 @@ ip_net_mask(ipaddr_t addr) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. - */ -ill_t * -ip_newroute_get_dst_ill(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_ANY or not. illgrp_scheduler returns a held ill. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* - * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. + * Helper ill lookup function used by IPsec. 
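A brief usage sketch for the renamed helper defined just below, assuming an IPsec output path where io points at the ipsec_out_t (as in ip_newroute()); the call and refrele pattern mirror the former ip_grab_attach_ill() callers, but this particular call site is hypothetical:

	/*
	 * Sketch: resolve the ill nominated by the IPsec policy.  On
	 * failure ip_grab_ill() has already freed first_mp.
	 */
	ill = ip_grab_ill(first_mp, io->ipsec_out_ill_index, B_FALSE, ipst);
	if (ill == NULL)
		return;			/* first_mp was freed for us */

	/* ... use the refheld ill ... */
	ill_refrele(ill);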
*/ ill_t * -ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, - ip_stack_t *ipst) +ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ret_ill; ASSERT(ifindex != 0); + ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - if (ret_ill == NULL || - (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { + if (ret_ill == NULL) { if (isv6) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv6): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", + ifindex)); } else { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - ip1dbg(("ip_grab_attach_ill (IPv4): " - "bad ifindex %d.\n", ifindex)); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", + ifindex)); } - if (ret_ill != NULL) - ill_refrele(ret_ill); freemsg(first_mp); return (NULL); } - return (ret_ill); } @@ -7859,7 +7749,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, ire_t *sire = NULL; mblk_t *first_mp; ire_t *save_ire; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ ushort_t ire_marks = 0; boolean_t mctl_present; ipsec_out_t *io; @@ -7873,7 +7762,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t unspec_src; - boolean_t do_attach_ill = B_FALSE; boolean_t ip_nexthop = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -7902,22 +7790,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, return; } - if (mctl_present && io->ipsec_out_attach_if) { - /* ip_grab_attach_ill returns a held ill */ - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } if (mctl_present && io->ipsec_out_ip_nexthop) { ip_nexthop = B_TRUE; nexthop_addr = io->ipsec_out_nexthop_addr; @@ -7997,31 +7869,15 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, dst = nexthop_addr; } } - } else if (attach_ill == NULL) { + } else { ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, ipst); - } else { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. 
- */ - ipif_t *attach_ipif; - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ill_refrele(attach_ill); - goto icmp_err_ret; - } - ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, - &sire, zoneid, 0, MBLK_GETLABEL(mp), - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | - MATCH_IRE_SECATTR, ipst); - ipif_refrele(attach_ipif); } + ip3dbg(("ip_newroute: ire_ftable_lookup() " "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); @@ -8122,8 +7978,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, } ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } @@ -8134,8 +7988,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, */ if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - if (attach_ill != NULL) - ill_refrele(attach_ill); goto icmp_err_ret; } /* @@ -8157,119 +8009,51 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, sire->ire_last_used_time = lbolt; } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. - * - * 2) After selecting the ill, get a source address that - * might create good inbound load spreading. - * ipif_select_source does this for us. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address using + * ipif_select_source(). * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out - * specifically on a given ill e.g. binding to - * IPIF_NOFAILOVER address, then we don't try to use a - * different ill for load spreading. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. */ - if (attach_ill == NULL) { - /* - * Don't perform outbound load spreading in the - * case of an RTF_MULTIRT route, as we actually - * typically want to replicate outgoing packets - * through particular interfaces. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ - ill_refhold(dst_ill); - } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. 
- */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill( - ire->ire_ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: " - "no dst ill for dst" - " %s\n", AF_INET, &dst); - } - goto icmp_err_ret; - } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* for uniformity */ + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; ill_refhold(dst_ill); - /* - * We should have found a route matching ill as we - * called ire_ftable_lookup with MATCH_IRE_ILL. - * Rather than asserting, when there is a mismatch, - * we just drop the packet. - */ - if (dst_ill != attach_ill) { - ip0dbg(("ip_newroute: Packet dropped as " - "IPIF_NOFAILOVER ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - attach_ill->ill_name, - dst_ill->ill_name)); - ill_refrele(attach_ill); - goto icmp_err_ret; + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); + } else { + dst_ill = ill; + ill_refhold(dst_ill); } } - /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - do_attach_ill = B_TRUE; + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute: no dst " + "ill for dst %s\n", AF_INET, &dst); + } + goto icmp_err_ret; } - ASSERT(dst_ill != NULL); ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); /* * Pick the best source address from dst_ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. */ ASSERT(src_ipif == NULL); if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { @@ -8287,7 +8071,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, if (src_ipif == NULL && (!unspec_src || ipha->ipha_src != INADDR_ANY)) { ire_marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + IS_IPMP(ire->ire_ipif->ipif_ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || (connp != NULL && ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES) || @@ -8312,6 +8097,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, * as dst_ire source address. 
*/ ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; + src_ipif = ipif_select_source(dst_ill, saddr, zoneid); if (src_ipif == NULL) { @@ -8319,7 +8105,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, pr_addr_dbg("ip_newroute: " "no src for dst %s ", AF_INET, &dst); - printf("through interface %s\n", + printf("on interface %s\n", dst_ill->ill_name); } goto icmp_err_ret; @@ -8558,6 +8344,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, MULTIRT_DEBUG_TAG(first_mp); } } + ire_add_then_send(q, ire, xmit_mp); ire_refrele(save_ire); @@ -8766,7 +8553,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, "ip_newroute: no " "src for gw %s ", AF_INET, &gw); - printf("through " + printf("on " "interface %s\n", dst_ill->ill_name); } @@ -8867,16 +8654,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - if (do_attach_ill) { - /* - * This is bind to no failover case. - * arp packet also must go out on attach_ill. - */ - ASSERT(ipha->ipha_src != NULL); - *addrp = ipha->ipha_src; - } else { - *addrp = save_ire->ire_src_addr; - } + *addrp = save_ire->ire_src_addr; ire_refrele(save_ire); addrp = (ipaddr_t *)((char *)areq + @@ -9076,14 +8854,10 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipaddr_t *addrp; mblk_t *first_mp; ire_t *save_ire = NULL; - ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ ipif_t *src_ipif = NULL; ushort_t ire_marks = 0; ill_t *dst_ill = NULL; - boolean_t mctl_present; - ipsec_out_t *io; ipha_t *ipha; - int ihandle = 0; mblk_t *saved_mp; ire_t *fire = NULL; mblk_t *copy_mp = NULL; @@ -9117,10 +8891,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), ipif->ipif_ill->ill_name)); - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present) - io = (ipsec_out_t *)first_mp->b_rptr; - + first_mp = mp; + if (DB_TYPE(mp) == M_CTL) + mp = mp->b_cont; ipha = (ipha_t *)mp->b_rptr; /* @@ -9161,64 +8934,29 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (void *)ipif, ntohl(dst), (void *)fire)); } - if (mctl_present && io->ipsec_out_attach_if) { - attach_ill = ip_grab_attach_ill(NULL, first_mp, - io->ipsec_out_ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (attach_ill == NULL) { - ipif_refrele(ipif); - if (fire != NULL) - ire_refrele(fire); - return; - } + /* + * Note: While we pick a dst_ill we are really only + * interested in the ill for load spreading. The source + * ipif is determined by source address selection below. + */ + if (IS_IPMP(ipif->ipif_ill)) { + ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - ire_marks = IRE_MARK_HIDDEN; - } - /* - * ip_wput passes the right ipif for IPIF_NOFAILOVER - * case. - */ - dst_ill = ipif->ipif_ill; - /* attach_ill has been refheld by ip_grab_attach_ill */ - ASSERT(dst_ill == attach_ill); + if (CLASSD(ipha_dst)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); } else { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among - * peers in an interface group. - * Note: load spreading is disabled for RTF_MULTIRT - * routes. 
- */ - if ((flags & RTF_MULTIRT) && (fire != NULL) && - (fire->ire_flags & RTF_MULTIRT)) { - /* - * Don't perform outbound load spreading - * in the case of an RTF_MULTIRT issued route, - * we actually typically want to replicate - * outgoing packets through particular - * interfaces. - */ - dst_ill = ipif->ipif_ill; - ill_refhold(dst_ill); - } else { - dst_ill = ip_newroute_get_dst_ill( - ipif->ipif_ill); - } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif: " - "no dst ill for dst %s\n", - AF_INET, &dst); - } - goto err_ret; + dst_ill = ipif->ipif_ill; + ill_refhold(dst_ill); + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif: no dst ill " + "for dst %s\n", AF_INET, &dst); } + goto err_ret; } /* @@ -9242,7 +8980,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, unspec_src = (connp != NULL && connp->conn_unspec_src); - if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || + if (!IS_UNDER_IPMP(ipif->ipif_ill) && + (IS_IPMP(ipif->ipif_ill) || + (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || (connp != NULL && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES)) && @@ -9256,7 +8996,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, "no src for dst %s", AF_INET, &dst); } - ip1dbg((" through interface %s\n", + ip1dbg((" on interface %s\n", dst_ill->ill_name)); goto err_ret; } @@ -9291,12 +9031,7 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, goto err_ret; if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) goto err_ret; - /* - * ihandle is needed when the ire is added to - * cache table. - */ save_ire = ire; - ihandle = save_ire->ire_ihandle; ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " "flags %04x\n", @@ -9328,10 +9063,6 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, ipha->ipha_src = fire->ire_src_addr; } } else { - ASSERT((connp == NULL) || - (connp->conn_outgoing_ill != NULL) || - (connp->conn_dontroute) || - infop->ip_opt_ill_index != 0); /* * The only ways we can come here are: * 1) IP_BOUND_IF socket option is set @@ -9340,6 +9071,9 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, * In all cases, the new ire will not be added * into cache table. */ + ASSERT(connp == NULL || connp->conn_dontroute || + connp->conn_outgoing_ill != NULL || + infop->ip_opt_ill_index != 0); ire_marks |= IRE_MARK_NOADD; } @@ -9374,7 +9108,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9533,7 +9268,8 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, (save_ire != NULL ? save_ire->ire_mask : 0), (fire != NULL) ? /* Parent handle */ fire->ire_phandle : 0, - ihandle, /* Interface handle */ + (save_ire != NULL) ? /* Interface handle */ + save_ire->ire_ihandle : 0, (fire != NULL) ? /* flags if any */ (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : 0, @@ -9593,12 +9329,20 @@ ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, /* * Fill in the source and dest addrs for the resolver. * NOTE: this depends on memory layouts imposed by - * ill_init(). + * ill_init(). 
There are corner cases above where we + * might've created the IRE with an INADDR_ANY source + * address (e.g., if the zeroth ipif on an underlying + * ill in an IPMP group is 0.0.0.0, but another ipif + * on the ill has a usable test address). If so, tell + * ARP to use ipha_src as its sender address. */ areq = (areq_t *)mp->b_rptr; addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - *addrp = ire->ire_src_addr; + if (ire->ire_src_addr != INADDR_ANY) + *addrp = ire->ire_src_addr; + else + *addrp = ipha->ipha_src; addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); *addrp = dst; @@ -10136,7 +9880,7 @@ ip_ipsec_load_complete(ipsec_stack_t *ipss) /* * Can't be used. Need to call svr4* -> optset directly. the leaf routine * determines the grp on which it has to become exclusive, queues the mp - * and sq draining restarts the optmgmt + * and IPSQ draining restarts the optmgmt */ static boolean_t ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) @@ -10482,28 +10226,6 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, } switch (option) { - case IP_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - * For backward compatibility, this option - * implicitly sets IP_MULTICAST_IF, as used - * be done in bind(), so that ip_wput gets - * this ipif to send mcast packets. - */ - if (ipif != NULL) { - ASSERT(addr != INADDR_ANY); - connp->conn_nofailover_ill = ipif->ipif_ill; - connp->conn_multicast_ipif = ipif; - } else { - ASSERT(addr == INADDR_ANY); - connp->conn_nofailover_ill = NULL; - connp->conn_multicast_ipif = NULL; - } - break; - case IP_MULTICAST_IF: connp->conn_multicast_ipif = ipif; break; @@ -10551,7 +10273,7 @@ ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, ill_refrele(ill); return (0); } - if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, + if (!ipif_lookup_zoneid(ill, connp->conn_zoneid, 0, NULL)) { ill_refrele(ill); ill = NULL; @@ -10596,8 +10318,6 @@ setit: case IP_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; break; case IP_MULTICAST_IF: @@ -10650,40 +10370,6 @@ setit: case IPV6_BOUND_IF: connp->conn_incoming_ill = ill; connp->conn_outgoing_ill = ill; - connp->conn_orig_bound_ifindex = (ill == NULL) ? - 0 : ifindex; - break; - - case IPV6_BOUND_PIF: - /* - * Limit all transmit to this ill. - * Unlike IPV6_BOUND_IF, using this option - * prevents load spreading and failover from - * happening when the interface is part of the - * group. That's why we don't need to remember - * the ifindex in orig_bound_ifindex as in - * IPV6_BOUND_IF. - */ - connp->conn_outgoing_pill = ill; - break; - - case IPV6_DONTFAILOVER_IF: - /* - * This option is used by in.mpathd to ensure - * that IPMP probe packets only go out on the - * test interfaces. in.mpathd sets this option - * on the non-failover interfaces. - */ - connp->conn_nofailover_ill = ill; - /* - * For backward compatibility, this option - * implicitly sets ip_multicast_ill as used in - * IPV6_MULTICAST_IF so that ip_wput gets - * this ill to send mcast packets. - */ - connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = (ill == NULL) ? 
- 0 : ifindex; break; case IPV6_MULTICAST_IF: @@ -10700,12 +10386,9 @@ setit: if (!checkonly) { if (ifindex == 0) { connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; connp->conn_multicast_ipif = NULL; } else if (ill != NULL) { connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ifindex; } } break; @@ -10867,8 +10550,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (secpolicy_ip_config(cr, B_FALSE) != 0) return (EPERM); /* FALLTHRU */ - case IP_MULTICAST_IF: - case IP_DONTFAILOVER_IF: { + case IP_MULTICAST_IF: { ipaddr_t addr = *i1; error = ip_opt_set_ipif(connp, addr, checkonly, name, @@ -11189,8 +10871,6 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, case IPPROTO_IPV6: switch (name) { case IPV6_BOUND_IF: - case IPV6_BOUND_PIF: - case IPV6_DONTFAILOVER_IF: error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, level, name, first_mp); if (error != 0) @@ -12288,11 +11968,10 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, * frees mp on failure. */ static boolean_t -ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, +ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, uint32_t *cksum_val, uint16_t *cksum_flags) { uint32_t frag_offset_flags; - ill_t *ill = (ill_t *)q->q_ptr; mblk_t *mp = *mpp; mblk_t *t_mp; ipaddr_t dst; @@ -12337,12 +12016,12 @@ ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, /* * We utilize hardware computed checksum info only for UDP since - * IP fragmentation is a normal occurence for the protocol. In + * IP fragmentation is a normal occurrence for the protocol. In * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. */ - ASSERT(ill != NULL); - if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(recv_ill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -12808,7 +12487,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, goto ipoptions; /* Check the IP header checksum. */ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -12871,7 +12550,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -12958,8 +12637,11 @@ fragmented: * reassembled packet has a valid hardware computed * checksum information associated with it. */ - if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, + &reass_hck_flags)) { goto slow_done; + } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -13073,7 +12755,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else if (!mctl_present) { /* Check the IP header checksum. 
*/ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { /* Clear the IP header h/w cksum flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else if (!mctl_present) { @@ -13159,7 +12841,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, * Revert to software checksum calculation if the interface * isn't capable of checksum offload or if IPsec is present. */ - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) hck_flags = DB_CKSUMFLAGS(mp); if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) @@ -13386,7 +13068,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { if (mctl_present) freeb(first_mp); goto slow_done; @@ -13530,7 +13212,7 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else { /* Check the IP header checksum. */ - if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) && + if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) && !mctl_present) { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + @@ -13644,7 +13326,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) goto slow_done; /* * Make sure that first_mp points back to mp as @@ -13877,6 +13559,11 @@ ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) return (B_TRUE); } +/* + * Handle the situation where a packet came in on `ill' but matched an IRE + * whose ire_rfq doesn't match `ill'. We return the IRE that should be used + * for interface statistics. + */ ire_t * ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) { @@ -13887,16 +13574,22 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) boolean_t strict_check = B_FALSE; /* - * This packet came in on an interface other than the one associated - * with the first ire we found for the destination address. We do - * another ire lookup here, using the ingress ill, to see if the - * interface is in an interface group. + * IPMP common case: if IRE and ILL are in the same group, there's no + * issue (e.g. packet received on an underlying interface matched an + * IRE_LOCAL on its associated group interface). + */ + if (ire->ire_rfq != NULL && + IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { + return (ire); + } + + /* + * Do another ire lookup here, using the ingress ill, to see if the + * interface is in a usesrc group. * As long as the ills belong to the same group, we don't consider * them to be arriving on the wrong interface. Thus, if the switch * is doing inbound load spreading, we won't drop packets when the - * ip*_strict_dst_multihoming switch is on. Note, the same holds true - * for 'usesrc groups' where the destination address may belong to - * another interface to allow multipathing to happen. + * ip*_strict_dst_multihoming switch is on. * We also need to check for IPIF_UNNUMBERED point2point interfaces * where the local address may not be unique. 
In this case we were * at the mercy of the initial ire cache lookup and the IRE_LOCAL it @@ -13910,18 +13603,18 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) strict_check = B_TRUE; new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } else { ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); if (ipst->ips_ipv6_strict_dst_multihoming) strict_check = B_TRUE; new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); } /* * If the same ire that was returned in ip_input() is found then this - * is an indication that interface groups are in use. The packet + * is an indication that usesrc groups are in use. The packet * arrived on a different ill in the group than the one associated with * the destination address. If a different ire was found then the same * IP address must be hosted on multiple ills. This is possible with @@ -14075,11 +13768,10 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) /* * Forwarding fastpath exception case: - * If either of the follwoing case is true, we take - * the slowpath + * If any of the following are true, we take the slowpath: * o forwarding is not enabled - * o incoming and outgoing interface are the same, or the same - * IPMP group + * o incoming and outgoing interface are the same, or in the same + * IPMP group. * o corresponding ire is in incomplete state * o packet needs fragmentation * o ARP cache is not resolved @@ -14090,8 +13782,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) pkt_len = ntohs(ipha->ipha_length); stq_ill = (ill_t *)ire->ire_stq->q_ptr; if (!(stq_ill->ill_flags & ILLF_ROUTER) || - (ill == stq_ill) || - (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || + (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || (ire->ire_nce == NULL) || (pkt_len > ire->ire_max_frag) || ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || @@ -14185,11 +13876,10 @@ static void ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) { - ill_group_t *ill_group; - ill_group_t *ire_group; queue_t *dev_q; ire_t *src_ire; ip_stack_t *ipst = ill->ill_ipst; + boolean_t same_illgrp = B_FALSE; ASSERT(ire->ire_stq != NULL); @@ -14200,11 +13890,8 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * If the caller of this function is ip_fast_forward() skip the * next three checks as it does not apply. */ - if (from_ip_fast_forward) { - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; + if (from_ip_fast_forward) goto skip; - } if (ll_multicast != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); @@ -14230,13 +13917,10 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, goto drop_pkt; } - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; /* * Check if we want to forward this one at this time. * We allow source routed packets on a host provided that - * they go out the same interface or same interface group - * as they came in on. + * they go out the same ill or illgrp as they came in on. 
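Most of the rewritten checks in this area, ip_check_multihome(), the ip_fast_forward() slowpath test, and the forwarding and redirect logic that follows, reduce to one predicate: are two interfaces the same ill, or members of the same IPMP illgrp? Below is a stand-alone sketch of that predicate under simplified assumptions; the cut-down ill_t and illgrp_t definitions and the same_ill_or_illgrp() name are illustrative stand-ins, not the kernel's types or its IS_IN_SAME_ILLGRP() macro.

#include <stdio.h>

typedef struct illgrp {
        int     grp_dummy;              /* placeholder; the real type is opaque here */
} illgrp_t;

typedef struct ill {
        const char      *ill_name;
        illgrp_t        *ill_grp;       /* NULL when not in an IPMP group */
} ill_t;

static int
same_ill_or_illgrp(const ill_t *a, const ill_t *b)
{
        return (a == b || (a->ill_grp != NULL && a->ill_grp == b->ill_grp));
}

int
main(void)
{
        illgrp_t ipmp0 = { 0 };
        ill_t under1 = { "e1000g0", &ipmp0 };
        ill_t under2 = { "e1000g1", &ipmp0 };
        ill_t lone = { "bge0", NULL };

        printf("%d\n", same_ill_or_illgrp(&under1, &under2));  /* 1: same group */
        printf("%d\n", same_ill_or_illgrp(&under1, &lone));    /* 0 */
        return (0);
}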
* * XXX To be quicker, we may wish to not chase pointers to * get the ILLF_ROUTER flag and instead store the * forwarding policy in the ire. An unfortunate * side-effect of this would be requiring an ire flush * whenever the ILLF_ROUTER flag changes. */ skip: + same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr); + if (((ill->ill_flags & - ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & - ILLF_ROUTER) == 0) && - !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)))) { + ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && + !(ip_source_routed(ipha, ipst) && + (ire->ire_rfq == q || same_illgrp))) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); if (ip_source_routed(ipha, ipst)) { q = WR(q); @@ -14290,12 +13975,10 @@ skip: ire_t *nhop_ire = NULL; /* - * Check whether ire_rfq and q are from the same ill - * or if they are not same, they at least belong - * to the same group. If so, send redirects. + * Check whether ire_rfq and q are from the same ill or illgrp. + * If so, send redirects. */ - if ((ire->ire_rfq == q || - (ill_group != NULL && ill_group == ire_group)) && + if ((ire->ire_rfq == q || same_illgrp) && !ip_source_routed(ipha, ipst)) { nhop = (ire->ire_gateway_addr != 0 ? @@ -14396,26 +14079,15 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, } /* * For multicast we have set dst to be INADDR_BROADCAST - * for delivering to all STREAMS. IRE_MARK_NORECV is really - * only for broadcast packets. + * for delivering to all STREAMS. */ if (!CLASSD(ipha->ipha_dst)) { ire_t *new_ire; ipif_t *ipif; - /* - * For ill groups, as the switch duplicates broadcasts - * across all the ports, we need to filter out and - * send up only one copy. There is one copy for every - * broadcast address on each ill. Thus, we look for a - * specific IRE on this ill and look at IRE_MARK_NORECV - * later to see whether this ill is eligible to receive - * them or not. ill_nominate_bcast_rcv() nominates only - * one set of IREs for receiving. - */ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) { - ire_refrele(ire); +discard: ire_refrele(ire); freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); return (NULL); } @@ -14425,13 +14097,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, ipif_refrele(ipif); if (new_ire != NULL) { - if (new_ire->ire_marks & IRE_MARK_NORECV) { - ire_refrele(ire); + /* + * If the matching IRE_BROADCAST is part of an IPMP + * group, then drop the packet unless our ill has been + * nominated to receive for the group. + */ + if (IS_IPMP(new_ire->ire_ipif->ipif_ill) && + new_ire->ire_rfq != q) { ire_refrele(new_ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); + goto discard; } + /* * In the special case of multirouted broadcast * packets, we unconditionally need to "gateway" @@ -14571,6 +14247,13 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, ntohs(ipha->ipha_length)); /* + * So that we don't end up with dups, only one ill in an IPMP group is + * nominated to receive multicast traffic. + */ + if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast) + goto drop_pkt; + + /* * Forward packets only if we have joined the allmulti * group on this interface. 
*/ @@ -14619,18 +14302,15 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, } } - ILM_WALKER_HOLD(ill); if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { /* * This might just be caused by the fact that * multiple IP Multicast addresses map to the same * link layer multicast - no need to increment counter! */ - ILM_WALKER_RELE(ill); freemsg(mp); return (B_TRUE); } - ILM_WALKER_RELE(ill); done: ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); /* @@ -15498,8 +15178,8 @@ local: * broadcast ire. */ if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { - if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, - ill)) == NULL) { + ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); + if (ire == NULL) { /* Drop packet */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); @@ -15935,19 +15615,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip1dbg(("ip_rput_dlpi_writer ..")); ill = (ill_t *)q->q_ptr; - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); - + ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); ASSERT(IAM_WRITER_ILL(ill)); ipst = ill->ill_ipst; - /* - * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. - * both are null or non-null. However we can assert that only - * after grabbing the ipsq_lock. So we don't make any assertion - * here and in other places in the code. - */ - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; /* * The current ioctl could have been aborted by the user and a new * ioctl to bring up another ill could have started. We could still @@ -16045,9 +15718,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ } @@ -16196,45 +15866,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * ill_dl_up(), which stopped ipif_up()'s processing. */ if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP. Except in the case of - * ILLF_XRESOLV, in which case we send an - * AR_INTERFACE_UP to the external resolver. - * If all goes well, the ioctl will complete - * in ip_rput(). If there's an error, we - * complete it here. 
- */ - if ((err = ipif_ndp_up(ipif)) == 0) { - if (ill->ill_flags & ILLF_XRESOLV) { - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add( - connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, - Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - ASSERT(err != 0); - mp1 = ipsq_pending_mp_get(ipsq, - &connp); - ASSERT(mp1 != NULL); - } else { - /* conn has started closing */ - err = EINTR; - } - } else { /* Non XRESOLV interface */ - (void) ipif_resolver_up(ipif, + if (ill->ill_flags & ILLF_XRESOLV) { + mutex_enter(&connp->conn_lock); + mutex_enter(&ill->ill_lock); + success = ipsq_pending_mp_add(connp, ipif, q, + mp1, 0); + mutex_exit(&ill->ill_lock); + mutex_exit(&connp->conn_lock); + if (success) { + err = ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done_v6(ipif); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + ASSERT(err != 0); + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(mp1 != NULL); + } else { + /* conn has started closing */ + err = EINTR; } + } else { /* Non XRESOLV interface */ + (void) ipif_resolver_up(ipif, Res_act_initial); + if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) + err = ipif_up_done_v6(ipif); } } else if (ill->ill_net_type == IRE_IF_RESOLVER) { /* @@ -16275,14 +15931,31 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has + * succeeded to this point, bring it up on the IPMP ill. + * Otherwise, leave it down -- the admin can try to bring it + * up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + if (err != 0) { + ill->ill_move_ipif = NULL; + } else { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) { + freemsg(mp); + return; + } + } } - break; + case DL_NOTIFY_IND: { dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; ire_t *ire; + uint_t orig_mtu; boolean_t need_ire_walk_v4 = B_FALSE; boolean_t need_ire_walk_v6 = B_FALSE; @@ -16322,17 +15995,27 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * which it is being derived. */ mutex_enter(&ill->ill_lock); + + orig_mtu = ill->ill_max_mtu; ill->ill_max_frag = (uint_t)notify->dl_data; + ill->ill_max_mtu = (uint_t)notify->dl_data; + + /* + * If ill_user_mtu was set (via SIOCSLIFLNKINFO), + * clamp ill_max_mtu at it. + */ + if (ill->ill_user_mtu != 0 && + ill->ill_user_mtu < ill->ill_max_mtu) + ill->ill_max_mtu = ill->ill_user_mtu; /* - * If an SIOCSLIFLNKINFO has changed the ill_max_mtu - * leave it alone + * If the MTU is unchanged, we're done. */ - if (ill->ill_mtu_userspecified) { + if (orig_mtu == ill->ill_max_mtu) { mutex_exit(&ill->ill_lock); break; } - ill->ill_max_mtu = ill->ill_max_frag; + if (ill->ill_isv6) { if (ill->ill_max_mtu < IPV6_MIN_MTU) ill->ill_max_mtu = IPV6_MIN_MTU; @@ -16371,7 +16054,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (need_ire_walk_v6) ire_walk_v6(ill_mtu_change, (char *)ill, ALL_ZONES, ipst); + + /* + * Refresh IPMP meta-interface MTU if necessary. 
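The DL_NOTE_SDU_SIZE handling above applies three rules before any IRE walking happens: take the driver-reported SDU as the new MTU, clamp it at any administratively configured value (ill_user_mtu, set via SIOCSLIFLNKINFO), and for IPv6 never drop below IPV6_MIN_MTU; if the result equals the old MTU there is nothing more to do. A stand-alone sketch of just that arithmetic, using a hypothetical effective_mtu() helper:

#include <stdio.h>

#define IPV6_MIN_MTU    1280

static unsigned int
effective_mtu(unsigned int driver_sdu, unsigned int user_mtu, int isv6)
{
        unsigned int mtu = driver_sdu;          /* what the driver reported */

        if (user_mtu != 0 && user_mtu < mtu)
                mtu = user_mtu;                 /* administrative clamp */
        if (isv6 && mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;             /* IPv6 floor */
        return (mtu);
}

int
main(void)
{
        printf("%u\n", effective_mtu(9000, 1500, 0));   /* 1500 */
        printf("%u\n", effective_mtu(1200, 0, 1));      /* 1280 */
        printf("%u\n", effective_mtu(1500, 0, 0));      /* 1500 */
        return (0);
}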
+ */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); break; + case DL_NOTE_LINK_UP: case DL_NOTE_LINK_DOWN: { /* @@ -16385,9 +16075,17 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) went_up = notify->dl_notification == DL_NOTE_LINK_UP; mutex_enter(&phyint->phyint_lock); + new_phyint_flags = went_up ? phyint->phyint_flags | PHYI_RUNNING : phyint->phyint_flags & ~PHYI_RUNNING; + + if (IS_IPMP(ill)) { + new_phyint_flags = went_up ? + new_phyint_flags & ~PHYI_FAILED : + new_phyint_flags | PHYI_FAILED; + } + if (new_phyint_flags != phyint->phyint_flags) { phyint->phyint_flags = new_phyint_flags; changed = B_TRUE; @@ -16474,7 +16172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * is invoked from an ill queue, conn_oper_pending_ill is not * available, but we know the ioctl is pending on ill_wq.) */ - uint_t paddrlen, paddroff; + uint_t paddrlen, paddroff; paddrreq = ill->ill_phys_addr_pend; paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; @@ -16592,29 +16290,59 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); - if (mp1 != NULL) { + if (mp1 == NULL) + return; + + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise, + * the operation will be stuck forever inside the IPSQ. + */ + ASSERT(err != EINPROGRESS); + + switch (ipsq->ipsq_xop->ipx_current_ioctl) { + case 0: + ipsq_current_finish(ipsq); + break; + + case SIOCSLIFNAME: + case IF_UNITSEL: { + ill_t *ill_other = ILL_OTHER(ill); + /* - * The operation must complete without EINPROGRESS - * since ipsq_pending_mp_get() has removed the mblk - * from ipsq_pending_mp. Otherwise, the operation - * will be stuck forever in the ipsq. + * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the + * ill has a peer which is in an IPMP group, then place ill + * into the same group. One catch: although ifconfig plumbs + * the appropriate IPMP meta-interface prior to plumbing this + * ill, it is possible for multiple ifconfig applications to + * race (or for another application to adjust plumbing), in + * which case the IPMP meta-interface we need will be missing. + * If so, kick the phyint out of the group. */ - ASSERT(err != EINPROGRESS); + if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) { + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + ipmp_illgrp_t *illg; - switch (ipsq->ipsq_current_ioctl) { - case 0: - ipsq_current_finish(ipsq); - break; + illg = ill->ill_isv6 ? 
grp->gr_v6 : grp->gr_v4; + if (illg == NULL) + ipmp_phyint_leave_grp(ill->ill_phyint); + else + ipmp_ill_join_illgrp(ill, illg); + } - case SIOCLIFADDIF: - case SIOCSLIFNAME: + if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL) + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + else ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); - break; + break; + } + case SIOCLIFADDIF: + ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); + break; - default: - ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); - break; - } + default: + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + break; } } @@ -16626,20 +16354,16 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) void ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { - ill_t *ill; + ill_t *ill = q->q_ptr; struct iocblk *iocp; mblk_t *mp1; conn_t *connp = NULL; ip1dbg(("ip_rput_other ")); - ill = (ill_t *)q->q_ptr; - /* - * This routine is not a writer in the case of SIOCGTUNPARAM - * in which case ipsq is NULL. - */ if (ipsq != NULL) { ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); + ASSERT(ipsq->ipsq_xop == + ill->ill_phyint->phyint_ipsq->ipsq_xop); } switch (mp->b_datap->db_type) { @@ -16752,7 +16476,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_IOC_HDR_INFO: /* - * If this was the first attempt turn of the + * If this was the first attempt, turn off the * fastpath probing. */ mutex_enter(&ill->ill_lock); @@ -16768,7 +16492,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); break; - case SIOCSTUNPARAM: + case SIOCSTUNPARAM: case OSIOCSTUNPARAM: ASSERT(ipsq != NULL); /* @@ -17017,14 +16741,13 @@ ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) /* * Find an IRE which matches the destination and the outgoing * queue in the cache table. All we need is an IRE_CACHE which - * is pointing at ipif->ipif_ill. If it is part of some ill group, - * then it is enough to have some IRE_CACHE in the group. + * is pointing at ipif->ipif_ill. 
*/ if (ipif->ipif_flags & IPIF_POINTOPOINT) dst = ipif->ipif_pp_dst_addr; ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); + MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); if (ire == NULL) { /* * Mark this packet to make it be delivered to @@ -17321,7 +17044,8 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) */ mp->b_datap->db_type = M_DATA; icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, - ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid); + ip6h, icmp6, ill, recv_ill, B_TRUE, + ii->ipsec_in_zoneid); } if (ill_need_rele) ill_refrele(ill); @@ -17357,37 +17081,36 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) } switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill); - if (ire_need_rele) - ire_refrele(ire); - break; - case IPPROTO_TCP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - mp = ip_tcp_input(mp, ipha, ill, B_TRUE, - ire, ipsec_mp, 0, ill->ill_rq, NULL); - IRE_REFRELE(ire); - if (mp != NULL) { - - SQUEUE_ENTER(GET_SQUEUE(mp), mp, - mp, 1, SQ_PROCESS, - SQTAG_IP_PROTO_AGAIN); - } - break; - case IPPROTO_SCTP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - ip_sctp_input(mp, ipha, ill, B_TRUE, ire, - ipsec_mp, 0, ill->ill_rq, dst); - break; - default: - ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill, 0); - if (ire_need_rele) - ire_refrele(ire); - break; + case IPPROTO_UDP: + ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill); + if (ire_need_rele) + ire_refrele(ire); + break; + case IPPROTO_TCP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + mp = ip_tcp_input(mp, ipha, ill, B_TRUE, + ire, ipsec_mp, 0, ill->ill_rq, NULL); + IRE_REFRELE(ire); + if (mp != NULL) { + SQUEUE_ENTER(GET_SQUEUE(mp), mp, + mp, 1, SQ_PROCESS, + SQTAG_IP_PROTO_AGAIN); + } + break; + case IPPROTO_SCTP: + if (!ire_need_rele) + IRE_REFHOLD(ire); + ip_sctp_input(mp, ipha, ill, B_TRUE, ire, + ipsec_mp, 0, ill->ill_rq, dst); + break; + default: + ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, + recv_ill, 0); + if (ire_need_rele) + ire_refrele(ire); + break; } } else { uint32_t rput_flags = 0; @@ -17621,9 +17344,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, */ ASSERT(!mctl_present); ASSERT(first_mp == mp); - if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { + if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) return; - } + /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -17647,17 +17370,10 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { ASSERT(ire->ire_type == IRE_BROADCAST); - /* - * Inactive/Failed interfaces are not supposed to - * respond to the multicast packets. - */ - if (ill_is_probeonly(ill)) { - freemsg(first_mp); - return; - } /* * In the multicast case, applications may have joined @@ -17680,11 +17396,9 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * have been exhausted. 
*/ last_zoneid = -1; - ILM_WALKER_HOLD(recv_ill); - for (ilm = recv_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, recv_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == ire->ire_zoneid || ilm->ilm_zoneid == ALL_ZONES || @@ -17693,12 +17407,12 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, 0, sum, mctl_present, B_TRUE, recv_ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(recv_ill); + ilm_walker_finish(&ilw); } else if (ire->ire_type == IRE_BROADCAST) { /* * In the broadcast case, there may be many zones @@ -18580,14 +18294,13 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) return (1); } - if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { + mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } - mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); - if (mpctl == NULL) { + mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst); + if (mpctl == NULL) return (1); - } if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { return (1); @@ -19048,6 +18761,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19064,7 +18778,10 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid && @@ -19074,7 +18791,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ipm.ipGroupMemberIfIndex.o_length = mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); if (ilm->ilm_ipif != ipif) @@ -19090,7 +18807,7 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19112,6 +18829,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19127,9 +18845,12 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); if (ilm->ilm_zoneid 
!= zoneid) @@ -19145,7 +18866,7 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) (uint_t)sizeof (ipm6))); } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); @@ -19171,6 +18892,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19187,7 +18909,10 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid != zoneid) @@ -19196,7 +18921,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) OCTET_LENGTH); ips.ipGroupSourceIfIndex.o_length = mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif != NULL); ASSERT(ilm->ilm_ill == NULL); sl = ilm->ilm_filter; @@ -19220,7 +18945,7 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19244,6 +18969,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; + ilm_walker_t ilw; /* * make a copy of the original message @@ -19259,9 +18985,12 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - ILM_WALKER_HOLD(ill); + if (IS_UNDER_IPMP(ill)) + continue; + + ilm = ilm_walker_start(&ilw, ill); ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { ASSERT(ilm->ilm_ipif == NULL); ASSERT(ilm->ilm_ill != NULL); sl = ilm->ilm_filter; @@ -19279,7 +19008,7 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } rw_exit(&ipst->ips_ill_g_lock); @@ -19345,7 +19074,8 @@ ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * in one IRE walk. */ static mblk_t * -ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19377,6 +19107,14 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. 
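The new ird_flags test is easiest to see in isolation: routes marked IRE_MARK_TESTHIDDEN (IPMP test-address routes) are skipped unless the caller opted in by requesting the EXPER_IP_AND_TESTHIDDEN level. The sketch below models only that filter; the route_t type and the flag values are invented for illustration and do not match the kernel's definitions.

#include <stdio.h>

#define IRE_MARK_TESTHIDDEN     0x0040  /* hypothetical bit values */
#define IRD_REPORT_TESTHIDDEN   0x0001

typedef struct route {
        const char      *re_dst;
        unsigned int    re_marks;
} route_t;

static void
report_route(const route_t *re, unsigned int ird_flags)
{
        if (!(ird_flags & IRD_REPORT_TESTHIDDEN) &&
            (re->re_marks & IRE_MARK_TESTHIDDEN))
                return;         /* hidden test route; caller did not opt in */
        printf("route %s\n", re->re_dst);
}

int
main(void)
{
        route_t normal = { "10.0.0.0/24", 0 };
        route_t test = { "10.0.1.0/24", IRE_MARK_TESTHIDDEN };

        report_route(&normal, 0);                       /* reported */
        report_route(&test, 0);                         /* skipped */
        report_route(&test, IRD_REPORT_TESTHIDDEN);     /* reported */
        return (0);
}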
+ */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); @@ -19419,7 +19157,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) * ipv6NetToMediaEntryTable in an NDP walk. */ static mblk_t * -ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) +ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, + ip_stack_t *ipst) { struct opthdr *optp; mblk_t *mp2ctl; /* Returned */ @@ -19451,6 +19190,14 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ird.ird_route.lp_head = mpctl->b_cont; ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; + /* + * If the level has been set the special EXPER_IP_AND_TESTHIDDEN + * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * intended a temporary solution until a proper MIB API is provided + * that provides complete filtering/caller-opt-in. + */ + if (level == EXPER_IP_AND_TESTHIDDEN) + ird.ird_flags |= IRD_REPORT_TESTHIDDEN; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); @@ -19671,6 +19418,11 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV4_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -19812,6 +19564,11 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) ASSERT(ire->ire_ipversion == IPV6_VERSION); + if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && + ire->ire_marks & IRE_MARK_TESTHIDDEN) { + return; + } + if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) return; @@ -20518,8 +20275,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, boolean_t mctl_present; ipsec_out_t *io; int match_flags; - ill_t *attach_ill = NULL; - /* Bind to IPIF_NOFAILOVER ill etc. */ ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */ ipif_t *dst_ipif; boolean_t multirt_need_resolve = B_FALSE; @@ -20639,16 +20394,11 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, } /* - * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index - * passed in IP_PKTINFO. + * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. */ - if (infop->ip_opt_ill_index != 0 && - connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL) { - - xmit_ill = ill_lookup_on_ifindex( - infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, - ipst); + if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { + xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, + B_FALSE, NULL, NULL, NULL, NULL, ipst); if (xmit_ill == NULL || IS_VNI(xmit_ill)) goto drop_pkt; @@ -20659,7 +20409,7 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, * accessible from all zones i.e has a valid ipif in * all zones. 
*/ - if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { + if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { goto drop_pkt; } } @@ -20696,18 +20446,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, goto version_hdrlen_check; dst = ipha->ipha_dst; - if (connp->conn_nofailover_ill != NULL) { - attach_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - } - /* If IP_BOUND_IF has been set, use that ill. */ if (connp->conn_outgoing_ill != NULL) { xmit_ill = conn_get_held_ill(connp, @@ -20761,9 +20499,6 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, ire = NULL; } - if (attach_ill != NULL) - goto send_from_ill; - /* * We cache IRE_CACHEs to avoid lookups. We don't do * this for the tcp global queue and listen end point @@ -21074,45 +20809,21 @@ notdata: } ASSERT(first_mp != NULL); - /* - * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if - * to make sure that this packet goes out on the same interface it - * came in. We handle that here. - */ - if (mctl_present) { - uint_t ifindex; + if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; - if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { + if (io->ipsec_out_ip_nexthop) { /* * We may have lost the conn context if we are * coming here from ip_newroute(). Copy the * nexthop information. */ - if (io->ipsec_out_ip_nexthop) { - ip_nexthop = B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; + ip_nexthop = B_TRUE; + nexthop_addr = io->ipsec_out_nexthop_addr; - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - goto send_from_ill; - } else { - ASSERT(io->ipsec_out_ill_index != 0); - ifindex = io->ipsec_out_ill_index; - attach_ill = ill_lookup_on_ifindex(ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (attach_ill == NULL) { - ASSERT(xmit_ill == NULL); - ip1dbg(("ip_output: bad ifindex for " - "(BIND TO IPIF_NOFAILOVER) %d\n", - ifindex)); - freemsg(first_mp); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - ASSERT(!need_decref); - return; - } - } + ipha = (ipha_t *)mp->b_rptr; + dst = ipha->ipha_dst; + goto send_from_ill; } } @@ -21161,7 +20872,7 @@ hdrtoosmall: ipha = (ipha_t *)mp->b_rptr; if (first_mp == NULL) { - ASSERT(attach_ill == NULL && xmit_ill == NULL); + ASSERT(xmit_ill == NULL); /* * If we got here because of "goto hdrtoosmall" * We need to attach a IPSEC_OUT. @@ -21213,8 +20924,6 @@ version_hdrlen_check: */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); ASSERT(xmit_ill == NULL); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (need_decref) mp->b_flag |= MSGHASREF; (void) ip_output_v6(arg, first_mp, arg2, caller); @@ -21255,8 +20964,6 @@ version_hdrlen_check: zoneid, ipst)) { ASSERT(xmit_ill == NULL); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (attach_ill != NULL) - ill_refrele(attach_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "badopts"); if (need_decref) @@ -21295,22 +21002,6 @@ multicast: */ ill_t *ill = (ill_t *)q->q_ptr; - /* - * Don't honor attach_if for this case. If ill - * is part of the group, ipif could belong to - * any ill and we cannot maintain attach_ill - * and ipif_ill same anymore and the assert - * below would fail. 
- */ - if (mctl_present && io->ipsec_out_attach_if) { - io->ipsec_out_ill_index = 0; - io->ipsec_out_attach_if = B_FALSE; - ASSERT(attach_ill != NULL); - ill_refrele(attach_ill); - attach_ill = NULL; - } - - ASSERT(attach_ill == NULL); ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); if (ipif == NULL) { if (need_decref) @@ -21429,25 +21120,11 @@ multicast: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - ASSERT(attach_ill == ipif->ipif_ill); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - io->ipsec_out_ill_index = - attach_ill->ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - io->ipsec_out_ill_index = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - } + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + io->ipsec_out_ill_index = + ipif->ipif_ill->ill_phyint->phyint_ifindex; + if (connp != NULL) { io->ipsec_out_multicast_loop = connp->conn_multicast_loop; @@ -21469,9 +21146,7 @@ multicast: * * NOTE : We need to do it for non-secure case also as * this might go out secure if there is a global policy - * match in ip_wput_ire. For bind to IPIF_NOFAILOVER - * address, the source should be initialized already and - * hence we won't be initializing here. + * match in ip_wput_ire. * * As we do not have the ire yet, it is possible that * we set the source address here and then later discover @@ -21507,14 +21182,6 @@ multicast: zoneid, MBLK_GETLABEL(mp), match_flags, ipst); } - /* - * refrele attach_ill as its not needed anymore. - */ - if (attach_ill != NULL) { - ill_refrele(attach_ill); - attach_ill = NULL; - } - if (ire == NULL) { /* * Multicast loopback and multicast forwarding is @@ -21630,33 +21297,9 @@ noroute: ipif_refrele(dst_ipif); } } - /* - * If we are bound to IPIF_NOFAILOVER address, look for - * an IRE_CACHE matching the ill. - */ -send_from_ill: - if (attach_ill != NULL) { - ipif_t *attach_ipif; - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - - attach_ipif = ipif_get_next_ipif(NULL, attach_ill); - if (attach_ipif == NULL) { - ip1dbg(("ip_wput: No ipif for attach_ill\n")); - goto discard_pkt; - } - ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ipif_refrele(attach_ipif); - } else if (xmit_ill != NULL) { +send_from_ill: + if (xmit_ill != NULL) { ipif_t *ipif; /* @@ -21681,6 +21324,10 @@ send_from_ill: goto drop_pkt; } + match_flags = 0; + if (IS_UNDER_IPMP(xmit_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + /* * Look for a ire that is part of the group, * if found use it else call ip_newroute_ipif. @@ -21689,7 +21336,7 @@ send_from_ill: * ill is accessible from all zones i.e has a * valid ipif in all zones. */ - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); /* @@ -21729,12 +21376,7 @@ send_from_ill: ipst); } if (!ire) { - /* - * Make sure we don't load spread if this - * is IPIF_NOFAILOVER case. 
- */ - if ((attach_ill != NULL) || - (ip_nexthop && !ignore_nexthop)) { + if (ip_nexthop && !ignore_nexthop) { if (mctl_present) { io = (ipsec_out_t *)first_mp->b_rptr; ASSERT(first_mp->b_datap->db_type == @@ -21764,15 +21406,8 @@ send_from_ill: first_mp->b_cont = mp; mctl_present = B_TRUE; } - if (attach_ill != NULL) { - io->ipsec_out_ill_index = attach_ill-> - ill_phyint->phyint_ifindex; - io->ipsec_out_attach_if = B_TRUE; - } else { - io->ipsec_out_ip_nexthop = ip_nexthop; - io->ipsec_out_nexthop_addr = - nexthop_addr; - } + io->ipsec_out_ip_nexthop = ip_nexthop; + io->ipsec_out_nexthop_addr = nexthop_addr; } noirefound: /* @@ -21787,8 +21422,6 @@ noirefound: ip_newroute(q, first_mp, dst, connp, zoneid, ipst); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, "ip_wput_end: q %p (%S)", q, "newroute"); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21869,8 +21502,6 @@ noirefound: ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); } } - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); if (need_decref) @@ -21896,8 +21527,6 @@ drop_pkt: if (need_decref) CONN_DEC_REF(connp); freemsg(first_mp); - if (attach_ill != NULL) - ill_refrele(attach_ill); if (xmit_ill != NULL) ill_refrele(xmit_ill); TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, @@ -21923,8 +21552,8 @@ ip_wput(queue_t *q, mblk_t *mp) /* * * The following rules must be observed when accessing any ipif or ill - * that has been cached in the conn. Typically conn_nofailover_ill, - * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. + * that has been cached in the conn. Typically conn_outgoing_ill, + * conn_multicast_ipif and conn_multicast_ill. * * Access: The ipif or ill pointed to from the conn can be accessed under * the protection of the conn_lock or after it has been refheld under the @@ -21944,10 +21573,8 @@ ip_wput(queue_t *q, mblk_t *mp) * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock * or a reference to the ipif or a reference to an ire that references the - * ipif. An ipif does not change its ill except for failover/failback. Since - * failover/failback happens only after bringing down the ipif and making sure - * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock - * the above holds. + * ipif. An ipif only changes its ill when migrating from an underlying ill + * to an IPMP ill in ipif_up(). */ ipif_t * conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) @@ -22302,96 +21929,6 @@ ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, zoneid)); } -ire_t * -conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) -{ - ipaddr_t addr; - ire_t *save_ire; - irb_t *irb; - ill_group_t *illgrp; - int err; - - save_ire = ire; - addr = ire->ire_addr; - - ASSERT(ire->ire_type == IRE_BROADCAST); - - illgrp = connp->conn_outgoing_ill->ill_group; - if (illgrp == NULL) { - *conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ire_refrele(save_ire); - return (NULL); - } - return (save_ire); - } - /* - * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. - * If it is part of the group, we need to send on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs - * to this group. This is okay as IP_BOUND_IF really means - * any ill in the group. 
We depend on the fact that the - * first ire in the group is always cleared of IRE_MARK_NORECV - * if such an ire exists. This is possible only if you have - * at least one ill in the group that has not failed. - * - * First get to the ire that matches the address and group. - * - * We don't look for an ire with a matching zoneid because a given zone - * won't always have broadcast ires on all ills in the group. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - if (ire->ire_marks & IRE_MARK_NORECV) { - /* - * If the current zone only has an ire broadcast for this - * address marked NORECV, the ire we want is ahead in the - * bucket, so we look it up deliberately ignoring the zoneid. - */ - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - continue; - /* skip over deleted ires */ - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - } - } - while (ire != NULL) { - /* - * If a new interface is coming up, we could end up - * seeing the loopback ire and the non-loopback ire - * may not have been added yet. So check for ire_stq - */ - if (ire->ire_stq != NULL && (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp)) { - break; - } - ire = ire->ire_next; - } - if (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - ire_refrele(save_ire); - *conn_outgoing_ill = ire_to_ill(ire); - /* - * Refhold the ill to make the conn_outgoing_ill - * independent of the ire. ip_wput_ire goes in a loop - * and may refrele the ire. Since we have an ire at this - * point we don't need to use ILL_CAN_LOOKUP on the ill. - */ - ill_refhold(*conn_outgoing_ill); - return (ire); - } - rw_exit(&irb->irb_lock); - ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); - /* - * If we can't find a suitable ire, return the original ire. - */ - return (save_ire); -} - /* * This function does the ire_refrele of the ire passed in as the * argument. As this function looks up more ires i.e broadcast ires, @@ -22401,7 +21938,6 @@ conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) * IPQoS Notes: * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for * IPsec packets are done in ipsec_out_process. - * */ void ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, @@ -22471,9 +22007,8 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -22489,36 +22024,15 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, * conn_outgoing_ill variable is used only in the broadcast loop. * for performance we don't grab the mutexs in the fastpath */ - if ((connp != NULL) && - (ire->ire_type == IRE_BROADCAST) && - ((connp->conn_nofailover_ill != NULL) || - (connp->conn_outgoing_ill != NULL))) { - /* - * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF - * option. So, see if this endpoint is bound to a - * IPIF_NOFAILOVER address. If so, honor it. This implies - * that if the interface is failed, we will still send - * the packet on the same ill which is what we want. 
- */ + if (ire->ire_type == IRE_BROADCAST && connp != NULL && + connp->conn_outgoing_ill != NULL) { conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); + &connp->conn_outgoing_ill, &err); if (err == ILL_LOOKUP_FAILED) { ire_refrele(ire); freemsg(mp); return; } - if (conn_outgoing_ill == NULL) { - /* - * Choose a good ill in the group to send the - * packets on. - */ - ire = conn_set_outgoing_ill(connp, ire, - &conn_outgoing_ill); - if (ire == NULL) { - freemsg(mp); - return; - } - } } if (mp->b_datap->db_type != M_CTL) { @@ -22578,7 +22092,7 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (ipha->ipha_src == INADDR_ANY && !unspec_src) ipha->ipha_src = src_ire->ire_src_addr; ire_refrele(src_ire); @@ -22741,39 +22255,7 @@ another:; */ ASSERT(ire->ire_ipversion == IPV4_VERSION); - /* - * With IP multipathing, broadcast packets are sent on the ire - * that has been cleared of IRE_MARK_NORECV and that belongs to - * the group. However, this ire might not be in the same zone so - * we can't always use its source address. We look for a - * broadcast ire in the same group and in the right zone. - */ - if (ire->ire_type == IRE_BROADCAST && - ire->ire_zoneid != zoneid) { - ire_t *src_ire = ire_ctable_lookup(dst, 0, - IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); - if (src_ire != NULL) { - src = src_ire->ire_src_addr; - ire_refrele(src_ire); - } else { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - freemsg(first_mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - return; - } - } else { - src = ire->ire_src_addr; - } - + src = ire->ire_src_addr; if (connp == NULL) { ip1dbg(("ip_wput_ire: no connp and no src " "address for dst 0x%x, using src 0x%x\n", @@ -22917,10 +22399,9 @@ another:; ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> - ill_phyint->phyint_ifindex; - - ipsec_out_process(q, first_mp, ire, ill_index); + io->ipsec_out_ill_index = + ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; + ipsec_out_process(q, first_mp, ire, 0); ire_refrele(ire); if (conn_outgoing_ill != NULL) ill_refrele(conn_outgoing_ill); @@ -22960,7 +22441,7 @@ another:; if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* Got one */ @@ -23147,71 +22628,16 @@ broadcast: * back outbound packets in different zones but on the * same ill, as the application would see duplicates. * - * If the interfaces are part of the same group, - * we would want to send only one copy out for - * whole group. - * * This logic assumes that ire_add_v4() groups the * IRE_BROADCAST entries so that those with the same - * ire_addr and ill_group are kept together. + * ire_addr are kept together. 
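The broadcast fan-out loop above relies on ire_add_v4() keeping same-address IRE_BROADCAST entries adjacent, and then transmits at most one copy per distinct ill, collapsing per-zone duplicates on the same ill. The following is a simplified model of that walk; it additionally assumes that duplicates for one ill sit next to each other, and the bcast_ire_t type and bcast_fanout() helper are invented for the sketch.

#include <stdio.h>
#include <string.h>

typedef struct bcast_ire {
        const char              *bi_ill;        /* interface owning the entry */
        const char              *bi_addr;       /* broadcast address */
        struct bcast_ire        *bi_next;       /* same-address entries adjacent */
} bcast_ire_t;

static void
bcast_fanout(bcast_ire_t *ire)
{
        const char *addr = ire->bi_addr;

        while (ire != NULL && strcmp(ire->bi_addr, addr) == 0) {
                printf("send copy on %s\n", ire->bi_ill);
                /* skip adjacent duplicates (e.g. per-zone) on the same ill */
                while (ire->bi_next != NULL &&
                    strcmp(ire->bi_next->bi_addr, addr) == 0 &&
                    strcmp(ire->bi_next->bi_ill, ire->bi_ill) == 0)
                        ire = ire->bi_next;
                ire = ire->bi_next;
        }
}

int
main(void)
{
        bcast_ire_t c = { "hme1", "10.0.0.255", NULL };
        bcast_ire_t b = { "hme0", "10.0.0.255", &c };
        bcast_ire_t a = { "hme0", "10.0.0.255", &b };

        bcast_fanout(&a);       /* one copy on hme0, one on hme1 */
        return (0);
}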
*/ ire_ill = ire->ire_ipif->ipif_ill; - if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { - if (ire_ill->ill_group != NULL && - (ire->ire_marks & IRE_MARK_NORECV)) { - /* - * If the current zone only has an ire - * broadcast for this address marked - * NORECV, the ire we want is ahead in - * the bucket, so we look it up - * deliberately ignoring the zoneid. - */ - for (ire1 = ire->ire_bucket->irb_ire; - ire1 != NULL; - ire1 = ire1->ire_next) { - ire1_ill = - ire1->ire_ipif->ipif_ill; - if (ire1->ire_addr != dst) - continue; - /* skip over the current ire */ - if (ire1 == ire) - continue; - /* skip over deleted ires */ - if (ire1->ire_marks & - IRE_MARK_CONDEMNED) - continue; - /* - * non-loopback ire in our - * group: use it for the next - * pass in the loop - */ - if (ire1->ire_stq != NULL && - ire1_ill->ill_group == - ire_ill->ill_group) - break; - } - } - } else { + if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { while (ire1 != NULL && ire1->ire_addr == dst) { ire1_ill = ire1->ire_ipif->ipif_ill; - /* - * We can have two broadcast ires on the - * same ill in different zones; here - * we'll send a copy of the packet on - * each ill and the fanout code will - * call conn_wantpacket() to check that - * the zone has the broadcast address - * configured on the ill. If the two - * ires are in the same group we only - * send one copy up. - */ - if (ire1_ill != ire_ill && - (ire1_ill->ill_group == NULL || - ire_ill->ill_group == NULL || - ire1_ill->ill_group != - ire_ill->ill_group)) { + if (ire1_ill != ire_ill) break; - } ire1 = ire1->ire_next; } } @@ -23403,13 +22829,8 @@ multi_loopback: * logic. */ if (ill != NULL) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill(ill, ipha->ipha_dst, - ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill(ill, ipha->ipha_dst, + ALL_ZONES) != NULL) { /* * Pass along the virtual output q. * ip_wput_local() will distribute the @@ -23565,18 +22986,17 @@ checksumoptions: ire1 != NULL; ire1 = ire1->ire_next) { if (!(ire1->ire_flags & - RTF_MULTIRT)) { + RTF_MULTIRT)) continue; - } + if (ire1->ire_addr != - ire->ire_addr) { + ire->ire_addr) continue; - } + if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* Got one */ IRE_REFHOLD(ire1); @@ -24743,9 +24163,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if (first_ire != NULL) { @@ -24808,7 +24227,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* * Ensure we do not exceed the MTU @@ -25130,10 +24549,9 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) { + (IRE_MARK_CONDEMNED | + IRE_MARK_TESTHIDDEN)) continue; - } /* * Ensure we do not exceed the MTU * of the next route. 
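For reference, a minimal sketch of the duplicate-suppression walk that the broadcast fanout above relies on, assuming only the fields visible in that hunk (ire_addr, ire_next, ire_ipif); the helper name is illustrative and is not code from this change.

/*
 * Illustrative sketch: ire_add_v4() keeps IRE_BROADCAST entries with the
 * same ire_addr adjacent in the bucket, so finding the next copy that
 * belongs to a different ill is a simple linear walk.
 */
static ire_t *
next_bcast_ire_on_other_ill(ire_t *ire, ipaddr_t dst)
{
	ill_t	*ire_ill = ire->ire_ipif->ipif_ill;
	ire_t	*ire1;

	for (ire1 = ire->ire_next; ire1 != NULL && ire1->ire_addr == dst;
	    ire1 = ire1->ire_next) {
		if (ire1->ire_ipif->ipif_ill != ire_ill)
			return (ire1);
	}
	return (NULL);
}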
@@ -25500,6 +24918,7 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, ilm_t *ilm; mblk_t *mp1; zoneid_t last_zoneid; + ilm_walker_t ilw; if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { ASSERT(ire_type == IRE_BROADCAST); @@ -25524,11 +24943,9 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, * have been exhausted. */ last_zoneid = -1; - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if ((ilm->ilm_flags & ILM_DELETED) || - ipha->ipha_dst != ilm->ilm_addr || + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ipha->ipha_dst != ilm->ilm_addr || ilm->ilm_zoneid == last_zoneid || ilm->ilm_zoneid == zoneid || !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) @@ -25536,12 +24953,12 @@ ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, mp1 = ip_copymsg(first_mp); if (mp1 == NULL) continue; - icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, - mctl_present, B_FALSE, ill, + icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, + 0, 0, mctl_present, B_FALSE, ill, ilm->ilm_zoneid); last_zoneid = ilm->ilm_zoneid; } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); /* * Loopback case: the sending endpoint has * IP_MULTICAST_LOOP disabled, therefore we don't @@ -25859,14 +25276,9 @@ ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) * caller and hence matching on ILL (MATCH_IRE_ILL) would * be sufficient rather than MATCH_IRE_IPIF. * - * This function is used for sending IGMP packets. We need - * to make sure that we send the packet out of the interface - * (ipif->ipif_ill) where we joined the group. This is to - * prevent from switches doing IGMP snooping to send us multicast - * packets for a given group on the interface we have joined. - * If we can't find an ire, igmp_sendpkt has already initialized - * ipsec_out_attach_if so that this will not be load spread in - * ip_newroute_ipif. + * This function is used for sending IGMP packets. For IPMP, + * we sidestep IGMP snooping issues by sending all multicast + * traffic on a single interface in the IPMP group. */ ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, MATCH_IRE_ILL, ipst); @@ -26035,7 +25447,7 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, ip6_t *ip6h1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if, hwaccel; + boolean_t hwaccel; uint32_t flags = IP6_NO_IPPOLICY; int match_flags; zoneid_t zoneid; @@ -26052,42 +25464,22 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, if (io->ipsec_out_reachable) { flags |= IPV6_REACHABILITY_CONFIRMATION; } - attach_if = io->ipsec_out_attach_if; hwaccel = io->ipsec_out_accelerated; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; /* Multicast addresses should have non-zero ill_index. */ v6dstp = &ip6h->ip6_dst; ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); - ASSERT(!attach_if || ill_index != 0); - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, - B_TRUE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. 
- */ - if (attach_if) { - match_flags = MATCH_IRE_ILL; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } ASSERT(mp != NULL); @@ -26138,32 +25530,15 @@ ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, return; } - ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, + ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, unspec_src, zoneid); ipif_refrele(ipif); } else { - if (attach_if) { - ipif_t *ipif; - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); - ire_need_rele = B_TRUE; - ipif_refrele(ipif); + if (ire_arg != NULL) { + ire = ire_arg; } else { - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, - ipst); - ire_need_rele = B_TRUE; - } + ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); + ire_need_rele = B_TRUE; } if (ire != NULL) goto send; @@ -26350,7 +25725,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ipha_t *ipha1; uint_t ill_index; ipsec_out_t *io; - boolean_t attach_if; int match_flags; irb_t *irb = NULL; boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; @@ -26372,39 +25746,19 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, io = (ipsec_out_t *)ipsec_mp->b_rptr; ill_index = io->ipsec_out_ill_index; - attach_if = io->ipsec_out_attach_if; zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); ipst = io->ipsec_out_ns->netstack_ip; ASSERT(io->ipsec_out_ns != NULL); - match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; - if (ill_index != 0) { - if (ill == NULL) { - ill = ip_grab_attach_ill(NULL, ipsec_mp, - ill_index, B_FALSE, ipst); - - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - /* - * If this packet needs to go out on a particular interface - * honor it. - */ - if (attach_if) { - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; + if (ill == NULL && ill_index != 0) { + ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); + /* Failure case frees things for us. */ + if (ill == NULL) + return; - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(ill)) { - match_flags |= MATCH_IRE_MARK_HIDDEN; - } - } + ill_need_rele = B_TRUE; } if (CLASSD(dst)) { @@ -26474,17 +25828,12 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, zoneid, &zero_info); } else { - if (attach_if) { - ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, - zoneid, MBLK_GETLABEL(mp), match_flags, ipst); + if (ire_arg != NULL) { + ire = ire_arg; + ire_need_rele = B_FALSE; } else { - if (ire_arg != NULL) { - ire = ire_arg; - ire_need_rele = B_FALSE; - } else { - ire = ire_cache_lookup(dst, zoneid, - MBLK_GETLABEL(mp), ipst); - } + ire = ire_cache_lookup(dst, zoneid, + MBLK_GETLABEL(mp), ipst); } if (ire != NULL) { goto send; @@ -26613,11 +25962,9 @@ send: (void *)ire->ire_ipif, (void *)ipif)); /* - * Multiroute the secured packet, unless IPsec really - * requires the packet to go out only through a particular - * interface. + * Multiroute the secured packet. */ - if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { + if (ire->ire_flags & RTF_MULTIRT) { ire_t *first_ire; irb = ire->ire_bucket; ASSERT(irb != NULL); @@ -26634,9 +25981,8 @@ send: if ((first_ire->ire_flags & RTF_MULTIRT) && (first_ire->ire_addr == ire->ire_addr) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; - } } if ((first_ire != NULL) && (first_ire != ire)) { @@ -26657,11 +26003,6 @@ send: multirt_send = B_TRUE; max_frag = ire->ire_max_frag; - } else { - if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { - ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " - "flag, attach_if %d\n", attach_if)); - } } /* @@ -26689,7 +26030,7 @@ send: if (ire1->ire_addr != ire->ire_addr) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) continue; /* No loopback here */ if (ire1->ire_stq == NULL) @@ -27155,10 +26496,8 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) * before sending it the accelerated packet. */ if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { - int ifindex; ill = ire_to_ill(ire); - ifindex = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_capab_ill_index = ifindex; + io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; } /* @@ -27284,17 +26623,18 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) } } /* - * We are done with IPsec processing. Send it over - * the wire. + * We are done with IPsec processing. Send it over the wire. */ done: mp = ipsec_mp->b_cont; ipha = (ipha_t *)mp->b_rptr; if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); + ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, + ire); } else { ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); + ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, + ire); } if (ill != NULL && ill_need_rele) ill_refrele(ill); @@ -27356,18 +26696,16 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ipip = ip_sioctl_lookup(iocp->ioc_cmd); if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { /* - * Special case where ipsq_current_ipif is not set: + * Special case where ipx_current_ipif is not set: * ill_phyint_reinit merged the v4 and v6 into a single ipsq. 
- * ill could also have become part of a ipmp group in the - * process, we are here as were not able to complete the - * operation in ipif_set_values because we could not become - * exclusive on the new ipsq, In such a case ipsq_current_ipif - * will not be set so we need to set it. + * We are here as were not able to complete the operation in + * ipif_set_values because we could not become exclusive on + * the new ipsq. */ ill_t *ill = q->q_ptr; ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); } - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL); if (ipip->ipi_cmd_type == IF_CMD) { /* This a old style SIOC[GS]IF* command */ @@ -27381,8 +26719,8 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) sin = NULL; } - err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, - ipip, mp1->b_rptr); + err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin, + q, mp, ipip, mp1->b_rptr); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); } @@ -27424,6 +26762,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_extract_func_t *extract_funcp; cmd_info_t ci; int err; + boolean_t entered_ipsq = B_FALSE; ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); @@ -27505,18 +26844,21 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) return; } + ASSERT(ci.ci_ipif != NULL); + /* - * If ipsq is non-null, we are already being called exclusively on an - * ill but in the case of a failover in progress it is the "from" ill, - * rather than the "to" ill (which is the ill ptr passed in). - * In order to ensure we are exclusive on both ILLs we rerun - * ipsq_try_enter() here, ipsq's support recursive entry. + * If ipsq is non-NULL, we are already being called exclusively. */ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); - ASSERT(ci.ci_ipif != NULL); - - ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, - NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, + NEW_OP, B_TRUE); + if (ipsq == NULL) { + ipif_refrele(ci.ci_ipif); + return; + } + entered_ipsq = B_TRUE; + } /* * Release the ipif so that ipif_down and friends that wait for @@ -27525,8 +26867,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * the ipif. */ ipif_refrele(ci.ci_ipif); - if (ipsq == NULL) - return; ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); @@ -27535,19 +26875,12 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) * where we set the IPIF_CHANGING flag. This ensures that there won't * be any new references to the ipif. This helps functions that go * through this path and end up trying to wait for the refcnts - * associated with the ipif to go down to zero. Some exceptions are - * Failover, Failback, and Groupname commands that operate on more than - * just the ci.ci_ipif. These commands internally determine the - * set of ipif's they operate on and set and clear the IPIF_CHANGING - * flags on that set. Another exception is the Removeif command that - * sets the IPIF_CONDEMNED flag internally after identifying the right - * ipif to operate on. + * associated with the ipif to go down to zero. The exception is + * SIOCSLIFREMOVEIF, which sets IPIF_CONDEMNED internally after + * identifying the right ipif to operate on. 
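The exclusive-entry logic in ip_process_ioctl() above is spread across several hunks; pulled together, and assuming only the calls shown there, the pattern is roughly the following sketch (not verbatim kernel code).

/* Illustrative sketch of the ipsq entry/exit pairing. */
boolean_t entered_ipsq = B_FALSE;

if (ipsq == NULL) {
	/* Not already exclusive; serialize on the ipif's ipsq. */
	ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
	    NEW_OP, B_TRUE);
	if (ipsq == NULL) {
		ipif_refrele(ci.ci_ipif);
		return;
	}
	entered_ipsq = B_TRUE;
}

/* ... perform the ioctl exclusively ... */

if (entered_ipsq)
	ipsq_exit(ipsq);	/* exit only if we entered here */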
*/ mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); - if (ipip->ipi_cmd != SIOCLIFREMOVEIF && - ipip->ipi_cmd != SIOCLIFFAILOVER && - ipip->ipi_cmd != SIOCLIFFAILBACK && - ipip->ipi_cmd != SIOCSLIFGROUPNAME) + if (ipip->ipi_cmd != SIOCLIFREMOVEIF) (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); @@ -27560,7 +26893,8 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); - ipsq_exit(ipsq); + if (entered_ipsq) + ipsq_exit(ipsq); } /* @@ -27708,7 +27042,7 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * Refhold the conn, till the ioctl completes. This is * needed in case the ioctl ends up in the pending mp * list. Every mp in the ill_pending_mp list and - * the ipsq_pending_mp must have a refhold on the conn + * the ipx_pending_mp must have a refhold on the conn * to resume processing. The refhold is released when * the ioctl completes. (normally or abnormally) * In all cases ip_ioctl_finish is called to finish @@ -27753,8 +27087,25 @@ nak: if (CONN_Q(q)) goto nak; - /* Finish socket ioctls passed through to ARP. */ - ip_sioctl_iocack(q, mp); + /* + * Finish socket ioctls passed through to ARP. We use the + * ioc_cmd values we set in ip_sioctl_arp() to decide whether + * we need to become writer before calling ip_sioctl_iocack(). + * Note that qwriter_ip() will release the refhold, and that a + * refhold is OK without ILL_CAN_LOOKUP() since we're on the + * ill stream. + */ + iocp = (struct iocblk *)mp->b_rptr; + if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { + ip_sioctl_iocack(NULL, q, mp, NULL); + return; + } + + ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || + iocp->ioc_cmd == AR_ENTRY_ADD); + ill = q->q_ptr; + ill_refhold(ill); + qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); return; case M_FLUSH: if (*mp->b_rptr & FLUSHW) @@ -28021,11 +27372,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, + nce = ndp_lookup_v6(ill, B_FALSE, &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { nce_resolv_failed(nce); @@ -28061,10 +27412,11 @@ nak: gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, - B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &ire->ire_addr_v6, B_FALSE); } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); + nce = ndp_lookup_v6(ill, B_FALSE, + &gw_addr_v6, B_FALSE); } if (nce != NULL) { /* @@ -28238,13 +27590,14 @@ nak: fake_ire = (ire_t *)mp->b_rptr; /* - * By the time we come back here from ARP the incomplete ire - * created in ire_forward() could have been removed. We use - * the parameters stored in the fake_ire to specify the real - * ire as explicitly as possible. This avoids problems when - * IPMP groups are configured as an ipif can 'float' - * across several ill queues. We can be confident that the - * the inability to find an ire is because it no longer exists. + * By the time we come back here from ARP the logical outgoing + * interface of the incomplete ire we added in ire_forward() + * could have disappeared, causing the incomplete ire to also + * disappear. So we need to retreive the proper ipif for the + * ire before looking in ctable. 
In the case of IPMP, the + * ipif may be on the IPMP ill, so look it up based on the + * ire_ipif_ifindex we stashed back in ire_init_common(). + * Then, we can verify that ire_ipif_seqid still exists. */ ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); @@ -28299,6 +27652,7 @@ nak: freemsg(mp); /* fake ire */ return; } + nce = ire->ire_nce; DTRACE_PROBE2(ire__arpresolve__type, ire_t *, ire, nce_t *, nce); @@ -29030,7 +28384,7 @@ boolean_t conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; + ill_t *bound_ill; boolean_t found; ipif_t *ipif; ire_t *ire; @@ -29045,32 +28399,15 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, * unicast, broadcast and multicast reception to * conn_incoming_ill. conn_wantpacket itself is called * only for BROADCAST and multicast. - * - * 1) ip_rput supresses duplicate broadcasts if the ill - * is part of a group. Hence, we should be receiving - * just one copy of broadcast for the whole group. - * Thus, if it is part of the group the packet could - * come on any ill of the group and hence we need a - * match on the group. Otherwise, match on ill should - * be sufficient. - * - * 2) ip_rput does not suppress duplicate multicast packets. - * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group. - */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - if (in_ill->ill_group == NULL) { - if (in_ill != ill) + */ + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) return (B_FALSE); - } else if (in_ill->ill_group != ill->ill_group) { - return (B_FALSE); } } @@ -29079,15 +28416,14 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, return (B_TRUE); /* * The conn is in a different zone; we need to check that this - * broadcast address is configured in the application's zone and - * on one ill in the group. + * broadcast address is configured in the application's zone. */ ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) return (B_FALSE); ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, connp->conn_zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); + (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ipif_refrele(ipif); if (ire != NULL) { ire_refrele(ire); @@ -29171,7 +28507,7 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } ipsq = ill->ill_phyint->phyint_ipsq; - ipif = ipsq->ipsq_pending_ipif; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; mp1 = ipsq_pending_mp_get(ipsq, &connp); ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); if (mp1 == NULL) { @@ -29181,12 +28517,12 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) } /* - * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we * must have an associated conn_t. Otherwise, we're bringing this * interface back up as part of handling an asynchronous event (e.g., * physical address change). 
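The conn_incoming_ill test in conn_wantpacket() above reduces to a small predicate. A sketch in isolation, assuming the IS_IPMP() macro and ill_grp field used in that hunk; the helper name is illustrative.

/*
 * Illustrative sketch: a conn bound to the IPMP meta-interface accepts
 * packets arriving on any ill in that group; a conn bound to a specific
 * ill accepts packets from that ill only.
 */
static boolean_t
bound_ill_accepts(ill_t *bound_ill, ill_t *recv_ill)
{
	if (bound_ill == NULL)
		return (B_TRUE);
	if (IS_IPMP(bound_ill))
		return (bound_ill->ill_grp == recv_ill->ill_grp);
	return (bound_ill == recv_ill);
}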
*/ - if (ipsq->ipsq_current_ioctl != 0) { + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { ASSERT(connp != NULL); q = CONNP_TO_WQ(connp); } else { @@ -29219,16 +28555,28 @@ ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) return; } - if (ill->ill_up_ipifs) - ill_group_cleanup(ill); + /* + * If we have a moved ipif to bring up, and everything has succeeded + * to this point, bring it up on the IPMP ill. Otherwise, leave it + * down -- the admin can try to bring it up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + if (err == 0) { + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) + return; + } + } /* * The operation must complete without EINPROGRESS since - * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. - * Otherwise, the operation will be stuck forever in the ipsq. + * ipsq_pending_mp_get() has removed the mblk. Otherwise, the + * operation will be stuck forever in the ipsq. */ ASSERT(err != EINPROGRESS); - if (ipsq->ipsq_current_ioctl != 0) + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); else ipsq_current_finish(ipsq); @@ -29649,124 +28997,6 @@ ip_int_set(queue_t *q, mblk_t *mp, char *value, return (0); } -/* - * Handle changes to ipmp_hook_emulation ndd variable. - * Need to update phyint_hook_ifindex. - * Also generate a nic plumb event should a new ifidex be assigned to a group. - */ -static void -ipmp_hook_emulation_changed(ip_stack_t *ipst) -{ - phyint_t *phyi; - phyint_t *phyi_tmp; - char *groupname; - int namelen; - ill_t *ill; - boolean_t new_group; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - /* - * Look for other phyint in group. - * Clear name/namelen so the lookup doesn't find ourselves. - */ - namelen = phyi->phyint_groupname_len; - groupname = phyi->phyint_groupname; - phyi->phyint_groupname_len = 0; - phyi->phyint_groupname = NULL; - - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - /* Restore */ - phyi->phyint_groupname_len = namelen; - phyi->phyint_groupname = groupname; - - new_group = B_FALSE; - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists and has already - * been assigned a group ifindex, we use the existing - * group_ifindex, otherwise we pick a new group_ifindex - * here. - */ - if (phyi_tmp != NULL && - phyi_tmp->phyint_group_ifindex != 0) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. */ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - new_group = B_TRUE; - } - } else { - phyi->phyint_group_ifindex = 0; - } - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - /* - * For IP Filter to find out the relationship between - * names and interface indicies, we need to generate - * a NE_PLUMB event when a new group can appear. 
- * We always generate events when a new interface appears - * (even when ipmp_hook_emulation is set) so there - * is no need to generate NE_PLUMB events when - * ipmp_hook_emulation is turned off. - * And since it isn't critical for IP Filter to get - * the NE_UNPLUMB events we skip those here. - */ - if (new_group) { - /* - * First phyint in group - generate group PLUMB event. - * Since we are not running inside the ipsq we do - * the dispatch immediately. - */ - if (phyi->phyint_illv4 != NULL) - ill = phyi->phyint_illv4; - else - ill = phyi->phyint_illv6; - - if (ill != NULL) - ill_nic_event_plumb(ill, B_TRUE); - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - -/* ARGSUSED */ -static int -ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value, - caddr_t addr, cred_t *cr) -{ - int *v = (int *)addr; - long new_value; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - if (ddi_strtol(value, NULL, 10, &new_value) != 0) - return (EINVAL); - - if (*v != new_value) { - *v = new_value; - ipmp_hook_emulation_changed(ipst); - } - return (0); -} - static void * ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) { @@ -30448,12 +29678,12 @@ next_mp: arpce->nce_state = ND_INCOMPLETE; mutex_exit(&arpce->nce_lock); + /* * Note that ire_add() (called from ire_forward()) * holds a ref on the ire until ARP is completed. */ - - ire_arpresolve(ire, ire_to_ill(ire)); + ire_arpresolve(ire); return (LOOKUP_IN_PROGRESS); default: ASSERT(0); @@ -30596,7 +29826,7 @@ ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill, return (ALL_ZONES); if (IN6_IS_ADDR_LINKLOCAL(addr)) { - ire_flags |= MATCH_IRE_ILL_GROUP; + ire_flags |= MATCH_IRE_ILL; ipif_arg = ill->ill_ipif; } if (lookup_zoneid != ALL_ZONES) @@ -30648,20 +29878,24 @@ void ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst) { + mblk_t *mp2; ipobs_cb_t *ipobs_cb; + ipobs_hook_data_t *ihd; + uint64_t grifindex = 0; ASSERT(DB_TYPE(mp) == M_DATA); + if (IS_UNDER_IPMP(ill)) + grifindex = ipmp_ill_get_ipmp_ifindex(ill); + mutex_enter(&ipst->ips_ipobs_cb_lock); ipst->ips_ipobs_cb_nwalkers++; mutex_exit(&ipst->ips_ipobs_cb_lock); for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL; ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) { - mblk_t *mp2 = allocb(sizeof (ipobs_hook_data_t), - BPRI_HI); + mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI); if (mp2 != NULL) { - ipobs_hook_data_t *ihd = - (ipobs_hook_data_t *)mp2->b_rptr; + ihd = (ipobs_hook_data_t *)mp2->b_rptr; if (((ihd->ihd_mp = dupmsg(mp)) == NULL) && ((ihd->ihd_mp = copymsg(mp)) == NULL)) { freemsg(mp2); @@ -30673,6 +29907,7 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, ihd->ihd_zsrc = zsrc; ihd->ihd_zdst = zdst; ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex; + ihd->ihd_grifindex = grifindex; ihd->ihd_stack = ipst->ips_netstack; mp2->b_wptr += sizeof (*ihd); ipobs_cb->ipobs_cbfunc(mp2); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index fe326778c2..6e63af32b3 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -95,7 +95,6 @@ #include <sys/pattr.h> #include <inet/ipclassifier.h> #include <inet/ipsecah.h> -#include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> #include <sys/squeue_impl.h> @@ -186,7 +185,7 @@ const in6_addr_t ipv6_solicited_node_mcast = #define IP6_MBLK_HDR_ERR 1 #define IP6_MBLK_LEN_ERR 2 -static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill, +static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *, boolean_t, zoneid_t); static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); @@ -208,11 +207,13 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, ill_t *, ill_t *, uint_t, boolean_t, zoneid_t); static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, uint8_t *, uint_t, uint8_t, ip_stack_t *); -static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *, +static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *, ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *); static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, - conn_t *, int, int, int, zoneid_t); + conn_t *, int, int, zoneid_t); +static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, + ipif_t **); /* * A template for an IPv6 AR_ENTRY_QUERY @@ -248,15 +249,14 @@ static areq_t ipv6_areq_template = { * call icmp_inbound_v6() for each relevant zone. */ static void -icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, - boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp) +icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, + uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid, + mblk_t *dl_mp) { icmp6_t *icmp6; ip6_t *ip6h; boolean_t interested; - ip6i_t *ip6i; in6_addr_t origsrc; - ire_t *ire; mblk_t *first_mp; ipsec_in_t *ii; ip_stack_t *ipst = ill->ill_ipst; @@ -344,7 +344,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, break; case ICMP6_PACKET_TOO_BIG: - icmp_inbound_too_big_v6(q, first_mp, ill, mctl_present, + icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present, zoneid); return; case ICMP6_ECHO_REQUEST: @@ -422,66 +422,6 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, * checksum field. The checksum is calculated in ip_wput_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; - /* - * ICMP echo replies should go out on the same interface - * the request came on as probes used by in.mpathd for - * detecting NIC failures are ECHO packets. We turn-off load - * spreading by allocating a ip6i and setting ip6i_attach_if - * to B_TRUE which is handled both by ip_wput_v6 and - * ip_newroute_v6. If we don't turnoff load spreading, - * the packets might get dropped if there are no - * non-FAILED/INACTIVE interfaces for it to go out on and - * in.mpathd would wrongly detect a failure or mis-detect - * a NIC failure as a link failure. As load spreading can - * happen only if ill_group is not NULL, we do only for - * that case and this does not affect the normal case. - * - * We force this only on echo packets that came from on-link - * hosts. We restrict this to link-local addresses which - * is used by in.mpathd for probing. In the IPv6 case, - * default routes typically have an ire_ipif pointer and - * hence a MATCH_IRE_ILL later in ip_newroute_v6/ip_wput_v6 - * might work. 
As a default route out of this interface - * may not be present, enforcing this packet to go out in - * this case may not work. - */ - if (ill->ill_group != NULL && - IN6_IS_ADDR_LINKLOCAL(&origsrc)) { - /* - * If we are sending replies to ourselves, don't - * set ATTACH_IF as we may not be able to find - * the IRE_LOCAL on this ill i.e setting ATTACH_IF - * causes ip_wput_v6 to look for an IRE_LOCAL on - * "ill" which it may not find and will try to - * create an IRE_CACHE for our local address. Once - * we do this, we will try to forward all packets - * meant to our LOCAL address. - */ - ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, - NULL, ipst); - if (ire == NULL || ire->ire_type != IRE_LOCAL) { - mp = ip_add_info_v6(mp, NULL, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - if (ire != NULL) - ire_refrele(ire); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_flags = IP6I_ATTACH_IF; - ip6i->ip6i_ifindex = - ill->ill_phyint->phyint_ifindex; - } - if (ire != NULL) - ire_refrele(ire); - } if (!mctl_present) { /* @@ -529,7 +469,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_NEIGHBOR_ADVERT: @@ -538,7 +478,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp, dl_mp); + ndp_input(inill, mp, dl_mp); return; case ND_REDIRECT: { @@ -579,7 +519,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, } if (interested) { icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, - mctl_present, zoneid); + inill, mctl_present, zoneid); } else { freemsg(first_mp); } @@ -592,7 +532,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, */ /* ARGSUSED */ static void -icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, +icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, boolean_t mctl_present, zoneid_t zoneid) { ip6_t *ip6h; @@ -658,11 +598,10 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, * sufficient. Same link local addresses for different ILL's is * possible. 
*/ - if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) { first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); if (first_ire == NULL) { if (ip_debug > 2) { @@ -773,7 +712,7 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, } rw_exit(&irb->irb_lock); } - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, + icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill, mctl_present, zoneid); } @@ -783,7 +722,8 @@ icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, */ void icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - icmp6_t *icmp6, ill_t *ill, boolean_t mctl_present, zoneid_t zoneid) + icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present, + zoneid_t zoneid) { uint16_t *up; /* Pointer to ports in ULP header */ uint32_t ports; /* reversed ports for fanout */ @@ -861,7 +801,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, ill, + ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -908,7 +848,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, up = (uint16_t *)((uchar_t *)ip6h + hdr_length); ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, 0, + ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0, mctl_present, IP6_NO_IPPOLICY, zoneid); return; case IPPROTO_ESP: @@ -940,7 +880,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ASSERT(ill != NULL); ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; first_mp->b_cont->b_datap->db_type = M_CTL; } else { /* @@ -970,7 +911,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, mp->b_datap->db_type = M_CTL; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = + inill->ill_phyint->phyint_ifindex; } if (!ipsec_loaded(ipss)) { @@ -985,7 +927,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, if (ipsec_rc == IPSEC_STATUS_FAILED) return; - ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); return; } case IPPROTO_ENCAP: @@ -1083,8 +1025,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * doing here. */ icmp_inbound_error_fanout_v6(q, first_mp, - (ip6_t *)mp->b_rptr, icmp6, ill, mctl_present, - zoneid); + (ip6_t *)mp->b_rptr, icmp6, ill, inill, + mctl_present, zoneid); return; } /* FALLTHRU */ @@ -1096,7 +1038,7 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, rip6h.ip6_src = ip6h->ip6_dst; rip6h.ip6_dst = ip6h->ip6_src; rip6h.ip6_nxt = nexthdr; - ip_fanout_proto_v6(q, first_mp, &rip6h, ill, ill, nexthdr, 0, + ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0, IP6_NO_IPPOLICY, mctl_present, zoneid); return; } @@ -1194,9 +1136,8 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * redirect packet.) 
*/ - prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, - ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL_GROUP | - MATCH_IRE_DEFAULT, ipst); + prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES, + NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst); /* * Check that @@ -1260,6 +1201,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* don't match across illgrp */ (uchar_t *)&opt[1], /* Link layer address */ gateway, &ipv6_all_ones, /* prefix mask */ @@ -1367,8 +1309,7 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) */ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, ire->ire_ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), - ipst); + (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); ire_refrele(ire); /* Held in ire_add_v6 */ @@ -1457,15 +1398,11 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); return (NULL); } - /* - * Does not matter whether we use ire_stq or ire_ipif here. - * Just pick an ill for ICMP replies. - */ ASSERT(ire->ire_ipif != NULL); ill = ire->ire_ipif->ipif_ill; ire_refrele(ire); } - ipif = ipif_select_source_v6(ill, origsrc, RESTRICT_TO_NONE, + ipif = ipif_select_source_v6(ill, origsrc, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif != NULL) { *src = ipif->ipif_v6src_addr; @@ -1858,7 +1795,7 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst); if (mp == NULL) return; - nce = ndp_lookup_v6(ill, targetp, B_FALSE); + nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE); if (nce != NULL && nce->nce_state != ND_INCOMPLETE) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; @@ -1908,31 +1845,8 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; /* ipif_v6src_addr contains the link-local source address */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) { - /* - * The receiver of the redirect will verify whether it - * had a route through us (srcp that we will use in - * the redirect) or not. As we load spread even link-locals, - * we don't know which source address the receiver of - * redirect has in its route for communicating with us. - * Thus we randomly choose a source here and finally we - * should get to the right one and it will eventually - * accept the redirect from us. We can't call - * ip_lookup_scope_v6 because we don't have the right - * link-local address here. Thus we randomly choose one. - */ - int cnt = ill->ill_group->illgrp_ill_count; + srcp = &ill->ill_ipif->ipif_v6src_addr; - ill = ill->ill_group->illgrp_ill; - cnt = ++ipst->ips_icmp_redirect_v6_src_index % cnt; - while (cnt--) - ill = ill->ill_group_next; - srcp = &ill->ill_ipif->ipif_v6src_addr; - } else { - srcp = &ill->ill_ipif->ipif_v6src_addr; - } - rw_exit(&ipst->ips_ill_g_lock); /* Redirects sent by router, and router is global zone */ icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst); kmem_free(buf, len); @@ -2231,6 +2145,7 @@ ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp, if (version_changed) { ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); } + /* * Pass the IPSEC headers size in ire_ipsec_overhead. 
* We can't do this in ip_bind_insert_ire because the policy @@ -2771,8 +2686,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, } if (ip6_asp_can_lookup(ipst)) { src_ipif = ipif_select_source_v6(dst_ill, - v6dst, RESTRICT_TO_NONE, - connp->conn_src_preferences, zoneid); + v6dst, B_FALSE, connp->conn_src_preferences, + zoneid); ip6_asp_table_refrele(ipst); if (src_ipif == NULL) { pr_addr_dbg("ip_bind_connected_v6: " @@ -3111,7 +3026,15 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) ip6i->ip6i_nxt = IPPROTO_RAW; if (ill != NULL) { ip6i->ip6i_flags = IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; + /* + * If `ill' is in an IPMP group, make sure we use the IPMP + * interface index so that e.g. IPV6_RECVPKTINFO will get the + * IPMP interface index and not an underlying interface index. + */ + if (IS_UNDER_IPMP(ill)) + ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill); + else + ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; } else { ip6i->ip6i_flags = 0; } @@ -4257,33 +4180,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) } /* - * Select an ill for the packet by considering load spreading across - * a different ill in the group if dst_ill is part of some group. - */ -static ill_t * -ip_newroute_get_dst_ill_v6(ill_t *dst_ill) -{ - ill_t *ill; - - /* - * We schedule irrespective of whether the source address is - * INADDR_UNSPECIED or not. - */ - ill = illgrp_scheduler(dst_ill); - if (ill == NULL) - return (NULL); - - /* - * For groups with names ip_sioctl_groupname ensures that all - * ills are of same type. For groups without names, ifgrp_insert - * ensures this. - */ - ASSERT(dst_ill->ill_type == ill->ill_type); - - return (ill); -} - -/* * IPv6 - * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need * to send out a packet to a destination address for which we do not have @@ -4303,14 +4199,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * node sits at a site boundary). * We create the cache entries in the regular ctable since * it can not "confuse" things for other destinations. - * table. - * - * When ill is part of a ill group, we subject the packets - * to load spreading even if the ill is specified by the - * means described above. We disable only for IPV6_BOUND_PIF - * and for the cases where IP6I_ATTACH_IF is set i.e NS/NA/ - * Echo replies to link-local destinations have IP6I_ATTACH_IF - * set. * * NOTE : These are the scopes of some of the variables that point at IRE, * which needs to be followed while making any future modifications @@ -4327,8 +4215,6 @@ ip_newroute_get_dst_ill_v6(ill_t *dst_ill) * * Thus on failures, we have to REFRELE only ire and sire, if they * are not NULL. - * - * v6srcp may be used in the future. Currently unused. 
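The ip_add_info_v6() hunk above chooses which interface index to report to applications. A sketch of that rule in isolation, assuming IS_UNDER_IPMP() and ipmp_ill_get_ipmp_ifindex() behave as they are used there; the wrapper name is illustrative.

/*
 * Illustrative sketch: consumers such as IPV6_RECVPKTINFO should see the
 * IPMP meta-interface index, never an underlying interface index.
 */
static uint_t
reported_ifindex(ill_t *ill)
{
	if (IS_UNDER_IPMP(ill))
		return (ipmp_ill_get_ipmp_ifindex(ill));
	return (ill->ill_phyint->phyint_ifindex);
}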
*/ /* ARGSUSED */ void @@ -4346,10 +4232,8 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, int err = 0; mblk_t *first_mp; ipsec_out_t *io; - ill_t *attach_ill = NULL; ushort_t ire_marks = 0; int match_flags; - boolean_t ip6i_present; ire_t *first_sire = NULL; mblk_t *copy_mp = NULL; mblk_t *xmit_mp = NULL; @@ -4359,7 +4243,6 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, boolean_t multirt_is_resolvable; boolean_t multirt_resolve_next; boolean_t need_rele = B_FALSE; - boolean_t do_attach_ill = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; tsol_ire_gw_secattr_t *attrp = NULL; tsol_gcgrp_t *gcgrp = NULL; @@ -4376,39 +4259,12 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, io = NULL; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill and - * bind_to_nofailover B_TRUE. We can't use conn to determine as it - * could be NULL. - * - * This information can appear either in an ip6i_t or an IPSEC_OUT - * message. - */ ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } if (IN6_IS_ADDR_LOOPBACK(v6dstp)) { ip1dbg(("ip_newroute_v6: dst with loopback addr\n")); goto icmp_err_ret; - } else if ((v6srcp != NULL) && IN6_IS_ADDR_LOOPBACK(v6srcp)) { + } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) { ip1dbg(("ip_newroute_v6: src with loopback addr\n")); goto icmp_err_ret; } @@ -4436,30 +4292,24 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); + } else { + match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | + MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; + match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + /* - * ire_add_then_send -> ip_newroute_v6 in the CGTP case passes - * in a NULL ill, but the packet could be a neighbor - * solicitation/advertisment and could have a valid attach_ill. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to an underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, use the packet's source address to + * determine whether the traffic is test traffic, and set + * MATCH_IRE_MARK_TESTHIDDEN if so. */ - if (attach_ill != NULL) - ill_refrele(attach_ill); - } else { - if (attach_ill != NULL) { - /* - * attach_ill is set only for communicating with - * on-link hosts. So, don't look for DEFAULT. - * ip_wput_v6 passes the right ill in this case and - * hence we can assert. 
- */ - ASSERT(ill == attach_ill); - ill_refrele(attach_ill); - do_attach_ill = B_TRUE; - match_flags = MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; - } else { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL_GROUP; + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { + if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; } - match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; + ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif, &sire, zoneid, 0, MBLK_GETLABEL(mp), match_flags, ipst); } @@ -4601,106 +4451,56 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, } /* - * We have a route to reach the destination. - * - * 1) If the interface is part of ill group, try to get a new - * ill taking load spreading into account. + * We have a route to reach the destination. Find the + * appropriate ill, then get a source address that matches the + * right scope via ipif_select_source_v6(). * - * 2) After selecting the ill, get a source address that might - * create good inbound load spreading and that matches the - * right scope. ipif_select_source_v6 does this for us. + * If we are here trying to create an IRE_CACHE for an offlink + * destination and have an IRE_CACHE entry for VNI, then use + * ire_stq instead since VNI's queue is a black hole. * - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. bind to IPIF_NOFAILOVER address, - * IPV6_BOUND_PIF we don't try to use a different ill for load - * spreading. + * Note: While we pick a dst_ill we are really only interested + * in the ill for load spreading. The source ipif is + * determined by source address selection below. */ - if (!do_attach_ill) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among - * peers in an interface group. However, in the case - * of multirouting, load spreading is not used, as we - * actually want to replicate outgoing packets through - * particular interfaces. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. - * The source ipif is determined by source address - * selection below. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if ((ire->ire_type == IRE_CACHE) && + IS_VNI(ire->ire_ipif->ipif_ill)) { + dst_ill = ire->ire_stq->q_ptr; + ill_refhold(dst_ill); + } else { + ill_t *ill = ire->ire_ipif->ipif_ill; + + if (IS_IPMP(ill)) { + dst_ill = + ipmp_illgrp_hold_next_ill(ill->ill_grp); } else { - /* - * If we are here trying to create an IRE_CACHE - * for an offlink destination and have the - * IRE_CACHE for the next hop and the latter is - * using virtual IP source address selection i.e - * it's ire->ire_ipif is pointing to a virtual - * network interface (vni) then - * ip_newroute_get_dst_ll() will return the vni - * interface as the dst_ill. Since the vni is - * virtual i.e not associated with any physical - * interface, it cannot be the dst_ill, hence - * in such a case call ip_newroute_get_dst_ll() - * with the stq_ill instead of the ire_ipif ILL. - * The function returns a refheld ill. 
- */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_stq->q_ptr); - else - dst_ill = ip_newroute_get_dst_ill_v6( - ire->ire_ipif->ipif_ill); + dst_ill = ill; + ill_refhold(dst_ill); } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_v6 : no dst " - "ill for dst %s\n", - AF_INET6, v6dstp); - } - goto icmp_err_ret; - } else if (dst_ill->ill_group == NULL && ill != NULL && - dst_ill != ill) { - /* - * If "ill" is not part of any group, we should - * have found a route matching "ill" as we - * called ire_ftable_lookup_v6 with - * MATCH_IRE_ILL_GROUP. - * Rather than asserting when there is a - * mismatch, we just drop the packet. - */ - ip0dbg(("ip_newroute_v6: BOUND_IF failed : " - "dst_ill %s ill %s\n", - dst_ill->ill_name, - ill->ill_name)); - goto icmp_err_ret; + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_v6 : no dst " + "ill for dst %s\n", AF_INET6, v6dstp); } - } else { - dst_ill = ire->ire_ipif->ipif_ill; - /* For uniformity do refhold */ - ill_refhold(dst_ill); + goto icmp_err_ret; + } + + if (ill != NULL && dst_ill != ill && + !IS_IN_SAME_ILLGRP(dst_ill, ill)) { /* - * We should have found a route matching ill as we - * called ire_ftable_lookup_v6 with MATCH_IRE_ILL. - * Rather than asserting, while there is a mismatch, - * we just drop the packet. + * We should have found a route matching "ill" + * as we called ire_ftable_lookup_v6 with + * MATCH_IRE_ILL. Rather than asserting when + * there is a mismatch, we just drop the packet. */ - if (dst_ill != ill) { - ip0dbg(("ip_newroute_v6: Packet dropped as " - "IP6I_ATTACH_IF ill is %s, " - "ire->ire_ipif->ipif_ill is %s\n", - ill->ill_name, - dst_ill->ill_name)); - goto icmp_err_ret; - } + ip0dbg(("ip_newroute_v6: BOUND_IF failed: " + "dst_ill %s ill %s\n", dst_ill->ill_name, + ill->ill_name)); + goto icmp_err_ret; } + /* * Pick a source address which matches the scope of the * destination address. @@ -4708,7 +4508,20 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, * parent ire (sire). */ ASSERT(src_ipif == NULL); - if (ire->ire_type == IRE_IF_RESOLVER && + + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. 
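The test-address check described above appears twice in ip_newroute_v6(); a sketch that pulls it into one place, assuming ipif_lookup_testaddr_v6() as declared earlier in this file. The wrapper name is illustrative.

/*
 * Illustrative sketch: on an ill under IPMP, a packet whose source is one
 * of the ill's test addresses is in.mpathd probe traffic, and the IREs
 * built for it must be hidden from data traffic (IRE_MARK_TESTHIDDEN /
 * MATCH_IRE_MARK_TESTHIDDEN).
 */
static boolean_t
is_probe_traffic(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **src_ipifp)
{
	if (!IS_UNDER_IPMP(ill) || IN6_IS_ADDR_UNSPECIFIED(v6srcp))
		return (B_FALSE);
	return (ipif_lookup_testaddr_v6(ill, v6srcp, src_ipifp));
}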
+ */ + if (ill != NULL && IS_UNDER_IPMP(ill)) + (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER && !IN6_IS_ADDR_UNSPECIFIED(&v6gw) && ip6_asp_can_lookup(ipst)) { /* @@ -4718,10 +4531,10 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, */ ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, &v6gw, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; - } else { + } else if (src_ipif == NULL) { if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested @@ -4732,14 +4545,9 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, NULL, NULL, NULL, NULL, ipst); } if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, - v6dstp, restrict_ill, + v6dstp, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) ire_marks |= IRE_MARK_USESRC_CHECK; @@ -4750,7 +4558,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_v6: no src for " - "dst %s\n, ", AF_INET6, v6dstp); + "dst %s\n", AF_INET6, v6dstp); printf("ip_newroute_v6: interface name %s\n", dst_ill->ill_name); } @@ -4837,14 +4645,7 @@ ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, "ire_ihandle_lookup_offlink_v6 failed\n")); goto icmp_err_ret; } - /* - * Assume DL_UNITDATA_REQ is same for all physical - * interfaces in the ifgrp. If it isn't, this code will - * have to be seriously rewhacked to allow the - * fastpath probing (such that I cache the link - * header in the IRE_CACHE) to work over ifgrps. - * We have what we need to build an IRE_CACHE. - */ + /* * Note: the new ire inherits RTF_SETSRC * and RTF_MULTIRT to propagate these flags from prefix @@ -5659,24 +5460,22 @@ icmp_err_ret: */ void ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, - in6_addr_t v6dst, int unspec_src, zoneid_t zoneid) + const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src, + zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *src_ipif = NULL; int err = 0; ill_t *dst_ill = NULL; ire_t *save_ire; - ushort_t ire_marks = 0; ipsec_out_t *io; - ill_t *attach_ill = NULL; ill_t *ill; - ip6_t *ip6h; mblk_t *first_mp; - boolean_t ip6i_present; ire_t *fire = NULL; mblk_t *copy_mp = NULL; + const in6_addr_t *ire_v6srcp; + boolean_t probe = B_FALSE; boolean_t multirt_resolve_next; - in6_addr_t *v6dstp = &v6dst; boolean_t ipif_held = B_FALSE; boolean_t ill_held = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; @@ -5728,35 +5527,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (!(ill->ill_flags & ILLF_MULTICAST)) { goto err_ret; } - /* - * If this end point is bound to IPIF_NOFAILOVER, set bnf_ill - * and bind_to_nofailover B_TRUE. We can't use conn to determine - * as it could be NULL. - * - * This information can appear either in an ip6i_t or an - * IPSEC_OUT message. - */ - ip6h = (ip6_t *)mp->b_rptr; - ip6i_present = (ip6h->ip6_nxt == IPPROTO_RAW); - if (ip6i_present || (io != NULL && io->ipsec_out_attach_if)) { - if (!ip6i_present || - ((ip6i_t *)ip6h)->ip6i_flags & IP6I_ATTACH_IF) { - attach_ill = ip_grab_attach_ill(ill, first_mp, - (ip6i_present ? 
- ((ip6i_t *)ip6h)->ip6i_ifindex : - io->ipsec_out_ill_index), B_TRUE, ipst); - /* Failure case frees things for us. */ - if (attach_ill == NULL) - return; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(attach_ill)) - ire_marks = IRE_MARK_HIDDEN; - } - } /* * We check if an IRE_OFFSUBNET for the addr that goes through @@ -5770,76 +5540,93 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))), (void *)fire)); + ASSERT(src_ipif == NULL); + /* - * If the application specified the ill (ifindex), we still - * load spread. Only if the packets needs to go out specifically - * on a given ill e.g. binding to IPIF_NOFAILOVER address or - * IPV6_BOUND_PIF, or there is a parent ire entry that specified - * multirouting, then we don't try to use a different ill for - * load spreading. + * Because nce_xmit() calls ip_output_v6() and NCEs are always + * tied to the underlying interface, IS_UNDER_IPMP() may be + * true even when building IREs that will be used for data + * traffic. As such, see if the packet's source address is a + * test address, and if so use that test address's ipif for + * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in + * ire_add_v6() can work properly. + */ + if (IS_UNDER_IPMP(ill)) + probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); + + /* + * Determine the outbound (destination) ill for this route. + * If IPMP is not in use, that's the same as our ill. If IPMP + * is in-use and we're on the IPMP interface, or we're on an + * underlying ill but sending data traffic, use a suitable + * destination ill from the group. The latter case covers a + * subtle edge condition with multicast: when we bring up an + * IPv6 data address, we will create an NCE on an underlying + * interface, and send solitications to ff02::1, which would + * take us through here, and cause us to create an IRE for + * ff02::1. To meet our defined semantics for multicast (and + * ensure there aren't unexpected echoes), that IRE needs to + * use the IPMP group's nominated multicast interface. + * + * Note: the source ipif is determined by source address + * selection later. */ - if (attach_ill == NULL) { - /* - * If the interface belongs to an interface group, - * make sure the next possible interface in the group - * is used. This encourages load spreading among peers - * in an interface group. - * - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. The source - * ipif is determined by source address selection below. 
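For the "suitable destination ill from the group" step described above, the new code asks ipmp_illgrp_hold_cast_ill() for multicast and ipmp_illgrp_hold_next_ill() for unicast data. The selection policy itself is small; the sketch below captures it with an invented ipmp_group_t (the array, the rotor field and the names are assumptions for illustration, not the kernel's ipmp_illgrp_t).

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical IPMP group: N active ills plus one nominated cast ill. */
typedef struct ipmp_group {
    const char *grp_ill[8];    /* active underlying interfaces */
    size_t grp_nill;
    size_t grp_rotor;          /* next ill to hand out */
    const char *grp_cast_ill;  /* nominated multicast interface */
} ipmp_group_t;

/*
 * Pick the interface a new route should transmit on: multicast always
 * uses the group's nominated cast ill (so there are no unexpected
 * echoes), while unicast data traffic is spread round-robin across the
 * active interfaces.
 */
static const char *
group_pick_dst_ill(ipmp_group_t *grp, bool is_multicast)
{
    const char *ill;

    if (is_multicast)
        return (grp->grp_cast_ill);
    if (grp->grp_nill == 0)
        return (NULL);
    ill = grp->grp_ill[grp->grp_rotor];
    grp->grp_rotor = (grp->grp_rotor + 1) % grp->grp_nill;
    return (ill);
}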
- */ - if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) { - dst_ill = ipif->ipif_ill; - /* For uniformity do a refhold */ - ill_refhold(dst_ill); + if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) { + ill_t *ipmp_ill; + ipmp_illgrp_t *illg; + + if (IS_UNDER_IPMP(ill)) { + ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); } else { - /* refheld by ip_newroute_get_dst_ill_v6 */ - dst_ill = - ip_newroute_get_dst_ill_v6(ipif->ipif_ill); + ipmp_ill = ill; + ill_refhold(ipmp_ill); /* for symmetry */ } - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif_v6: " - "no dst ill for dst %s\n", - AF_INET6, v6dstp); - } + + if (ipmp_ill == NULL) goto err_ret; - } + + illg = ipmp_ill->ill_grp; + if (IN6_IS_ADDR_MULTICAST(v6dstp)) + dst_ill = ipmp_illgrp_hold_cast_ill(illg); + else + dst_ill = ipmp_illgrp_hold_next_ill(illg); + + ill_refrele(ipmp_ill); } else { - dst_ill = ipif->ipif_ill; - /* - * ip_wput_v6 passes the right ipif for IPIF_NOFAILOVER - * and IPV6_BOUND_PIF case. - */ - ASSERT(dst_ill == attach_ill); - /* attach_ill is already refheld */ + dst_ill = ill; + ill_refhold(dst_ill); /* for symmetry */ + } + + if (dst_ill == NULL) { + if (ip_debug > 2) { + pr_addr_dbg("ip_newroute_ipif_v6: " + "no dst ill for dst %s\n", + AF_INET6, v6dstp); + } + goto err_ret; } + /* * Pick a source address which matches the scope of the * destination address. * For RTF_SETSRC routes, the source address is imposed by the * parent ire (fire). */ - ASSERT(src_ipif == NULL); - if ((fire != NULL) && (fire->ire_flags & RTF_SETSRC)) { + + if (src_ipif == NULL && fire != NULL && + (fire->ire_flags & RTF_SETSRC)) { /* * Check that the ipif matching the requested source * address still exists. */ - src_ipif = - ipif_lookup_addr_v6(&fire->ire_src_addr_v6, + src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6, NULL, zoneid, NULL, NULL, NULL, NULL, ipst); } - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - uint_t restrict_ill = RESTRICT_TO_NONE; - if (ip6i_present && ((ip6i_t *)ip6h)->ip6i_flags - & IP6I_ATTACH_IF) - restrict_ill = RESTRICT_TO_ILL; + if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(dst_ill, v6dstp, - restrict_ill, IPV6_PREFER_SRC_DEFAULT, zoneid); + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) { @@ -5847,16 +5634,20 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n,", + "no src for dst %s\n", AF_INET6, v6dstp); printf(" through interface %s\n", dst_ill->ill_name); } goto err_ret; } + ire_v6srcp = &ipv6_all_zeros; src_ipif = ipif; ipif_refhold(src_ipif); + } else { + ire_v6srcp = &src_ipif->ipif_v6src_addr; } + ire = ipif_to_ire_v6(ipif); if (ire == NULL) { if (ip_debug > 2) { @@ -5903,7 +5694,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, } } - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); switch (ire->ire_type) { case IRE_IF_NORESOLVER: { /* @@ -5921,7 +5711,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* no src nce */ @@ -5946,8 +5736,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - err = ndp_noresolver(dst_ill, v6dstp); if (err != 0) { ire_refrele(save_ire); @@ -6051,7 
+5839,7 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, ire = ire_create_v6( v6dstp, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ + ire_v6srcp, /* source address */ NULL, /* gateway address */ &save_ire->ire_max_frag, NULL, /* src nce */ @@ -6076,8 +5864,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, break; } - ire->ire_marks |= ire_marks; - /* Resolve and add ire to the ctable */ err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid); switch (err) { @@ -6273,8 +6059,8 @@ err_ret: ipif_refrele(ipif); if (src_ipif != NULL) ipif_refrele(src_ipif); + /* Multicast - no point in trying to generate ICMP error */ - ASSERT((attach_ill == NULL) || (dst_ill == attach_ill)); if (dst_ill != NULL) { ill = dst_ill; ill_held = B_TRUE; @@ -6499,7 +6285,7 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, &ip6h->ip6_dst)) { ipif = ipif_select_source_v6( ill, &ip6h->ip6_src, - RESTRICT_TO_GROUP, + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (ipif != NULL) { @@ -7050,7 +6836,7 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) */ static boolean_t ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, - ill_t *ill, mblk_t *hada_mp, zoneid_t zoneid) + ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid) { mblk_t *mp; uint8_t nexthdr; @@ -7093,7 +6879,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, */ ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = ii->ipsec_in_ill_index; + ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; } /* @@ -7122,7 +6908,7 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, switch (ipsec_rc) { case IPSEC_STATUS_SUCCESS: /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, ill, NULL); + ip_fanout_proto_again(first_mp, ill, inill, NULL); break; case IPSEC_STATUS_FAILED: BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); @@ -7225,7 +7011,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_hbh_t *hbhhdr; boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); conn_t *connp; - ilm_t *ilm; uint32_t ports; zoneid_t zoneid = GLOBAL_ZONEID; uint16_t hck_flags, reass_hck_flags; @@ -7347,10 +7132,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, /* * XXX TODO Give to mrouted to for multicast forwarding. */ - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm == NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) == NULL) { if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ip_rput_data_v6: got mcast packet" @@ -7405,7 +7188,7 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL, IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); } else { ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ALL_ZONES, MBLK_GETLABEL(mp), ipst); @@ -7466,9 +7249,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, } /* we have a matching IRE */ if (ire->ire_stq != NULL) { - ill_group_t *ill_group; - ill_group_t *ire_group; - /* * To be quicker, we may wish not to chase pointers * (ire->ire_ipif->ipif_ill...) 
and instead store the @@ -7483,7 +7263,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, no_forward = ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0); - ASSERT(first_mp == mp); /* * This ire has a send-to queue - forward the packet. @@ -7568,10 +7347,8 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * we're forwarding onto the same link), conditionally send * a redirect message. */ - ill_group = ill->ill_group; - ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; - if (ire->ire_rfq != q && (ill_group == NULL || - ill_group != ire_group)) { + if (ire->ire_rfq != q && + !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) || IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { BUMP_MIB(ill->ill_ip_mib, @@ -8006,7 +7783,10 @@ tcp_fanout: * where there is no conn. */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ASSERT(!IS_LOOPBACK((ill))); + ilm_t *ilm; + ilm_walker_t ilw; + + ASSERT(!IS_LOOPBACK(ill)); /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -8015,32 +7795,32 @@ tcp_fanout: * structures (ilm) on the receive ill and send * a copy of the packet up each matching one. */ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, inill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, inill, hdr_len, mctl_present, 0, ilm->ilm_zoneid, dl_mp); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) icmp_inbound_v6(q, first_mp1, ill, - hdr_len, mctl_present, 0, zoneid, - dl_mp); + inill, hdr_len, mctl_present, 0, + zoneid, dl_mp); } } /* FALLTHRU */ @@ -8082,7 +7862,7 @@ tcp_fanout: /* Check if AH is present. */ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("dst early hada drop\n")); return; } @@ -8206,7 +7986,7 @@ tcp_fanout: /* Restore the flags */ DB_CKSUMFLAGS(mp) = hck_flags; - mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr, + mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr, remlen - used, &prev_nexthdr_offset, &reass_sum, &reass_hck_flags); if (mp == NULL) { @@ -8249,7 +8029,7 @@ tcp_fanout: /* Check if AH is present. */ if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - hada_mp, zoneid)) { + inill, hada_mp, zoneid)) { ip0dbg(("routing hada drop\n")); return; } @@ -8322,7 +8102,7 @@ tcp_fanout: ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; ii->ipsec_in_rill_index = - ii->ipsec_in_ill_index; + inill->ill_phyint->phyint_ifindex; first_mp->b_cont = mp; /* * Cache hardware acceleration info. @@ -8480,11 +8260,10 @@ hada_drop: * nexthdr field when reassembly completes. 
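Both multicast fanout paths in this file move from the old ILM_WALKER_HOLD/ILM_WALKER_RELE loop over ill_ilm to the ilm_walker_start()/ilm_walker_step()/ilm_walker_finish() triple, which hides the "skip deleted entries" and cross-ill details behind an iterator. The shape of such an iterator, reduced to a plain linked list, is sketched below; mc_entry_t, mc_walker_t and the function names are invented for the example and are not the kernel's ilm API.

#include <stdbool.h>
#include <stddef.h>

typedef struct mc_entry {
    struct mc_entry *mce_next;
    bool mce_deleted;    /* logically removed; skip it */
    int mce_group;       /* stand-in for the multicast address */
} mc_entry_t;

typedef struct mc_walker {
    mc_entry_t *mcw_next;    /* next candidate to visit */
} mc_walker_t;

/* Advance to the next entry that is still live. */
static mc_entry_t *
mc_walker_step(mc_walker_t *w)
{
    mc_entry_t *e;

    while ((e = w->mcw_next) != NULL) {
        w->mcw_next = e->mce_next;
        if (!e->mce_deleted)
            return (e);
    }
    return (NULL);
}

/* Begin a walk; the kernel also takes its locks and references here. */
static mc_entry_t *
mc_walker_start(mc_walker_t *w, mc_entry_t *head)
{
    w->mcw_next = head;
    return (mc_walker_step(w));
}

/* End the walk; the kernel drops its locks and references here. */
static void
mc_walker_finish(mc_walker_t *w)
{
    w->mcw_next = NULL;
}

/*
 * Typical use, mirroring the fanout loops in this patch:
 *
 *    for (e = mc_walker_start(&w, list); e != NULL;
 *        e = mc_walker_step(&w))
 *        deliver_copy(e);
 *    mc_walker_finish(&w);
 */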
*/ static mblk_t * -ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, +ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset, uint32_t *cksum_val, uint16_t *cksum_flags) { - ill_t *ill = (ill_t *)q->q_ptr; uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; boolean_t more_frags; @@ -8518,8 +8297,8 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. */ - ASSERT(ill != NULL); - if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + ASSERT(inill != NULL); + if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -8581,7 +8360,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -8607,7 +8386,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, freemsg(mp); return (NULL); } - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, + icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); return (NULL); @@ -9204,16 +8983,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) * The routine can handle an ICMPv6 header that is not in the first mblk. * * The order to determine the outgoing interface is as follows: - * 1. IPV6_BOUND_PIF is set, use that ill (conn_outgoing_pill) - * 2. If conn_nofailover_ill is set then use that ill. - * 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. - * 4. If q is an ill queue and (link local or multicast destination) then + * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. + * 2. If q is an ill queue and (link local or multicast destination) then * use that ill. - * 5. If IPV6_BOUND_IF has been set use that ill. - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise + * 3. If IPV6_BOUND_IF has been set use that ill. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise * look for the best IRE match for the unspecified group to determine * the ill. - * 7. For unicast: Just do an IRE lookup for the best match. + * 5. For unicast: Just do an IRE lookup for the best match. * * arg2 is always a queue_t *. * When that queue is an ill_t (i.e. q_next != NULL), then arg must be @@ -9238,12 +9015,10 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) int unspec_src; boolean_t do_outrequests; /* Increment OutRequests? */ mib2_ipIfStatsEntry_t *mibptr; - int match_flags = MATCH_IRE_ILL_GROUP; - boolean_t attach_if = B_FALSE; + int match_flags = MATCH_IRE_ILL; mblk_t *first_mp; boolean_t mctl_present; ipsec_out_t *io; - boolean_t drop_if_delayed = B_FALSE; boolean_t multirt_need_resolve = B_FALSE; mblk_t *copy_mp = NULL; int err = 0; @@ -9574,16 +9349,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) */ mp->b_rptr = (uchar_t *)ip6h; - /* - * IP6I_ATTACH_IF is set in this function when we had a - * conn and it was either bound to the IPFF_NOFAILOVER address - * or IPV6_BOUND_PIF was set. These options override other - * options that set the ifindex. 
We come here with - * IP6I_ATTACH_IF set when we can't find the ire and - * ip_newroute_v6 is feeding the packet for second time. - */ - if ((ip6i->ip6i_flags & IP6I_IFINDEX) || - (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { + if (ip6i->ip6i_flags & IP6I_IFINDEX) { ASSERT(ip6i->ip6i_ifindex != 0); if (ill != NULL) ill_refrele(ill); @@ -9603,33 +9369,13 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) return; } mibptr = ill->ill_ip_mib; - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - /* - * Preserve the index so that when we return - * from IPSEC processing, we know where to - * send the packet. - */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_ill_index = - ip6i->ip6i_ifindex; - } - } - if (ip6i->ip6i_flags & IP6I_ATTACH_IF) { - /* - * This is a multipathing probe packet that has - * been delayed in ND resolution. Drop the - * packet for the reasons mentioned in - * nce_queue_mp() - */ - if ((ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) && - (ip6i->ip6i_flags & IP6I_ND_DELAYED)) { - freemsg(first_mp); - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } + /* + * Preserve the index so that when we return from + * IPSEC processing, we know where to send the packet. + */ + if (mctl_present) { + ASSERT(io != NULL); + io->ipsec_out_ill_index = ip6i->ip6i_ifindex; } } if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) { @@ -9698,114 +9444,20 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) if (IN6_IS_ADDR_MULTICAST(v6dstp)) goto ipv6multicast; - /* 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings. */ - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - ill_t *conn_outgoing_pill; - - conn_outgoing_pill = conn_get_held_ill(connp, - &connp->conn_outgoing_pill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_outgoing_pill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* 2. If ipc_nofailover_ill is set then use that ill. */ - if (connp != NULL && connp->conn_nofailover_ill != NULL) { - ill_t *conn_nofailover_ill; - - conn_nofailover_ill = conn_get_held_ill(connp, - &connp->conn_nofailover_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (conn_nofailover_ill != NULL) { - if (ill != NULL) - ill_refrele(ill); - ill = conn_nofailover_ill; - attach_if = B_TRUE; - /* - * Assumes that ipc_nofailover_ill is used only for - * multipathing probe packets. These packets are better - * dropped, if they are delayed in ND resolution, for - * the reasons described in nce_queue_mp(). - * IP6I_DROP_IFDELAYED will be set later on in this - * function for this packet. - */ - drop_if_delayed = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - } - - /* - * Redo 1. 
If we did not find an IRE_CACHE the first time, we should - * have an ip6i_t with IP6I_ATTACH_IF if IPV6_BOUND_PIF or - * bind to the IPIF_NOFAILOVER address was used on this endpoint. - */ - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - goto send_from_ill; - } - - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { ASSERT(ill != NULL); goto send_from_ill; } /* - * 4. If q is an ill queue and (link local or multicast destination) + * 2. If q is an ill queue and there's a link-local destination * then use that ill. */ - if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) { + if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) goto send_from_ill; - } - /* 5. If IPV6_BOUND_IF has been set use that ill. */ + /* 3. If IPV6_BOUND_IF has been set use that ill. */ if (connp != NULL && connp->conn_outgoing_ill != NULL) { ill_t *conn_outgoing_ill; @@ -9827,7 +9479,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } /* - * 6. For unicast: Just do an IRE lookup for the best match. + * 4. For unicast: Just do an IRE lookup for the best match. * If we get here for a link-local address it is rather random * what interface we pick on a multihomed host. * *If* there is an IRE_CACHE (and the link-local address @@ -9913,7 +9565,6 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } BUMP_MIB(mibptr, ipIfStatsHCOutRequests); } - ASSERT(!attach_if); /* * Check if the ire has the RTF_MULTIRT flag, inherited @@ -9966,7 +9617,7 @@ ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) } } ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, 0, ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); if (need_decref) { CONN_DEC_REF(connp); connp = NULL; @@ -10086,9 +9737,6 @@ ipv6multicast: ip2dbg(("ip_wput_v6: multicast\n")); /* - * 1. IPV6_BOUND_PIF takes precedence over all the ifindex settings - * 2. If conn_nofailover_ill is set then use that ill. - * * Hold the conn_lock till we refhold the ill of interest that is * pointed to from the conn. Since we cannot do an ill/ipif_refrele * while holding any locks, postpone the refrele until after the @@ -10100,79 +9748,12 @@ ipv6multicast: } else { conn_lock_held = B_FALSE; } - if (connp != NULL && connp->conn_outgoing_pill != NULL) { - err = ill_check_and_refhold(connp->conn_outgoing_pill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_outgoing_pill no ipif\n")); -multicast_discard: - ASSERT(saved_ill == NULL); - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - if (ill != NULL) - ill_refrele(ill); - freemsg(first_mp); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - saved_ill = ill; - ill = connp->conn_outgoing_pill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - mibptr = ill->ill_ip_mib; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. 
- */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (connp != NULL && connp->conn_nofailover_ill != NULL) { - err = ill_check_and_refhold(connp->conn_nofailover_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_nofailover_ill no ipif\n")); - goto multicast_discard; - } - saved_ill = ill; - ill = connp->conn_nofailover_ill; - attach_if = B_TRUE; - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_ATTACH_IF)) { - /* - * Redo 1. If we did not find an IRE_CACHE the first time, - * we should have an ip6i_t with IP6I_ATTACH_IF if - * IPV6_BOUND_PIF or bind to the IPIF_NOFAILOVER address was - * used on this endpoint. - */ - ASSERT(ip6i->ip6i_ifindex != 0); - attach_if = B_TRUE; - ASSERT(ill != NULL); - match_flags = MATCH_IRE_ILL; - - /* - * Check if we need an ire that will not be - * looked up by anybody else i.e. HIDDEN. - */ - if (ill_is_probeonly(ill)) - match_flags |= MATCH_IRE_MARK_HIDDEN; - } else if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - /* 3. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - + if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { + /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ ASSERT(ill != NULL); } else if (ill != NULL) { /* - * 4. If q is an ill queue and (link local or multicast + * 2. If q is an ill queue and (link local or multicast * destination) then use that ill. * We don't need the ipif initialization here. * This useless assert below is just to prevent lint from @@ -10181,9 +9762,9 @@ multicast_discard: ASSERT(ill != NULL); } else if (connp != NULL) { /* - * 5. If IPV6_BOUND_IF has been set use that ill. + * 3. If IPV6_BOUND_IF has been set use that ill. * - * 6. For multicast: if IPV6_MULTICAST_IF has been set use it. + * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. * Otherwise look for the best IRE match for the unspecified * group to determine the ill. * @@ -10198,7 +9779,18 @@ multicast_discard: if (err == ILL_LOOKUP_FAILED) { ip1dbg(("ip_output_v6: multicast" " conn_outgoing_ill no ipif\n")); - goto multicast_discard; +multicast_discard: + ASSERT(saved_ill == NULL); + if (conn_lock_held) + mutex_exit(&connp->conn_lock); + if (ill != NULL) + ill_refrele(ill); + freemsg(first_mp); + if (do_outrequests) + BUMP_MIB(mibptr, ipIfStatsOutDiscards); + if (need_decref) + CONN_DEC_REF(connp); + return; } ill = connp->conn_outgoing_ill; } else if (connp->conn_multicast_ill != NULL) { @@ -10239,8 +9831,6 @@ multicast_discard: */ mutex_enter(&connp->conn_lock); connp->conn_multicast_ill = ill; - connp->conn_orig_multicast_ifindex = - ill->ill_phyint->phyint_ifindex; mutex_exit(&connp->conn_lock); } } @@ -10307,11 +9897,55 @@ multicast_discard: send_from_ill: ASSERT(ill != NULL); ASSERT(mibptr == ill->ill_ip_mib); + if (do_outrequests) { BUMP_MIB(mibptr, ipIfStatsHCOutRequests); do_outrequests = B_FALSE; } + /* + * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to + * an underlying interface, IS_UNDER_IPMP() may be true even when + * building IREs that will be used for data traffic. As such, use the + * packet's source address to determine whether the traffic is test + * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so. 
+ * + * Separately, we also need to mark probe packets so that ND can + * process them specially; see the comments in nce_queue_mp_common(). + */ + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && + ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) { + if (ip6i == NULL) { + if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) { + if (mctl_present) + freeb(first_mp); + goto discard; + } + + if (mctl_present) + first_mp->b_cont = mp; + else + first_mp = mp; + + /* ndp_resolver() expects a pulled-up message */ + if (MBLKL(mp) == sizeof (ip6i_t) && + pullupmsg(mp, -1) == 0) { + ip1dbg(("ip_output_v6: pullupmsg failed\n")); +discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards); + ill_refrele(ill); + if (need_decref) + CONN_DEC_REF(connp); + return; + } + ip6i = (ip6i_t *)mp->b_rptr; + ip6h = (ip6_t *)&ip6i[1]; + v6dstp = &ip6h->ip6_dst; + mp->b_rptr = (uchar_t *)ip6h; /* rewound below */ + } + ip6i->ip6i_flags |= IP6I_IPMP_PROBE; + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + } + if (io != NULL) io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; @@ -10390,9 +10024,7 @@ send_from_ill: ill->ill_name, (void *)ire, ill->ill_phyint->phyint_ifindex)); ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, - (attach_if ? ill->ill_phyint->phyint_ifindex : 0), - ip6i_flags, zoneid); + connp, caller, ip6i_flags, zoneid); ire_refrele(ire); if (need_decref) { CONN_DEC_REF(connp); @@ -10422,7 +10054,8 @@ send_from_ill: return; } ip_newroute_ipif_v6(q, copy_mp, ipif, - ip6h->ip6_dst, unspec_src, zoneid); + &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src, + zoneid); ipif_refrele(ipif); } else { ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, @@ -10440,12 +10073,11 @@ send_from_ill: /* Update rptr if there was an ip6i_t header. */ if (ip6i != NULL) mp->b_rptr -= sizeof (ip6i_t); - if (unspec_src || attach_if) { + if (unspec_src) { if (ip6i == NULL) { /* * Add ip6i_t header to carry unspec_src - * or attach_if until the packet comes back in - * ip_wput_v6. + * until the packet comes back in ip_wput_v6. */ if (mctl_present) { first_mp->b_cont = @@ -10481,28 +10113,15 @@ send_from_ill: ip6h = (ip6_t *)&ip6i[1]; v6dstp = &ip6h->ip6_dst; } - if (unspec_src) - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (attach_if) { - /* - * Bind to nofailover/BOUND_PIF overrides ifindex. - */ - ip6i->ip6i_flags |= IP6I_ATTACH_IF; - ip6i->ip6i_flags &= ~IP6I_IFINDEX; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - if (drop_if_delayed) { - /* This is a multipathing probe packet */ - ip6i->ip6i_flags |= IP6I_DROP_IFDELAYED; - } - } + ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; if (mctl_present) { ASSERT(io != NULL); io->ipsec_out_unspec_src = unspec_src; } } if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, *v6dstp, - unspec_src, zoneid); + ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp, + &ip6h->ip6_src, unspec_src, zoneid); } else { ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, zoneid, ipst); @@ -10544,14 +10163,6 @@ ip_wput_v6(queue_t *q, mblk_t *mp) ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT); } -static void -ipsec_out_attach_if(ipsec_out_t *io, int attach_index) -{ - ASSERT(io->ipsec_out_type == IPSEC_OUT); - io->ipsec_out_attach_if = B_TRUE; - io->ipsec_out_ill_index = attach_index; -} - /* * NULL send-to queue - packet is to be delivered locally. 
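When the send_from_ill probe-marking code earlier in this hunk grafts an ip6i_t onto the packet, it notes that ndp_resolver() expects a pulled-up message, that is, one whose leading bytes sit in a single contiguous block, which is what the pullupmsg() call guarantees. Outside of STREAMS the same operation is just "coalesce a buffer chain into one allocation"; the sketch below does that for a hypothetical buf_t chain and is not mblk or pullupmsg code.

#include <stdlib.h>
#include <string.h>

/* Hypothetical chained buffer, loosely analogous to an mblk chain. */
typedef struct buf {
    struct buf *b_next;
    unsigned char *b_data;
    size_t b_len;
} buf_t;

/*
 * Coalesce a chain into one newly allocated flat buffer; the caller
 * owns the result.  Returns NULL on allocation failure, the analogue
 * of pullupmsg() returning 0.
 */
static unsigned char *
chain_pullup(const buf_t *chain, size_t *lenp)
{
    size_t total = 0, off = 0;
    const buf_t *b;
    unsigned char *flat;

    for (b = chain; b != NULL; b = b->b_next)
        total += b->b_len;
    if ((flat = malloc(total)) == NULL)
        return (NULL);
    for (b = chain; b != NULL; b = b->b_next) {
        memcpy(flat + off, b->b_data, b->b_len);
        off += b->b_len;
    }
    *lenp = total;
    return (flat);
}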
*/ @@ -10731,6 +10342,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && !IS_LOOPBACK(ill)) { + ilm_walker_t ilw; + /* * In the multicast case, applications may have * joined the group from different zones, so we @@ -10742,11 +10355,9 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, * on the loopback interface (PHYI_LOOPBACK flag * set) as they must stay in the sender's zone. */ - ILM_WALKER_HOLD(ill); - for (ilm = ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; + ilm = ilm_walker_step(&ilw, ilm)) { if (!IN6_ARE_ADDR_EQUAL( &ilm->ilm_v6addr, &ip6h->ip6_dst)) continue; @@ -10754,23 +10365,24 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, IP_FF_NO_MCAST_LOOP) && ilm->ilm_zoneid == ire->ire_zoneid) continue; - if (!ipif_lookup_zoneid(ill, - ilm->ilm_zoneid, IPIF_UP, NULL)) + if (!ipif_lookup_zoneid( + ilw.ilw_walk_ill, ilm->ilm_zoneid, + IPIF_UP, NULL)) continue; first_mp1 = ip_copymsg(first_mp); if (first_mp1 == NULL) continue; - icmp_inbound_v6(q, first_mp1, ill, - hdr_length, mctl_present, - IP6_NO_IPPOLICY, ilm->ilm_zoneid, - NULL); + icmp_inbound_v6(q, first_mp1, + ilw.ilw_walk_ill, ill, hdr_length, + mctl_present, IP6_NO_IPPOLICY, + ilm->ilm_zoneid, NULL); } - ILM_WALKER_RELE(ill); + ilm_walker_finish(&ilw); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, + icmp_inbound_v6(q, first_mp1, ill, ill, hdr_length, mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid, NULL); @@ -10823,8 +10435,7 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, */ static void ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, - int cksum_request, conn_t *connp, int caller, int attach_index, int flags, - zoneid_t zoneid) + int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid) { ip6_t *ip6h; uint8_t nexthdr; @@ -10917,7 +10528,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (src_ire != NULL && !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_ill_group(ire, src_ire))) { + ire_local_same_lan(ire, src_ire))) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) { ip6h->ip6_src = src_ire->ire_src_addr_v6; @@ -10974,20 +10585,14 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, /* * Select the source address using ipif_select_source_v6. 
*/ - if (attach_index != 0) { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, zoneid); - } else { - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid); - } + ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE, + IPV6_PREFER_SRC_DEFAULT, zoneid); if (ipif == NULL) { if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ip_wput_ire_v6: no src for " - "dst %s\n, ", AF_INET6, &ip6h->ip6_dst); - printf("ip_wput_ire_v6: interface name %s\n", - ill->ill_name); + "dst %s\n", AF_INET6, &ip6h->ip6_dst); + printf("through interface %s\n", ill->ill_name); } freemsg(first_mp); return; @@ -10998,12 +10603,8 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { if ((connp != NULL && connp->conn_multicast_loop) || !IS_LOOPBACK(ill)) { - ilm_t *ilm; - - ILM_WALKER_HOLD(ill); - ilm = ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, ALL_ZONES); - ILM_WALKER_RELE(ill); - if (ilm != NULL) { + if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, + ALL_ZONES) != NULL) { mblk_t *nmp; int fanout_flags = 0; @@ -11417,8 +11018,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, } /* Do IPSEC processing first */ if (mctl_present) { - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11456,8 +11055,6 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, max_frag, B_FALSE, B_TRUE, zoneid, ipst); return; } - if (attach_index != 0) - ipsec_out_attach_if(io, attach_index); ipsec_out_process(q, first_mp, ire, ill_index); return; } @@ -11948,8 +11545,8 @@ boolean_t conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, zoneid_t zoneid) { - ill_t *in_ill; - boolean_t wantpacket = B_TRUE; + ill_t *bound_ill; + boolean_t wantpacket; in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; in6_addr_t *v6src_ptr = &ip6h->ip6_src; @@ -11958,42 +11555,16 @@ conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, * unicast and multicast reception to conn_incoming_ill. * conn_wantpacket_v6 is called both for unicast and * multicast. - * - * 1) The unicast copy of the packet can come anywhere in - * the ill group if it is part of the group. Thus, we - * need to check to see whether the ill group matches - * if in_ill is part of a group. - * - * 2) ip_rput does not suppress duplicate multicast packets. - * If there are two interfaces in a ill group and we have - * 2 applications (conns) joined a multicast group G on - * both the interfaces, ilm_lookup_ill filter in ip_rput - * will give us two packets because we join G on both the - * interfaces rather than nominating just one interface - * for receiving multicast like broadcast above. So, - * we have to call ilg_lookup_ill to filter out duplicate - * copies, if ill is part of a group, to supress duplicates. */ - in_ill = connp->conn_incoming_ill; - if (in_ill != NULL) { - mutex_enter(&connp->conn_lock); - in_ill = connp->conn_incoming_ill; - mutex_enter(&ill->ill_lock); - /* - * No IPMP, and the packet did not arrive on conn_incoming_ill - * OR, IPMP in use and the packet arrived on an IPMP group - * different from the conn_incoming_ill's IPMP group. - * Reject the packet. 
- */ - if ((in_ill->ill_group == NULL && in_ill != ill) || - (in_ill->ill_group != NULL && - in_ill->ill_group != ill->ill_group)) { - wantpacket = B_FALSE; + bound_ill = connp->conn_incoming_ill; + if (bound_ill != NULL) { + if (IS_IPMP(bound_ill)) { + if (bound_ill->ill_grp != ill->ill_grp) + return (B_FALSE); + } else { + if (bound_ill != ill) + return (B_FALSE); } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - if (!wantpacket) - return (B_FALSE); } if (connp->conn_multi_router) @@ -12140,7 +11711,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6, &ire->ire_addr_v6)) && !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) + (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) break; } @@ -12204,8 +11775,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, &ire->ire_addr_v6)) continue; if (ire1->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; /* Got one */ @@ -13279,3 +12849,31 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) size += ehdrlen; } } + +/* + * Utility routine that checks if `v6srcp' is a valid address on underlying + * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif + * associated with `v6srcp' on success. NOTE: if this is not called from + * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the + * group during or after this lookup. + */ +static boolean_t +ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp) +{ + ipif_t *ipif; + + ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst); + if (ipif != NULL) { + if (ipifp != NULL) + *ipifp = ipif; + else + ipif_refrele(ipif); + return (B_TRUE); + } + + if (ip_debug > 2) { + pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for " + "src %s\n", AF_INET6, v6srcp); + } + return (B_FALSE); +} diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index 81447c2e30..c729118fec 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -53,7 +53,6 @@ #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> -#include <netinet/in.h> #include <inet/common.h> #include <inet/nd.h> @@ -178,10 +177,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -202,16 +203,12 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr_v6() and ipif_lookup_addr_exact_v6(). 
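The change above folds the two lookups into ipif_lookup_addr_common_v6() and lets a match_illgrp flag decide whether a match_ill argument also matches the other ills in its IPMP group. That "one worker, two thin wrappers" pattern is easy to show in isolation; iface_t, same_group() and the lookup names below are invented for the sketch, with same_group() playing the role of IS_IN_SAME_ILLGRP().

#include <netinet/in.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct iface {
    const char *if_name;
    const char *if_group;    /* NULL when not grouped */
    struct in6_addr if_addr;
} iface_t;

static bool
same_group(const iface_t *a, const iface_t *b)
{
    return (a->if_group != NULL && b->if_group != NULL &&
        strcmp(a->if_group, b->if_group) == 0);
}

/* Worker: optionally widen the match from one interface to its group. */
static const iface_t *
lookup_addr_common(const iface_t *tbl, size_t n, const struct in6_addr *addr,
    const iface_t *match_if, bool match_group)
{
    for (size_t i = 0; i < n; i++) {
        if (match_if != NULL && &tbl[i] != match_if &&
            (!match_group || !same_group(&tbl[i], match_if)))
            continue;
        if (memcmp(&tbl[i].if_addr, addr, sizeof (*addr)) == 0)
            return (&tbl[i]);
    }
    return (NULL);
}

/* Normal lookup: an interface argument also matches its IPMP group. */
static const iface_t *
lookup_addr(const iface_t *tbl, size_t n, const struct in6_addr *addr,
    const iface_t *match_if)
{
    return (lookup_addr_common(tbl, n, addr, match_if, true));
}

/* Exact lookup: restricted to the given interface only. */
static const iface_t *
lookup_addr_exact(const iface_t *tbl, size_t n, const struct in6_addr *addr,
    const iface_t *match_if)
{
    return (lookup_addr_common(tbl, n, addr, match_if, false));
}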
*/ -/* ARGSUSED */ -ipif_t * -ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill, + boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp, + ipsq_func_t func, int *error, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -230,7 +227,8 @@ ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -257,10 +255,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -323,11 +323,41 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). + */ +ipif_t * +ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, + queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q, + mp, func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr_v6() that doesn't match + * `match_ill' across the IPMP group. This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr_v6(). + */ +ipif_t * +ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill, + ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif. ALL_ZONES if none found. 
*/ zoneid_t @@ -348,7 +378,8 @@ ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill, repeat: ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -1120,11 +1151,10 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, boolean_t ill_setdefaulttoken(ill_t *ill) { - int i; + int i; in6_addr_t v6addr, v6mask; - if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length, - ill->ill_phys_addr, &v6addr)) + if (!MEDIA_V6INTFID(ill->ill_media, ill, &v6addr)) return (B_FALSE); (void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask); @@ -1161,7 +1191,7 @@ ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta) { sin6_t sin6; sin_t *sin; - ill_t *ill = ipif->ipif_ill; + ill_t *ill = ipif->ipif_ill; tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr; if (ta->ifta_saddr.ss_family != AF_INET || @@ -1227,7 +1257,7 @@ ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta) if ((ta->ifta_flags & IFTUN_DST) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) { - sin6_t sin6; + sin6_t sin6; ASSERT(!(ipif->ipif_flags & IPIF_UP)); bzero(&sin6, sizeof (sin6_t)); @@ -1344,13 +1374,22 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) if (ret_nce != NULL) *ret_nce = NULL; + + /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + /* * Delete the mapping nce. Normally these should not exist * as a previous ipif_down -> ipif_ndp_down should have deleted * all the nces. But they can exist if ip_rput_dlpi_writer - * calls this when PHYI_MULTI_BCAST is set. + * calls this when PHYI_MULTI_BCAST is set. Mappings are always + * tied to the underlying ill, so don't match across the illgrp. */ - mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE); + mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE); if (mnce != NULL) { ndp_delete(mnce); NCE_REFRELE(mnce); @@ -1424,13 +1463,15 @@ ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) * Get the resolver set up for a new ipif. (Always called as writer.) */ int -ipif_ndp_up(ipif_t *ipif) +ipif_ndp_up(ipif_t *ipif, boolean_t initial) { ill_t *ill = ipif->ipif_ill; int err = 0; nce_t *nce = NULL; nce_t *mnce = NULL; + boolean_t added_ipif = B_FALSE; + ASSERT(IAM_WRITER_ILL(ill)); ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); /* @@ -1464,7 +1505,10 @@ ipif_ndp_up(ipif_t *ipif) if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) { uint16_t flags; - uchar_t *hw_addr = NULL; + uint16_t state; + uchar_t *hw_addr = NULL; + ill_t *bound_ill; + ipmp_illgrp_t *illg = ill->ill_grp; /* Permanent entries don't need NUD */ flags = NCE_F_PERMANENT | NCE_F_NONUD; @@ -1474,26 +1518,65 @@ ipif_ndp_up(ipif_t *ipif) if (ipif->ipif_flags & IPIF_ANYCAST) flags |= NCE_F_ANYCAST; - if (ill->ill_net_type == IRE_IF_RESOLVER) { - hw_addr = ill->ill_nd_lla; - - if (ill->ill_move_in_progress) { - /* - * Addresses are failing over to this ill. - * Don't wait for NUD to see this change. - * Publish our new link-layer address. - */ - flags |= NCE_F_UNSOL_ADV; + if (IS_IPMP(ill)) { + ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + /* + * If we're here via ipif_up(), then the ipif won't be + * bound yet -- add it to the group, which will bind + * it if possible. 
(We would add it in ipif_up(), but + * deleting on failure there is gruesome.) If we're + * here via ipmp_ill_bind_ipif(), then the ipif has + * already been added to the group and we just need to + * use the binding. + */ + if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { + bound_ill = ipmp_illgrp_add_ipif(illg, ipif); + if (bound_ill == NULL) { + /* + * We couldn't bind the ipif to an ill + * yet, so we have nothing to publish. + * Set ipif_addr_ready so that this + * address can be used locally for now. + * The routing socket message will be + * sent from ipif_up_done_v6(). + */ + ipif->ipif_addr_ready = 1; + return (0); + } + added_ipif = B_TRUE; } + hw_addr = bound_ill->ill_nd_lla; + } else { + bound_ill = ill; + if (ill->ill_net_type == IRE_IF_RESOLVER) + hw_addr = ill->ill_nd_lla; + } + + /* + * If this is an initial bring-up (or the ipif was never + * completely brought up), do DAD. Otherwise, we're here + * because IPMP has rebound an address to this ill: send + * unsolicited advertisements to inform others. + */ + if (initial || !ipif->ipif_addr_ready) { + state = ND_PROBE; + } else { + state = ND_REACHABLE; + flags |= NCE_F_UNSOL_ADV; } - err = ndp_lookup_then_add_v6(ill, + /* + * NOTE: for IPMP, local addresses are always associated with + * the ill they're bound to, so don't match across the illgrp. + */ + err = ndp_lookup_then_add_v6(bound_ill, + B_FALSE, hw_addr, &ipif->ipif_v6lcl_addr, &ipv6_all_ones, &ipv6_all_zeros, 0, flags, - ND_PROBE, /* Causes Duplicate Address Detection to run */ + state, &nce); switch (err) { case 0: @@ -1509,19 +1592,11 @@ ipif_ndp_up(ipif_t *ipif) NCE_REFRELE(nce); ip1dbg(("ipif_ndp_up: NCE already exists for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; default: - ip1dbg(("ipif_ndp_up: NCE creation failed %s\n", + ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n", ill->ill_name)); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } - return (err); + goto fail; } } else { /* No local NCE for this entry */ @@ -1532,6 +1607,15 @@ ipif_ndp_up(ipif_t *ipif) if (mnce != NULL) NCE_REFRELE(mnce); return (0); +fail: + if (mnce != NULL) { + ndp_delete(mnce); + NCE_REFRELE(mnce); + } + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + return (err); } /* Remove all cache entries for this logical interface */ @@ -1539,23 +1623,42 @@ void ipif_ndp_down(ipif_t *ipif) { nce_t *nce; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_ILL(ill)); if (ipif->ipif_isv6) { - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce != NULL) { - ndp_delete(nce); - NCE_REFRELE(nce); + ill_t *bound_ill; + + if (IS_IPMP(ill)) + bound_ill = ipmp_ipif_bound_ill(ipif); + else + bound_ill = ill; + + if (bound_ill != NULL) { + nce = ndp_lookup_v6(bound_ill, + B_FALSE, /* see comment in ipif_ndp_up() */ + &ipif->ipif_v6lcl_addr, + B_FALSE); + if (nce != NULL) { + ndp_delete(nce); + NCE_REFRELE(nce); + } } + + /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); } + /* * Remove mapping and all other nces dependent on this ill * when the last ipif is going away. 
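The initial/addr_ready test in ipif_ndp_up() above decides between running duplicate address detection (ND_PROBE) and immediately announcing an already verified address that IPMP has just rebound (ND_REACHABLE plus NCE_F_UNSOL_ADV). A compact way to read that decision is the helper below; the enum and flag names are placeholders, not the kernel's ND_* and NCE_F_* definitions.

#include <stdbool.h>
#include <stdint.h>

typedef enum { NSTATE_PROBE, NSTATE_REACHABLE } nstate_t;

#define NFLAG_PERMANENT  0x01
#define NFLAG_NONUD      0x02
#define NFLAG_UNSOL_ADV  0x04    /* send unsolicited advertisements */

/*
 * Decide how to publish a local address: a first (or never-completed)
 * bring-up must run duplicate address detection; a rebind of an already
 * verified address just announces the new link-layer mapping.
 */
static nstate_t
publish_state(bool initial, bool addr_ready, uint32_t *flagsp)
{
    *flagsp = NFLAG_PERMANENT | NFLAG_NONUD;
    if (initial || !addr_ready)
        return (NSTATE_PROBE);
    *flagsp |= NFLAG_UNSOL_ADV;
    return (NSTATE_REACHABLE);
}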
*/ - if (ipif->ipif_ill->ill_ipif_up_count == 0) { - ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill, - (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst); - } + if (ill->ill_ipif_up_count == 0) + ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst); } /* @@ -1936,9 +2039,7 @@ rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, } /* - * Prefer source addresses that are assigned to the outgoing interface, or - * to an interface that is in the same IPMP group as the outgoing - * interface. + * Prefer source addresses that are assigned to the outgoing interface. */ /* ARGSUSED3 */ static rule_res_t @@ -1955,15 +2056,11 @@ rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, return (CAND_TIE); if (!bc->cand_matchedinterface_set) { - bc->cand_matchedinterface = (bc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == bc->cand_ill->ill_group)); + bc->cand_matchedinterface = bc->cand_ill == dstill; bc->cand_matchedinterface_set = B_TRUE; } - cc->cand_matchedinterface = (cc->cand_ill == dstill || - (dstill->ill_group != NULL && - dstill->ill_group == cc->cand_ill->ill_group)); + cc->cand_matchedinterface = cc->cand_ill == dstill; cc->cand_matchedinterface_set = B_TRUE; if (bc->cand_matchedinterface == cc->cand_matchedinterface) @@ -2134,6 +2231,13 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, static rule_res_t rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst) { + /* + * For IPMP, we always want to choose a random source address from + * among any equally usable addresses, so always report a tie. + */ + if (IS_IPMP(dstinfo->dst_ill)) + return (CAND_TIE); + if (!bc->cand_common_pref_set) { bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr, dstinfo->dst_addr); @@ -2177,10 +2281,9 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, * specification's algorithm could traverse the list of addresses once for * every rule). * - * The restrict_ill argument restricts the algorithm to chose a source - * address that is assigned to the destination ill or an ill in the same - * IPMP group as the destination ill. This is used when the destination - * address is a link-local or multicast address, and when + * The restrict_ill argument restricts the algorithm to choose a source + * address that is assigned to the destination ill. This is used when + * the destination address is a link-local or multicast address, and when * ipv6_strict_dst_multihoming is turned on. * * src_prefs is the caller's set of source address preferences. If source @@ -2192,13 +2295,13 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, */ ipif_t * ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, - uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) + boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) { dstinfo_t dstinfo; char dstr[INET6_ADDRSTRLEN]; char sstr[INET6_ADDRSTRLEN]; - ipif_t *ipif; - ill_t *ill, *usesrc_ill = NULL; + ipif_t *ipif, *start_ipif, *next_ipif; + ill_t *ill, *usesrc_ill = NULL, *ipmp_ill = NULL; ill_walk_context_t ctx; cand_t best_c; /* The best candidate */ cand_t curr_c; /* The current candidate */ @@ -2247,6 +2350,16 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } else { return (NULL); } + } else if (IS_UNDER_IPMP(dstill)) { + /* + * Test addresses should never be used for source address + * selection, so if we were passed an underlying ill, switch + * to the IPMP meta-interface. 
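rule_prefix() above ranks candidates by how many leading bits they share with the destination (ip_common_prefix_v6()), and now deliberately reports a tie on an IPMP ill so that equally good sources can be rotated. Computing that shared-prefix length is short enough to show in full; common_prefix_len() below is an illustrative stand-in, not the kernel's ip_common_prefix_v6().

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Number of leading bits that two IPv6 addresses have in common. */
static int
common_prefix_len(const struct in6_addr *a, const struct in6_addr *b)
{
    int bits = 0;

    for (int i = 0; i < 16; i++) {
        unsigned char diff = a->s6_addr[i] ^ b->s6_addr[i];

        if (diff == 0) {
            bits += 8;
            continue;
        }
        while ((diff & 0x80) == 0) {    /* count equal leading bits */
            bits++;
            diff <<= 1;
        }
        break;
    }
    return (bits);
}

int
main(void)
{
    struct in6_addr a, b;

    inet_pton(AF_INET6, "fe80::1", &a);
    inet_pton(AF_INET6, "fe80::2", &b);
    printf("%d\n", common_prefix_len(&a, &b));    /* prints 126 */
    return (0);
}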
+ */ + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(dstill)) != NULL) + dstinfo.dst_ill = ipmp_ill; + else + return (NULL); } else { dstinfo.dst_ill = dstill; } @@ -2286,10 +2399,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) || ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) { - if (restrict_ill == RESTRICT_TO_NONE) - dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP; - else - dstinfo.dst_restrict_ill = restrict_ill; + dstinfo.dst_restrict_ill = B_TRUE; } else { dstinfo.dst_restrict_ill = restrict_ill; } @@ -2297,39 +2407,41 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, bzero(&best_c, sizeof (cand_t)); /* - * Take a pass through the list of IPv6 interfaces to chose the - * best possible source address. If restrict_ill is true, we only - * iterate through the ill's that are in the same IPMP group as the - * destination's outgoing ill. If restrict_ill is false, we walk - * the entire list of IPv6 ill's. + * Take a pass through the list of IPv6 interfaces to choose the best + * possible source address. If restrict_ill is set, just use dst_ill. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) { - if (dstinfo.dst_ill->ill_group != NULL && - dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) { - ill = dstinfo.dst_ill->ill_group->illgrp_ill; - } else { - ill = dstinfo.dst_ill; - } - } else { + if (dstinfo.dst_restrict_ill) + ill = dstinfo.dst_ill; + else ill = ILL_START_WALK_V6(&ctx, ipst); - } - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(ill->ill_isv6); /* - * Avoid FAILED/OFFLINE ills. - * Global and site local addresses will failover and - * will be available on the new ill. - * But link local addresses don't move. + * Test addresses should never be used for source address + * selection, so ignore underlying ills. */ - if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL && - ill->ill_phyint->phyint_flags & - (PHYI_OFFLINE | PHYI_FAILED)) - goto next_ill; + if (IS_UNDER_IPMP(ill)) + continue; - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { + /* + * For source address selection, we treat the ipif list as + * circular and continue until we get back to where we + * started. This allows IPMP to vary source address selection + * (which improves inbound load spreading) by caching its last + * ending point and starting from there. NOTE: we don't have + * to worry about ill_src_ipif changing ills since that can't + * happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; + + ipif = start_ipif; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; if (!IPIF_VALID_IPV6_SOURCE(ipif)) continue; @@ -2387,9 +2499,8 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, */ for (index = 0; rules[index] != NULL; index++) { /* Apply a comparison rule. */ - rule_result = - (rules[index])(&best_c, &curr_c, &dstinfo, - ipst); + rule_result = (rules[index])(&best_c, &curr_c, + &dstinfo, ipst); if (rule_result == CAND_AVOID) { /* * The best candidate is still the @@ -2417,21 +2528,29 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, * have been prefered as the best candidate so far. */ ASSERT(rule_result != CAND_TIE); + } while ((ipif = next_ipif) != start_ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. 
(We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + mutex_enter(&ipif->ipif_ill->ill_lock); + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + mutex_exit(&ipif->ipif_ill->ill_lock); } /* - * We may be walking the linked-list of ill's in an - * IPMP group or traversing the IPv6 ill avl tree. If it is a - * usesrc ILL then it can't be part of IPMP group and we - * will exit the while loop. + * Only one ill to consider if dst_restrict_ill is set. */ -next_ill: - if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL) - ill = NULL; - else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) - ill = ill->ill_group_next; - else - ill = ill_next(&ctx, ill); + if (dstinfo.dst_restrict_ill) + break; } ipif = best_c.cand_ipif; @@ -2444,6 +2563,9 @@ next_ill: if (usesrc_ill != NULL) ill_refrele(usesrc_ill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); + if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -2474,8 +2596,7 @@ next_ill: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done_v6 - * calls us. + * if needed. This happens when ipif_up_done_v6 calls us. */ void ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) @@ -2561,8 +2682,7 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet, - RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT, - ipif->ipif_zoneid); + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (nipif == NULL) { /* Last resort - all ipif's have IPIF_NOLOCAL */ @@ -2630,13 +2750,9 @@ ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) * Find the IRE_INTERFACE for such ipif's and recreate them * to use an different source address following the rules in * ipif_up_done_v6. - * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ void -ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs_v6(ipif_t *old_ipif) { ipif_t *ipif; ill_t *ill; @@ -2651,23 +2767,9 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp) inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr, buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows a source address across all the ills in the group. 
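The circular walk in ipif_select_source_v6() above, starting from the cached ill_src_ipif and advancing the rotor afterwards, is what lets the IPMP ill hand out different, equally preferred source addresses over time. The same traversal over a plain array looks like the sketch below, simplified to "first usable candidate" instead of the full rule comparison; src_cand_t, pick_source() and the usable flag are assumptions standing in for the ipif list and IPIF_VALID_IPV6_SOURCE.

#include <stdbool.h>
#include <stddef.h>

typedef struct src_cand {
    bool sc_usable;    /* stands in for IPIF_VALID_IPV6_SOURCE */
    int sc_id;
} src_cand_t;

/*
 * Walk the candidate list as a circle, beginning at the saved rotor
 * position, and return the first usable candidate.  The rotor is then
 * advanced past the choice so the next call starts somewhere else,
 * which spreads the selected source addresses (and hence inbound load)
 * across the group's data addresses.
 */
static const src_cand_t *
pick_source(const src_cand_t *cands, size_t n, size_t *rotor)
{
    for (size_t i = 0; i < n; i++) {
        size_t idx = (*rotor + i) % n;

        if (cands[idx].sc_usable) {
            *rotor = (idx + 1) % n;
            return (&cands[idx]);
        }
    }
    return (NULL);
}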
- */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - /* Don't need a lock since this is a writer */ - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; - + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif != old_ipif) ipif_recreate_interface_routes_v6(old_ipif, ipif); - } } } @@ -2828,12 +2930,10 @@ ipif_up_done_v6(ipif_t *ipif) boolean_t flush_ire_cache = B_TRUE; int err; char buf[INET6_ADDRSTRLEN]; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; boolean_t ip6_asp_table_held = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2868,8 +2968,8 @@ ipif_up_done_v6(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); + ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, + IRE_CACHE, ill_ipif_cache_delete, ill, ill); /* * Figure out which way the send-to queue should go. Only @@ -2900,7 +3000,9 @@ ipif_up_done_v6(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -2908,7 +3010,7 @@ ipif_up_done_v6(ipif_t *ipif) if (ip6_asp_can_lookup(ipst)) { ip6_asp_table_held = B_TRUE; src_ipif = ipif_select_source_v6(ipif->ipif_ill, - &ipif->ipif_v6subnet, RESTRICT_TO_NONE, + &ipif->ipif_v6subnet, B_FALSE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); } if (src_ipif == NULL) @@ -3090,9 +3192,9 @@ ipif_up_done_v6(ipif_t *ipif) ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); /* - * Need to atomically check for ip_addr_availablity_check - * now under ill_g_lock, and if it fails got bad, and remove - * from group also + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -3125,9 +3227,7 @@ ipif_up_done_v6(ipif_t *ipif) } /* - * Add in all newly created IREs. We want to add before - * we call ifgrp_insert which wants to know whether - * IRE_IF_RESOLVER exists or not. + * Add in all newly created IREs. * * NOTE : We refrele the ire though we may branch to "bad" * later on where we do ire_delete. This is okay @@ -3148,36 +3248,6 @@ ipif_up_done_v6(ipif_t *ipif) ip6_asp_table_refrele(ipst); ip6_asp_table_held = B_FALSE; } - ire_added = B_TRUE; - - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not ::0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. 
- */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done_v6: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -3190,19 +3260,23 @@ ipif_up_done_v6(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address and the solicited node MC */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * Since the interface is now up, it may now be active. */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + + /* Join the allhosts multicast address and the solicited node MC */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. + */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -3238,29 +3312,11 @@ ipif_up_done_v6(ipif_t *ipif) bad: if (ip6_asp_table_held) ip6_asp_table_refrele(ipst); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. So, we need not try to remove for that case. - * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } - } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -3272,8 +3328,7 @@ bad: ipif_refrele(src_ipif); ipif_ndp_down(ipif); - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -3286,15 +3341,14 @@ int ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { - in6_addr_t addr; sin6_t *sin6; nce_t *nce; struct lifreq *lifr; lif_nd_req_t *lnr; - mblk_t *mp1; + ill_t *ill = ipif->ipif_ill; + ire_t *ire; - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; + lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; /* Only allow for logical unit zero i.e. 
not on "le0:17" */ if (ipif->ipif_id != 0) @@ -3307,8 +3361,28 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, return (EAFNOSUPPORT); sin6 = (sin6_t *)&lnr->lnr_addr; - addr = sin6->sin6_addr; - nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE); + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * deleting ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * deleting them. + */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE); if (nce == NULL) return (ESRCH); ndp_delete(nce); @@ -3354,11 +3428,11 @@ int ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { + sin6_t *sin6; ill_t *ill = ipif->ipif_ill; struct lifreq *lifr; lif_nd_req_t *lnr; - - ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); + ire_t *ire; lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; @@ -3372,5 +3446,26 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (lnr->lnr_addr.ss_family != AF_INET6) return (EAFNOSUPPORT); + sin6 = (sin6_t *)&lnr->lnr_addr; + + /* + * Since ND mappings must be consistent across an IPMP group, prohibit + * updating ND mappings on underlying interfaces. Also, since ND + * mappings for IPMP data addresses are owned by IP itself, prohibit + * updating them. + */ + if (IS_UNDER_IPMP(ill)) + return (EPERM); + + if (IS_IPMP(ill)) { + ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, + ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, + ill->ill_ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + return (ndp_sioc_update(ill, lnr)); } diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index 41461ca96f..0d0f3621f5 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -73,7 +73,6 @@ static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); - /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. @@ -261,13 +260,11 @@ ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. 
*/ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -409,35 +406,54 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) ire_t *ire = *ire_p; int error; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ ASSERT(ire->ire_nce == NULL); + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. + * (We let IREs with unspecified source addresses slip through since + * ire_send_v6() will delete them automatically.) + */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { + DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); + marks |= IRE_MARK_TESTHIDDEN; + } + /* Find the appropriate list head. */ switch (ire->ire_type) { case IRE_HOST: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_CACHE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + ire->ire_marks |= marks; + break; case IRE_LOCAL: case IRE_LOOPBACK: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr_v6 = ipv6_all_zeros; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", @@ -543,9 +559,8 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * 2) We could have multiple packets trying to create * an IRE_CACHE for the same ill. * - * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants - * to go out on a particular ill. Rather than looking at the - * packet, we depend on the above for MATCH_IRE_ILL here. + * Rather than looking at the packet, we depend on the above for + * MATCH_IRE_ILL here. * * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have * multiple IRE_CACHES for an ill for the same destination @@ -555,20 +570,15 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ if (ire->ire_ipif != NULL) flags |= MATCH_IRE_IPIF; + /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, while we are - * searching for duplicates below. Otherwise we could - * potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop as - * eventually we need an hidden ire for this packet to go - * out. MATCH_IRE_ILL is already marked above. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. 
*/ - if (ire->ire_marks & IRE_MARK_HIDDEN) { - ASSERT(ire->ire_type == IRE_CACHE); - flags |= MATCH_IRE_MARK_HIDDEN; - } + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; /* * Start the atomic add of the ire. Grab the ill locks, @@ -692,7 +702,7 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) } } if (ire->ire_type == IRE_CACHE) { - in6_addr_t gw_addr_v6; + const in6_addr_t *addr_v6; ill_t *ill = ire_to_ill(ire); char buf[INET6_ADDRSTRLEN]; nce_t *nce; @@ -712,12 +722,12 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * time on the list and rts_setgwr_v6 could not * be changing this. */ - gw_addr_v6 = ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE); - } else { - nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE); - } + addr_v6 = &ire->ire_gateway_addr_v6; + if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) + addr_v6 = &ire->ire_addr_v6; + + /* nce fastpath is per-ill; don't match across illgrp */ + nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); if (nce == NULL) goto failed; @@ -1217,28 +1227,29 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ipif_t *src_ipif; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && ipif->ipif_isv6)); /* - * HIDDEN cache entries have to be looked up specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. */ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { @@ -1288,7 +1299,7 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, */ if ((dst_ill->ill_usesrc_ifindex != 0) && (src_ipif = ipif_select_source_v6(dst_ill, addr, - RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, zoneid)) + B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) != NULL) { ip3dbg(("ire_match_args: src_ipif %p" " dst_ill %p", (void *)src_ipif, @@ -1326,20 +1337,20 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } + /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. 
ire_ipif for IRE_CACHES is just the - * means of getting a source address i.e ire_src_addr_v6 = - * ire->ire_ipif->ipif_src_addr_v6. + * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr_v6 == + * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } /* No ire_addr_v6 bits set past the mask */ @@ -1357,17 +1368,14 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, &ipif->ipif_v6src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1391,8 +1399,7 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1477,8 +1484,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -1661,8 +1667,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL_GROUP | - MATCH_IRE_SECATTR; + match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, 0, ire->ire_ipif, zoneid, tsl, match_flags, ipst); @@ -1703,7 +1708,7 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, if (ire->ire_ipif != NULL) { ire_match_flags |= - MATCH_IRE_ILL_GROUP; + MATCH_IRE_ILL; } rire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, IRE_INTERFACE, @@ -1791,21 +1796,8 @@ found_ire_held: */ saved_ire = ire; - /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. 
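/*
 * Illustrative sketch (not in the change itself): the new MATCH_IRE_ILL
 * rule from ire_match_args_v6() above, isolated as a predicate.  With
 * IPMP, an ill match is satisfied by any ill in the same illgrp, unless
 * the lookup is for test-hidden IREs (IPMP test traffic), which must
 * match the exact ill.  The helper name ill_match() is hypothetical.
 */
static boolean_t
ill_match(uint_t match_flags, ill_t *ire_ill, ill_t *ipif_ill)
{
        if (!(match_flags & MATCH_IRE_ILL))
                return (B_TRUE);        /* caller doesn't care about the ill */
        if (ire_ill == ipif_ill)
                return (B_TRUE);        /* exact match always succeeds */
        if (match_flags & MATCH_IRE_MARK_TESTHIDDEN)
                return (B_FALSE);       /* test traffic: exact ill required */
        return (ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill));
}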
We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. - */ if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; @@ -1958,9 +1950,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache. * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -1968,8 +1958,7 @@ ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. We preserve that earlier behavior when @@ -1986,7 +1975,7 @@ ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, ipst->ips_ip6_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { /* @@ -2125,13 +2114,8 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) ASSERT(cire != NULL && pire != NULL); match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute_v6 calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. - */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. * (When ip_newroute_v6() created 'cire' for an on-link destn. it set @@ -2168,7 +2152,7 @@ ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; mutex_enter(&pire->ire_lock); gw_addr = pire->ire_gateway_addr_v6; @@ -2210,24 +2194,30 @@ ire_t * ipif_to_ire_v6(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. 
+ */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ipst); + IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. */ ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } else { ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); } return (ire); } @@ -2296,7 +2286,7 @@ ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, continue; if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -2434,7 +2424,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -2635,8 +2625,7 @@ ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, &cire->ire_addr_v6, &v6dst)) continue; if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_HIDDEN)) + IRE_MARK_CONDEMNED) continue; if (cire->ire_gw_secattr != NULL && @@ -2845,8 +2834,7 @@ ip6_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c index 7d2ddd5c04..dcf429c8ba 100644 --- a/usr/src/uts/common/inet/ip/ip6_rts.c +++ b/usr/src/uts/common/inet/ip/ip6_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,8 +38,6 @@ * @(#)rtsock.c 8.6 (Berkeley) 2/11/95 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains routines that processes routing socket requests. */ @@ -216,5 +214,5 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET6, ipst); + rts_queue_input(mp, NULL, AF_INET6, RTSQ_ALL, ipst); } diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 4fa3c7a74d..31f83c842d 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,7 +67,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -159,8 +158,7 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); @@ -290,28 +288,16 @@ found_ire_held: */ save_ire = ire; + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. - * * ire_ftable_lookup may end up with an incomplete IRE_CACHE * entry for the gateway (i.e., one for which the * ire_nce->nce_state is not yet ND_REACHABLE). If the caller * has specified MATCH_IRE_COMPLETE, such entries will not * be returned; instead, we return the IF_RESOLVER ire. */ - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; - ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, @@ -532,7 +518,7 @@ ire_ftable_lookup_simple(ipaddr_t addr, } } if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); @@ -678,13 +664,11 @@ ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) * Make sure we follow ire_ipif. * * We need to determine the interface route through - * which the gateway will be reached. We don't really - * care which interface is picked if the interface is - * part of a group. + * which the gateway will be reached. */ if (ire->ire_ipif != NULL) { ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; } switch (ire->ire_type) { @@ -854,40 +838,26 @@ ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin) } static ipif_t * -ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, +ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, int zoneid, ushort_t *marks) { ipif_t *src_ipif; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *ill = ire->ire_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* - * Pick the best source address from dst_ill. + * Pick the best source address from ill. * - * 1) If it is part of a multipathing group, we would - * like to spread the inbound packets across different - * interfaces. ipif_select_source picks a random source - * across the different ills in the group. - * - * 2) If it is not part of a multipathing group, we try - * to pick the source address from the destination + * 1) Try to pick the source address from the destination * route. 
Clustering assumes that when we have multiple * prefixes hosted on an interface, the prefix of the * source address matches the prefix of the destination * route. We do this only if the address is not * DEPRECATED. * - * 3) If the conn is in a different zone than the ire, we + * 2) If the conn is in a different zone than the ire, we * need to pick a source address from the right zone. - * - * NOTE : If we hit case (1) above, the prefix of the source - * address picked may not match the prefix of the - * destination routes prefix as ipif_select_source - * does not look at "dst" while picking a source - * address. - * If we want the same behavior as (2), we will need - * to change the behavior of ipif_select_source. */ - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { /* * The RTF_SETSRC flag is set in the parent ire (sire). @@ -899,13 +869,10 @@ ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill, return (src_ipif); } *marks |= IRE_MARK_USESRC_CHECK; - if ((dst_ill->ill_group != NULL) || + if (IS_IPMP(ill) || (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (dst_ill->ill_usesrc_ifindex != 0)) { - src_ipif = ipif_select_source(dst_ill, dst, zoneid); - if (src_ipif == NULL) - return (NULL); - + (ill->ill_usesrc_ifindex != 0)) { + src_ipif = ipif_select_source(ill, dst, zoneid); } else { src_ipif = ire->ire_ipif; ASSERT(src_ipif != NULL); @@ -1071,18 +1038,20 @@ create_irecache: sire->ire_last_used_time = lbolt; } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", - (void *)ire)); + ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1254,18 +1223,13 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, ire_t *sire = NULL, *save_ire; ill_t *dst_ill = NULL; int error; - zoneid_t zoneid; + zoneid_t zoneid = GLOBAL_ZONEID; ipif_t *src_ipif = NULL; mblk_t *res_mp; ushort_t ire_marks = 0; - zoneid = GLOBAL_ZONEID; - - ire = ire_ftable_lookup_simple(dst, &sire, zoneid, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - + MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst); if (ire == NULL) { ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); goto icmp_err_ret; @@ -1288,9 +1252,7 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, * nexthop router, just hand over the cache entry * and we are done. 
*/ - if (ire->ire_type & IRE_CACHE) { - /* * If we are using this ire cache entry as a * gateway to forward packets, chances are we @@ -1334,18 +1296,21 @@ ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, UPDATE_OB_PKT_COUNT(sire); } - /* Obtain dst_ill */ - dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + dst_ill = ire->ire_ipif->ipif_ill; + if (IS_IPMP(dst_ill)) + dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); + else + ill_refhold(dst_ill); /* for symmetry */ + if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", + ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n", (void *)ire)); goto icmp_err_ret; } ASSERT(src_ipif == NULL); /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, - zoneid, &ire_marks); + src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); if (src_ipif == NULL) goto icmp_err_ret; @@ -1720,33 +1685,24 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE| - MATCH_IRE_SECATTR); + MATCH_IRE_SECATTR | MATCH_IRE_ILL); /* * If supplied ifindex is non-null, the only valid - * nexthop is one off of the interface or group corresponding + * nexthop is one off of the interface corresponding * to the specified ifindex. */ ill = ill_lookup_on_ifindex(ifindex, B_FALSE, NULL, NULL, NULL, NULL, ipst); if (ill != NULL) { - match_flags |= MATCH_IRE_ILL; + supplied_ipif = ipif_get_next_ipif(NULL, ill); } else { - /* Fallback to group names if hook_emulation set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex(ifindex, - B_FALSE, ipst); - } - if (ill == NULL) { - ip1dbg(("ipfil_sendpkt: Could not find" - " route to dst\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - match_flags |= MATCH_IRE_ILL_GROUP; + ip1dbg(("ipfil_sendpkt: Could not find" + " route to dst\n")); + value = ECOMM; + freemsg(mp); + goto discard; } - supplied_ipif = ipif_get_next_ipif(NULL, ill); ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif, &sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); @@ -2325,9 +2281,9 @@ ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs, * interested in routers that are * reachable through ipifs within our zone. */ - if (ire->ire_ipif != NULL) { - match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL; + rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl, match_flags, ipst); diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 0597245499..9771c87721 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -46,6 +46,7 @@ #include <sys/bitmap.h> #include <sys/cpuvar.h> #include <sys/time.h> +#include <sys/ctype.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -61,10 +62,10 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/igmp_var.h> -#include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> #include <sys/callb.h> +#include <sys/md5.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -85,7 +86,6 @@ #include <inet/tun.h> #include <inet/sctp_ip.h> #include <inet/ip_netinfo.h> -#include <inet/mib2.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> @@ -93,7 +93,6 @@ #include <inet/ipsec_impl.h> #include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -158,7 +157,7 @@ static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, - boolean_t initialize); + boolean_t initialize, boolean_t insert); static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, @@ -169,7 +168,6 @@ static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_multicast_down(ipif_t *ipif); static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, @@ -179,8 +177,7 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); -static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); -static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); +static void ipif_update_other_ipifs(ipif_t *old_ipif); static int ill_alloc_ppa(ill_if_t *, ill_t *); static int ill_arp_off(ill_t *ill); @@ -192,33 +189,18 @@ static void ill_down(ill_t *ill); static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); -static boolean_t ill_has_usable_ipif(ill_t *); -static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); -static void ill_nominate_bcast_rcv(ill_group_t *illgrp); -static void ill_phyint_free(ill_t *ill); static void ill_phyint_reinit(ill_t *ill); static void ill_set_nce_router_flags(ill_t *, boolean_t); static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); -static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); -static boolean_t ill_split_ipsq(ipsq_t *cur_sq); -static void ill_stq_cache_delete(ire_t *, char *); - -static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); -static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); -static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -static boolean_t 
ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); - +static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; +static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; +static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; +static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; static void ipif_save_ire(ipif_t *, ire_t *); static void ipif_remove_ire(ipif_t *, ire_t *); static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); +static void phyint_free(phyint_t *); /* * Per-ill IPsec capabilities management. @@ -250,18 +232,14 @@ static void ill_capability_ack_thr(void *); static void ill_capability_lso_enable(ill_t *); static void ill_capability_send(ill_t *, mblk_t *); -static void illgrp_cache_delete(ire_t *, char *); -static void illgrp_delete(ill_t *ill); -static void illgrp_reset_schednext(ill_t *ill); - static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); static void conn_cleanup_stale_ire(conn_t *, caddr_t); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -491,6 +469,7 @@ static nv_t ipif_nv_tbl[] = { { PHYI_STANDBY, "STANDBY" }, { PHYI_INACTIVE, "INACTIVE" }, { PHYI_OFFLINE, "OFFLINE" }, + { PHYI_IPMP, "IPMP" } }; static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; @@ -508,7 +487,8 @@ static ip_m_t ip_m_tbl[] = { ip_ether_v6intfid }, { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid }, - { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, + { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, + { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid } }; @@ -529,14 +509,6 @@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -static uint_t -ipif_rand(ip_stack_t *ipst) -{ - ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 + - 12345; - return ((ipst->ips_ipif_src_random >> 16) & 0x7fff); -} - /* * Allocate per-interface mibs. * Returns true if ok. False otherwise. @@ -623,7 +595,7 @@ ill_allocate_mibs(ill_t *ill) * (Always called as writer.) */ mblk_t * -ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) +ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) { arc_t *arc = (arc_t *)template; char *cp; @@ -669,17 +641,69 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) } mblk_t * -ipif_area_alloc(ipif_t *ipif) +ipif_area_alloc(ipif_t *ipif, uint_t optflags) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + mblk_t *mp; + area_t *area; + uchar_t *areap; + ill_t *ill = ipif->ipif_ill; + + if (ill->ill_isv6) { + ASSERT(ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + areap = (uchar_t *)&ip6_area_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + areap = (uchar_t *)&ip_area_template; + } + + if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) + return (NULL); + + /* + * IPMP requires that the hardware address be included in all + * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. 
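/*
 * Illustrative sketch (hypothetical helper; the hunk below open-codes
 * this): which ill supplies the hardware address for the AR_ENTRY_ADD.
 * For an IPMP meta-interface the address comes from the underlying ill
 * the ipif is currently bound to, if any; otherwise it comes from the
 * ipif's own ill.  The caller is expected to ill_refrele() the result.
 */
static ill_t *
area_hw_addr_ill(ipif_t *ipif)
{
        ill_t *ill = ipif->ipif_ill;

        if (!IS_IPMP(ill)) {
                ill_refhold(ill);
                return (ill);
        }
        /* May be NULL if no underlying ill is active yet; DAD is deferred. */
        return (ipmp_ipif_hold_bound_ill(ipif));
}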
+ * If there are no active underlying ills in the group (and thus no + * hardware address, DAD will be deferred until an underlying ill + * becomes active. + */ + if (IS_IPMP(ill)) { + if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + freemsg(mp); + return (NULL); + } + } else { + ill_refhold(ill); + } + + area = (area_t *)mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; + area->area_flags |= optflags; + area->area_hw_addr_length = ill->ill_phys_addr_length; + bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, + area->area_hw_addr_length); + + ill_refrele(ill); + return (mp); } mblk_t * ipif_ared_alloc(ipif_t *ipif) { - return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, - (char *)&ipif->ipif_lcl_addr)); + caddr_t addr; + uchar_t *aredp; + + if (ipif->ipif_ill->ill_isv6) { + ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); + addr = (caddr_t)&ipif->ipif_v6lcl_addr; + aredp = (uchar_t *)&ip6_ared_template; + } else { + addr = (caddr_t)&ipif->ipif_lcl_addr; + aredp = (uchar_t *)&ip_ared_template; + } + + return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); } mblk_t * @@ -689,6 +713,19 @@ ill_ared_alloc(ill_t *ill, ipaddr_t addr) (char *)&addr)); } +mblk_t * +ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) +{ + mblk_t *mp = ill_arp_alloc(ill, template, 0); + arie_t *arie; + + if (mp != NULL) { + arie = (arie_t *)mp->b_rptr; + (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); + } + return (mp); +} + /* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control @@ -751,6 +788,12 @@ ill_delete(ill_t *ill) ip_purge_allmulti(ill); /* + * If the ill being deleted is under IPMP, boot it out of the illgrp. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_leave_illgrp(ill); + + /* * ill_down will arrange to blow off any IRE's dependent on this * ILL, and shut down fragmentation reassembly. */ @@ -890,8 +933,19 @@ ill_delete_tail(ill_t *ill) * ill references. */ ASSERT(ilm_walk_ill(ill) == 0); + /* - * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free + * If this ill is an IPMP meta-interface, blow away the illgrp. This + * is safe to do because the illgrp has already been unlinked from the + * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. + */ + if (IS_IPMP(ill)) { + ipmp_illgrp_destroy(ill->ill_grp); + ill->ill_grp = NULL; + } + + /* + * Take us out of the list of ILLs. ill_glist_delete -> phyint_free * could free the phyint. No more reference to the phyint after this * point. */ @@ -1139,7 +1193,7 @@ ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set - * the ipsq_pending_mp to the ioctl mblk and wait for the response from + * the ipx_pending_mp to the ioctl mblk and wait for the response from * the other module/driver. This is also used while waiting for the * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 
*/ @@ -1147,19 +1201,19 @@ boolean_t ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, int waitfor) { - ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - ASSERT(ipsq->ipsq_pending_mp == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); /* * The caller may be using a different ipif than the one passed into * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT - * that `ipsq_current_ipif == ipif'. + * that `ipx_current_ipif == ipif'. */ - ASSERT(ipsq->ipsq_current_ipif != NULL); + ASSERT(ipx->ipx_current_ipif != NULL); /* * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, @@ -1180,8 +1234,8 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, if (connp->conn_state_flags & CONN_CLOSING) return (B_FALSE); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_pending_ipif = ipif; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_pending_ipif = ipif; /* * Note down the queue in b_queue. This will be returned by * ipsq_pending_mp_get. Caller will then use these values to restart @@ -1189,38 +1243,40 @@ ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, */ add_mp->b_next = NULL; add_mp->b_queue = q; - ipsq->ipsq_pending_mp = add_mp; - ipsq->ipsq_waitfor = waitfor; + ipx->ipx_pending_mp = add_mp; + ipx->ipx_waitfor = waitfor; + mutex_exit(&ipx->ipx_lock); if (connp != NULL) connp->conn_oper_pending_ill = ipif->ipif_ill; - mutex_exit(&ipsq->ipsq_lock); + return (B_TRUE); } /* - * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp + * Retrieve the ipx_pending_mp and return it. There can be only 1 mp * queued in the list. */ mblk_t * ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) { mblk_t *curr = NULL; + ipxop_t *ipx = ipsq->ipsq_xop; - mutex_enter(&ipsq->ipsq_lock); *connpp = NULL; - if (ipsq->ipsq_pending_mp == NULL) { - mutex_exit(&ipsq->ipsq_lock); + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_pending_mp == NULL) { + mutex_exit(&ipx->ipx_lock); return (NULL); } /* There can be only 1 such excl message */ - curr = ipsq->ipsq_pending_mp; - ASSERT(curr != NULL && curr->b_next == NULL); - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_pending_mp = NULL; - ipsq->ipsq_waitfor = 0; - mutex_exit(&ipsq->ipsq_lock); + curr = ipx->ipx_pending_mp; + ASSERT(curr->b_next == NULL); + ipx->ipx_pending_ipif = NULL; + ipx->ipx_pending_mp = NULL; + ipx->ipx_waitfor = 0; + mutex_exit(&ipx->ipx_lock); if (CONN_Q(curr->b_queue)) { /* @@ -1237,7 +1293,7 @@ ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) } /* - * Cleanup the ioctl mp queued in ipsq_pending_mp + * Cleanup the ioctl mp queued in ipx_pending_mp * - Called in the ill_delete path * - Called in the M_ERROR or M_HANGUP path on the ill. * - Called in the conn close path. @@ -1246,48 +1302,41 @@ boolean_t ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) { mblk_t *mp; - ipsq_t *ipsq; + ipxop_t *ipx; queue_t *q; ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); + ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; + /* - * If connp is null, unconditionally clean up the ipsq_pending_mp. + * If connp is null, unconditionally clean up the ipx_pending_mp. * This happens in M_ERROR/M_HANGUP. 
We need to abort the current ioctl * even if it is meant for another ill, since we have to enqueue - * a new mp now in ipsq_pending_mp to complete the ipif_down. + * a new mp now in ipx_pending_mp to complete the ipif_down. * If connp is non-null we are called from the conn close path. */ - mp = ipsq->ipsq_pending_mp; + mutex_enter(&ipx->ipx_lock); + mp = ipx->ipx_pending_mp; if (mp == NULL || (connp != NULL && mp->b_queue != CONNP_TO_WQ(connp))) { - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); return (B_FALSE); } - /* Now remove from the ipsq_pending_mp */ - ipsq->ipsq_pending_mp = NULL; + /* Now remove from the ipx_pending_mp */ + ipx->ipx_pending_mp = NULL; q = mp->b_queue; mp->b_next = NULL; mp->b_prev = NULL; mp->b_queue = NULL; - /* If MOVE was in progress, clear the move_in_progress fields also. */ - ill = ipsq->ipsq_pending_ipif->ipif_ill; - if (ill->ill_move_in_progress) { - ILL_CLEAR_MOVE(ill); - } else if (ill->ill_up_ipifs) { - ill_group_cleanup(ill); - } - - ipif = ipsq->ipsq_pending_ipif; - ipsq->ipsq_pending_ipif = NULL; - ipsq->ipsq_waitfor = 0; - ipsq->ipsq_current_ipif = NULL; - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&ipsq->ipsq_lock); + ipif = ipx->ipx_pending_ipif; + ipx->ipx_pending_ipif = NULL; + ipx->ipx_waitfor = 0; + ipx->ipx_current_ipif = NULL; + ipx->ipx_current_ioctl = 0; + ipx->ipx_current_done = B_TRUE; + mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { if (connp == NULL) { @@ -1437,7 +1486,7 @@ conn_ioctl_cleanup(conn_t *connp) * Is any exclusive ioctl pending ? If so clean it up. If the * ioctl has not yet started, the mp is pending in the list headed by * ipsq_xopq_head. If the ioctl has started the mp could be present in - * ipsq_pending_mp. If the ioctl timed out in the streamhead but + * ipx_pending_mp. If the ioctl timed out in the streamhead but * is currently executing now the mp is not queued anywhere but * conn_oper_pending_ill is null. The conn close will wait * till the conn_ref drops to zero. @@ -1468,9 +1517,9 @@ conn_ioctl_cleanup(conn_t *connp) ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is - * pending now in ipsq_pending_mp. If it is not - * found there then check whether this ioctl has - * not even started and is in the ipsq_xopq list. + * pending. If it is not found there then check + * whether this ioctl has not even started and is in + * the ipsq_xopq list. */ if (!ipsq_pending_mp_cleanup(ill, connp)) ipsq_xopq_mp_cleanup(ill, connp); @@ -1506,16 +1555,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } if (connp->conn_incoming_ill == ill) connp->conn_incoming_ill = NULL; if (connp->conn_outgoing_ill == ill) connp->conn_outgoing_ill = NULL; - if (connp->conn_outgoing_pill == ill) - connp->conn_outgoing_pill = NULL; - if (connp->conn_nofailover_ill == ill) - connp->conn_nofailover_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); @@ -1524,11 +1568,11 @@ conn_cleanup_ill(conn_t *connp, caddr_t arg) if (connp->conn_ire_cache != NULL) { ire = connp->conn_ire_cache; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMPgroup. 
Thus whenever - * interface X goes down, remove all references to it by - * checking both on ire_ipif and ire_stq. + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface + * X goes down, remove all references to it by checking both + * on ire_ipif and ire_stq. */ if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || (ire->ire_type == IRE_CACHE && @@ -1601,14 +1645,10 @@ ill_down(ill_t *ill) ip_stack_t *ipst = ill->ill_ipst; /* Blow off any IREs dependent on this ILL. */ - ire_walk(ill_downi, (char *)ill, ipst); + ire_walk(ill_downi, ill, ipst); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); - - if (ill->ill_group != NULL) { - illgrp_delete(ill); - } } /* @@ -1621,9 +1661,9 @@ ill_downi(ire_t *ire, char *ill_arg) ill_t *ill = (ill_t *)ill_arg; /* - * ip_newroute creates IRE_CACHE with ire_stq coming from - * interface X and ipif coming from interface Y, if interface - * X and Y are part of the same IPMP group. Thus whenever interface + * Source address selection makes it possible for IRE_CACHE + * entries to be created with ire_stq coming from interface X + * and ipif coming from interface Y. Thus whenever interface * X goes down, remove all references to it by checking both * on ire_ipif and ire_stq. */ @@ -3696,16 +3736,39 @@ nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, } /* - * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an - * IPMP group, make sure all ill's in the group adopt the new policy. Send - * up RTS_IFINFO routing socket messages for each interface whose flags we - * change. + * Helper function for ill_forward_set(). + */ +static void +ill_forward_set_on_ill(ill_t *ill, boolean_t enable) +{ + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); + + ip1dbg(("ill_forward_set: %s %s forwarding on %s", + (enable ? "Enabling" : "Disabling"), + (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); + mutex_enter(&ill->ill_lock); + if (enable) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + if (ill->ill_isv6) + ill_set_nce_router_flags(ill, enable); + /* Notify routing socket listeners of this change. */ + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); +} + +/* + * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing + * socket messages for each interface whose flags we change. */ int ill_forward_set(ill_t *ill, boolean_t enable) { - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); @@ -3716,47 +3779,23 @@ ill_forward_set(ill_t *ill, boolean_t enable) if (IS_LOOPBACK(ill)) return (EINVAL); - /* - * If the ill is in an IPMP group, set the forwarding policy on all - * members of the group to the same value. - */ - illgrp = ill->ill_group; - if (illgrp != NULL) { - ill_t *tmp_ill; + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + /* + * Update all of the interfaces in the group. + */ + illg = ill->ill_grp; + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) + ill_forward_set_on_ill(ill, enable); - for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; - tmp_ill = tmp_ill->ill_group_next) { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? 
"Enabling" : "Disabling"), - (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), - tmp_ill->ill_name)); - mutex_enter(&tmp_ill->ill_lock); - if (enable) - tmp_ill->ill_flags |= ILLF_ROUTER; - else - tmp_ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&tmp_ill->ill_lock); - if (tmp_ill->ill_isv6) - ill_set_nce_router_flags(tmp_ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(tmp_ill->ill_ipif); - } - } else { - ip1dbg(("ill_forward_set: %s %s forwarding on %s", - (enable ? "Enabling" : "Disabling"), - (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); - mutex_enter(&ill->ill_lock); - if (enable) - ill->ill_flags |= ILLF_ROUTER; - else - ill->ill_flags &= ~ILLF_ROUTER; - mutex_exit(&ill->ill_lock); - if (ill->ill_isv6) - ill_set_nce_router_flags(ill, enable); - /* Notify routing socket listeners of this change. */ - ip_rts_ifmsg(ill->ill_ipif); + /* + * Update the IPMP meta-interface. + */ + ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); + return (0); } + ill_forward_set_on_ill(ill, enable); return (0); } @@ -3772,7 +3811,12 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); + /* + * NOTE: we're called separately for each ill in an illgrp, + * so don't match across the illgrp. + */ + nce = ndp_lookup_v6(ill, B_FALSE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce != NULL) { mutex_enter(&nce->nce_lock); if (enable) @@ -3928,36 +3972,45 @@ ill_next(ill_walk_context_t *ctx, ill_t *lastill) } /* - * Check interface name for correct format which is name+ppa. - * name can contain characters and digits, the right most digits - * make up the ppa number. use of octal is not allowed, name must contain - * a ppa, return pointer to the start of ppa. - * In case of error return NULL. + * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ + * The final number (PPA) must not have any leading zeros. Upon success, a + * pointer to the start of the PPA is returned; otherwise NULL is returned. */ static char * ill_get_ppa_ptr(char *name) { - int namelen = mi_strlen(name); + int namelen = strlen(name); + int end_ndx = namelen - 1; + int ppa_ndx, i; - int len = namelen; + /* + * Check that the first character is [a-zA-Z], and that the last + * character is [0-9]. + */ + if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) + return (NULL); - name += len; - while (len > 0) { - name--; - if (*name < '0' || *name > '9') + /* + * Set `ppa_ndx' to the PPA start, and check for leading zeroes. + */ + for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) + if (!isdigit(name[ppa_ndx - 1])) break; - len--; - } - /* empty string, all digits, or no trailing digits */ - if (len == 0 || len == (int)namelen) + if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) return (NULL); - name++; - /* check for attempted use of octal */ - if (*name == '0' && len != (int)namelen - 1) - return (NULL); - return (name); + /* + * Check that the intermediate characters are [a-z0-9.] + */ + for (i = 1; i < ppa_ndx; i++) { + if (!isalpha(name[i]) && !isdigit(name[i]) && + name[i] != '.' 
&& name[i] != '_') { + return (NULL); + } + } + + return (name + ppa_ndx); } /* @@ -4037,8 +4090,10 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -4102,6 +4157,7 @@ static void ill_glist_delete(ill_t *ill) { ip_stack_t *ipst; + phyint_t *phyi; if (ill == NULL) return; @@ -4139,8 +4195,41 @@ ill_glist_delete(ill_t *ill) ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, ill->ill_name_length); - ill_phyint_free(ill); + ASSERT(ill->ill_phyint != NULL); + phyi = ill->ill_phyint; + ill->ill_phyint = NULL; + + /* + * ill_init allocates a phyint always to store the copy + * of flags relevant to phyint. At that point in time, we could + * not assign the name and hence phyint_illv4/v6 could not be + * initialized. Later in ipif_set_values, we assign the name to + * the ill, at which point in time we assign phyint_illv4/v6. + * Thus we don't rely on phyint_illv6 to be initialized always. + */ + if (ill->ill_flags & ILLF_IPV6) + phyi->phyint_illv6 = NULL; + else + phyi->phyint_illv4 = NULL; + + if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + return; + } + + /* + * There are no ills left on this phyint; pull it out of the phyint + * avl trees, and free it. + */ + if (phyi->phyint_ifindex > 0) { + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + phyi); + avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, + phyi); + } rw_exit(&ipst->ips_ill_g_lock); + + phyint_free(phyi); } /* @@ -4367,30 +4456,32 @@ ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) return (0); } -/* Initialize the per phyint (per IPMP group) ipsq used for serialization */ +/* Initialize the per phyint ipsq used for serialization */ static boolean_t -ipsq_init(ill_t *ill) +ipsq_init(ill_t *ill, boolean_t enter) { ipsq_t *ipsq; + ipxop_t *ipx; - /* Init the ipsq and impicitly enter as writer */ - ill->ill_phyint->phyint_ipsq = - kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ill->ill_phyint->phyint_ipsq == NULL) + if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) return (B_FALSE); - ipsq = ill->ill_phyint->phyint_ipsq; - ipsq->ipsq_phyint_list = ill->ill_phyint; - ill->ill_phyint->phyint_ipsq_next = NULL; + + ill->ill_phyint->phyint_ipsq = ipsq; + ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; + ipx->ipx_ipsq = ipsq; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_phyint = ill->ill_phyint; mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); - ipsq->ipsq_refs = 1; - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt = 1; + mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ + if (enter) { + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + ipx->ipx_reentry_cnt = 1; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif - (void) strcpy(ipsq->ipsq_name, ill->ill_name); + } return (B_TRUE); } @@ -4468,7 +4559,7 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_ppa = UINT_MAX; ill->ill_fastpath_list = &ill->ill_fastpath_list; - if (!ipsq_init(ill)) { + if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); mi_free(frag_ptr); 
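/*
 * A minimal user-space sketch of the interface-name rule documented at
 * ill_get_ppa_ptr() above: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+, with no leading
 * zero in the trailing PPA.  The function name and stand-alone form are
 * illustrative only; this is not the kernel routine.
 */
#include <ctype.h>
#include <stddef.h>
#include <string.h>

const char *
ppa_of(const char *name)
{
        size_t len = strlen(name);
        size_t end, ppa, i;

        if (len == 0 || !isalpha((unsigned char)name[0]) ||
            !isdigit((unsigned char)name[len - 1]))
                return (NULL);

        end = len - 1;
        for (ppa = end; ppa > 0; ppa--)
                if (!isdigit((unsigned char)name[ppa - 1]))
                        break;

        /* "bge007" is rejected: the PPA must not begin with a zero. */
        if (name[ppa] == '0' && ppa < end)
                return (NULL);

        for (i = 1; i < ppa; i++) {
                if (!isalnum((unsigned char)name[i]) &&
                    name[i] != '.' && name[i] != '_')
                        return (NULL);
        }

        /* e.g. "e1000g0" yields "0"; "net_a12" yields "12". */
        return (name + ppa);
}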
mi_free(ill->ill_phyint); @@ -4589,29 +4680,16 @@ loopback_kstat_update(kstat_t *ksp, int rw) } /* - * Has ifindex been plumbed already. - * Compares both phyint_ifindex and phyint_group_ifindex. + * Has ifindex been plumbed already? */ static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { - phyint_t *phyi; - ASSERT(index != 0); ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Indexes are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_ifindex == index || - phyi->phyint_group_ifindex == index) - return (B_TRUE); - } - return (B_FALSE); + + return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + &index, NULL) != NULL); } /* Pick a unique ifindex */ @@ -4675,9 +4753,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, { ill_t *ill; ipif_t *ipif; + ipsq_t *ipsq; kstat_named_t *kn; boolean_t isloopback; - ipsq_t *old_ipsq; in6_addr_t ov6addr; isloopback = mi_strcmp(name, ipif_loopback_name) == 0; @@ -4761,16 +4839,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_net_type = IRE_LOOPBACK; /* Initialize the ipsq */ - if (!ipsq_init(ill)) + if (!ipsq_init(ill, B_FALSE)) goto done; - ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; - ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; - ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; -#endif - ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); + ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); if (ipif == NULL) goto done; @@ -4807,7 +4879,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ill->ill_frag_free_num_pkts = 0; ill->ill_last_frag_clean_time = 0; - old_ipsq = ill->ill_phyint->phyint_ipsq; + ipsq = ill->ill_phyint->phyint_ipsq; if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4824,13 +4896,11 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, sctp_update_ipif_addr(ipif, ov6addr); /* - * If the ipsq was changed in ill_phyint_reinit free the old ipsq. + * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. + * If so, free our original one. 
*/ - if (old_ipsq != ill->ill_phyint->phyint_ipsq) { - /* Loopback ills aren't in any IPMP group */ - ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); - ipsq_delete(old_ipsq); - } + if (ipsq != ill->ill_phyint->phyint_ipsq) + ipsq_delete(ipsq); /* * Delay this till the ipif is allocated as ipif_allocate @@ -4871,12 +4941,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, done: if (ill != NULL) { if (ill->ill_phyint != NULL) { - ipsq_t *ipsq; - ipsq = ill->ill_phyint->phyint_ipsq; if (ipsq != NULL) { - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); + ipsq->ipsq_phyint = NULL; + ipsq_delete(ipsq); } mi_free(ill->ill_phyint); } @@ -4954,9 +5022,11 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, } else if (ILL_CAN_WAIT(ill, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (err != NULL) @@ -5294,6 +5364,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) dl_info_ack_t *dlia; ip_m_t *ipm; dl_qos_cl_sel1_t *sel1; + int min_mtu; ASSERT(IAM_WRITER_ILL(ill)); @@ -5336,7 +5407,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_bcast_addr_length = brdcst_addr_length; ill->ill_phys_addr_length = phys_addr_length; ill->ill_sap_length = sap_length; - ill->ill_max_frag = dlia->dl_max_sdu; + + /* + * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, + * but we must ensure a minimum IP MTU is used since other bits of + * IP will fly apart otherwise. + */ + min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); ill->ill_max_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5358,7 +5436,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * the wakeup. */ (void) ipif_allocate(ill, 0, IRE_LOCAL, - dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); + dlia->dl_provider_style != DL_STYLE2, B_TRUE); mutex_enter(&ill->ill_lock); ASSERT(ill->ill_dlpi_style_set == 0); ill->ill_dlpi_style_set = 1; @@ -5397,8 +5475,13 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) /* * Free ill_resolver_mp and ill_bcast_mp as things could have * changed now. + * + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but we enforce that an interface is broadcast- + * capable as part of allowing it to join a group. */ - if (ill->ill_bcast_addr_length == 0) { + if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { if (ill->ill_resolver_mp != NULL) freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) @@ -5451,6 +5534,11 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) if (!ill->ill_isv6) ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; } + + /* For IPMP, PHYI_IPMP should already be set by ipif_allocate() */ + if (ill->ill_mactype == SUNW_DL_IPMP) + ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); + /* By default an interface does not support any CoS marking */ ill->ill_flags &= ~ILLF_COS_ENABLED; @@ -5552,16 +5640,18 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) } /* - * Find any non-virtual, not condemned, and up multicast capable interface - * given an IP instance and zoneid. Order of preference is: + * Find a mulitcast-capable ipif given an IP instance and zoneid. 
+ * The ipif must be up, and its ill must multicast-capable, not + * condemned, not an underlying interface in an IPMP group, and + * not a VNI interface. Order of preference: * - * 1. normal - * 1.1 normal, but deprecated - * 2. point to point - * 2.1 point to point, but deprecated - * 3. link local - * 3.1 link local, but deprecated - * 4. loopback. + * 1a. normal + * 1b. normal, but deprecated + * 2a. point to point + * 2b. point to point, but deprecated + * 3a. link local + * 3b. link local, but deprecated + * 4. loopback. */ ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) @@ -5580,7 +5670,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5736,10 +5826,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5761,15 +5853,12 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, } /* - * Look for an ipif with the specified address. For point-point links - * we look for matches on either the destination address and the local - * address, but we ignore the check on the local address if IPIF_UNNUMBERED - * is set. - * Matches on a specific ill if match_ill is set. + * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). */ -ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +static ipif_t * +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, + zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; @@ -5788,7 +5877,8 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } GRAB_CONN_LOCK(q); @@ -5817,10 +5907,12 @@ repeat: } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); if (error != NULL) @@ -5894,11 +5986,40 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) } /* + * Lookup an ipif with the specified address. For point-to-point links we + * look for matches on either the destination address or the local address, + * but we skip the local address check if IPIF_UNNUMBERED is set. If the + * `match_ill' argument is non-NULL, the lookup is restricted to that ill + * (or illgrp if `match_ill' is in an IPMP group). 
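/*
 * The match test used by ipif_lookup_addr_common() above, restated over
 * simplified stand-ins for ill_t and its IPMP group linkage: a candidate ill
 * is acceptable when no match_ill was supplied, when it is the match_ill
 * itself, or -- with illgrp matching enabled -- when both ills belong to the
 * same (non-NULL) ill group.  The types and helper name are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_illgrp;

typedef struct toy_ill {
        struct toy_illgrp *ill_grp;     /* NULL when not in an IPMP group */
} toy_ill_t;

bool
ill_matches(const toy_ill_t *ill, const toy_ill_t *match_ill,
    bool match_illgrp)
{
        if (match_ill == NULL || ill == match_ill)
                return (true);

        /* Same-group test, in the spirit of IS_IN_SAME_ILLGRP(). */
        return (match_illgrp && ill->ill_grp != NULL &&
            ill->ill_grp == match_ill->ill_grp);
}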
+ */ +ipif_t * +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, + mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, + func, error, ipst)); +} + +/* + * Special abbreviated version of ipif_lookup_addr() that doesn't match + * `match_ill' across the IPMP group. This function is only needed in some + * corner-cases; almost everything should use ipif_lookup_addr(). + */ +static ipif_t * +ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) +{ + ASSERT(match_ill != NULL); + return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, + NULL, NULL, NULL, NULL, ipst)); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED * is set. - * Matches on a specific ill if match_ill is set. + * If the `match_ill' argument is non-NULL, the lookup is restricted to that + * ill (or illgrp if `match_ill' is in an IPMP group). * Return the zoneid for the ipif which matches. ALL_ZONES if no match. */ zoneid_t @@ -5918,7 +6039,8 @@ ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) repeat: ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (match_ill != NULL && ill != match_ill) { + if (match_ill != NULL && ill != match_ill && + !IS_IN_SAME_ILLGRP(ill, match_ill)) { continue; } mutex_enter(&ill->ill_lock); @@ -6008,7 +6130,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) /* * The callers of this function wants to know the * interface on which they have to send the replies - * back. For IRE_CACHES that have ire_stq and ire_ipif + * back. For IREs that have ire_stq and ire_ipif * derived from different ills, we really don't care * what we return here. */ @@ -6109,30 +6231,6 @@ ipif_is_freeable(ipif_t *ipif) } /* - * This func does not prevent refcnt from increasing. But if - * the caller has taken steps to that effect, then this func - * can be used to determine whether the ipifs marked with IPIF_MOVING - * have become quiescent and can be moved in a failover/failback. - */ -static ipif_t * -ill_quiescent_to_move(ill_t *ill) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_state_flags & IPIF_MOVING) { - if (ipif->ipif_refcnt != 0 || - !IPIF_DOWN_OK(ipif)) { - return (ipif); - } - } - } - return (NULL); -} - -/* * The ipif/ill/ire has been refreled. Do the tail processing. * Determine if the ipif or ill in question has become quiescent and if so * wakeup close and/or restart any queued pending ioctl that is waiting @@ -6144,87 +6242,61 @@ ipif_ill_refrele_tail(ill_t *ill) mblk_t *mp; conn_t *connp; ipsq_t *ipsq; + ipxop_t *ipx; ipif_t *ipif; dl_notify_ind_t *dlindp; ASSERT(MUTEX_HELD(&ill->ill_lock)); - if ((ill->ill_state_flags & ILL_CONDEMNED) && - ill_is_freeable(ill)) { - /* ill_close may be waiting */ + if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { + /* ip_modclose() may be waiting */ cv_broadcast(&ill->ill_cv); } - /* ipsq can't change because ill_lock is held */ ipsq = ill->ill_phyint->phyint_ipsq; - if (ipsq->ipsq_waitfor == 0) { - /* Not waiting for anything, just return. 
*/ - mutex_exit(&ill->ill_lock); - return; - } - ASSERT(ipsq->ipsq_pending_mp != NULL && - ipsq->ipsq_pending_ipif != NULL); - /* - * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. - * Last ipif going down needs to down the ill, so ill_ire_cnt must - * be zero for restarting an ioctl that ends up downing the ill. - */ - ipif = ipsq->ipsq_pending_ipif; - if (ipif->ipif_ill != ill) { - /* The ioctl is pending on some other ill. */ - mutex_exit(&ill->ill_lock); - return; - } + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ + goto unlock; + + ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); + + ipif = ipx->ipx_pending_ipif; + if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ + goto unlock; - switch (ipsq->ipsq_waitfor) { + switch (ipx->ipx_waitfor) { case IPIF_DOWN: - if (!ipif_is_quiescent(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_quiescent(ipif)) + goto unlock; break; case IPIF_FREE: - if (!ipif_is_freeable(ipif)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ipif_is_freeable(ipif)) + goto unlock; break; - case ILL_DOWN: - if (!ill_is_quiescent(ill)) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_quiescent(ill)) + goto unlock; break; case ILL_FREE: /* - * case ILL_FREE arises only for loopback. otherwise ill_delete - * waits synchronously in ip_close, and no message is queued in - * ipsq_pending_mp at all in this case + * ILL_FREE is only for loopback; normal ill teardown waits + * synchronously in ip_modclose() without using ipx_waitfor, + * handled by the cv_broadcast() at the top of this function. */ - if (!ill_is_freeable(ill)) { - mutex_exit(&ill->ill_lock); - return; - } - break; - - case ILL_MOVE_OK: - if (ill_quiescent_to_move(ill) != NULL) { - mutex_exit(&ill->ill_lock); - return; - } + if (!ill_is_freeable(ill)) + goto unlock; break; default: - cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", - (void *)ipsq, ipsq->ipsq_waitfor); + cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", + (void *)ipsq, ipx->ipx_waitfor); } - /* - * Incr refcnt for the qwriter_ip call below which - * does a refrele - */ - ill_refhold_locked(ill); + ill_refhold_locked(ill); /* for qwriter_ip() call below */ + mutex_exit(&ipx->ipx_lock); mp = ipsq_pending_mp_get(ipsq, &connp); + mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); ASSERT(mp != NULL); @@ -6249,6 +6321,7 @@ ipif_ill_refrele_tail(ill_t *ill) return; default: ASSERT(0); + ill_refrele(ill); } break; @@ -6268,6 +6341,11 @@ ipif_ill_refrele_tail(ill_t *ill) cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " "db_type %d\n", (void *)mp, mp->b_datap->db_type); } + return; +unlock: + mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ill->ill_lock); } #ifdef DEBUG @@ -6902,10 +6980,23 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ipif = ipif_arg; if (ipif_arg != NULL) match_flags |= MATCH_IRE_ILL; +again: gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + if (gw_ire == NULL) { + /* + * With IPMP, we allow host routes to influence in.mpathd's + * target selection. However, if the test addresses are on + * their own network, the above lookup will fail since the + * underlying IRE_INTERFACEs are marked hidden. So allow + * hidden test IREs to be found and try again. 
+ */ + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + goto again; + } return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -7355,9 +7446,11 @@ void ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ill_t *pending_ill) { - conn_t *connp = NULL; + conn_t *connp; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + ASSERT(MUTEX_HELD(&ipx->ipx_lock)); ASSERT(func != NULL); mp->b_queue = q; @@ -7366,14 +7459,14 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, switch (type) { case CUR_OP: - if (ipsq->ipsq_mptail != NULL) { - ASSERT(ipsq->ipsq_mphead != NULL); - ipsq->ipsq_mptail->b_next = mp; + if (ipx->ipx_mptail != NULL) { + ASSERT(ipx->ipx_mphead != NULL); + ipx->ipx_mptail->b_next = mp; } else { - ASSERT(ipsq->ipsq_mphead == NULL); - ipsq->ipsq_mphead = mp; + ASSERT(ipx->ipx_mphead == NULL); + ipx->ipx_mphead = mp; } - ipsq->ipsq_mptail = mp; + ipx->ipx_mptail = mp; break; case NEW_OP: @@ -7385,6 +7478,15 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, ipsq->ipsq_xopq_mphead = mp; } ipsq->ipsq_xopq_mptail = mp; + ipx->ipx_ipsq_queued = B_TRUE; + break; + + case SWITCH_OP: + ASSERT(ipsq->ipsq_swxop != NULL); + /* only one switch operation is currently allowed */ + ASSERT(ipsq->ipsq_switch_mp == NULL); + ipsq->ipsq_switch_mp = mp; + ipx->ipx_ipsq_queued = B_TRUE; break; default: cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); @@ -7392,55 +7494,273 @@ ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, if (CONN_Q(q) && pending_ill != NULL) { connp = Q_TO_CONN(q); - ASSERT(MUTEX_HELD(&connp->conn_lock)); connp->conn_oper_pending_ill = pending_ill; } } /* - * Return the mp at the head of the ipsq. After emptying the ipsq - * look at the next ioctl, if this ioctl is complete. Otherwise - * return, we will resume when we complete the current ioctl. - * The current ioctl will wait till it gets a response from the - * driver below. + * Dequeue the next message that requested exclusive access to this IPSQ's + * xop. Specifically: + * + * 1. If we're still processing the current operation on `ipsq', then + * dequeue the next message for the operation (from ipx_mphead), or + * return NULL if there are no queued messages for the operation. + * These messages are queued via CUR_OP to qwriter_ip() and friends. + * + * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is + * not set) see if the ipsq has requested an xop switch. If so, switch + * `ipsq' to a different xop. Xop switches only happen when joining or + * leaving IPMP groups and require a careful dance -- see the comments + * in-line below for details. If we're leaving a group xop or if we're + * joining a group xop and become writer on it, then we proceed to (3). + * Otherwise, we return NULL and exit the xop. + * + * 3. For each IPSQ in the xop, return any switch operation stored on + * ipsq_switch_mp (set via SWITCH_OP); these must be processed before + * any other messages queued on the IPSQ. Otherwise, dequeue the next + * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. + * Note that if the phyint tied to `ipsq' is not using IPMP there will + * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for + * each phyint in the group, including the IPMP meta-interface phyint. 
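/*
 * A toy model of the dequeue order just described, with bare pointers in
 * place of mblk chains and all locking omitted; the xop-switch dance of step
 * (2) is left out entirely.  Messages for the current operation are drained
 * first; once the operation is done, each IPSQ on the xop's ring is offered
 * its pending switch request before any newly queued exclusive operation.
 * Every name below is illustrative, not a kernel type.
 */
#include <stddef.h>

typedef struct toy_msg {
        struct toy_msg *next;
} toy_msg_t;

typedef struct toy_ipsq {
        struct toy_ipsq *next;          /* circular list of IPSQs in the xop */
        toy_msg_t *switch_mp;           /* queued via SWITCH_OP */
        toy_msg_t *xopq_head;           /* queued via NEW_OP */
} toy_ipsq_t;

typedef struct toy_xop {
        toy_msg_t *mphead;              /* messages for the current operation */
        int op_in_progress;             /* ipx_current_ipif != NULL analogue */
} toy_xop_t;

toy_msg_t *
toy_dq(toy_xop_t *ipx, toy_ipsq_t *ipsq)
{
        toy_msg_t *mp;
        toy_ipsq_t *cur = ipsq;

        if ((mp = ipx->mphead) != NULL) {       /* step 1: CUR_OP messages */
                ipx->mphead = mp->next;
                return (mp);
        }
        if (ipx->op_in_progress)                /* current op not yet done */
                return (NULL);

        do {                                    /* step 3: walk the ring */
                if ((mp = cur->switch_mp) != NULL) {
                        cur->switch_mp = NULL;
                        return (mp);
                }
                if ((mp = cur->xopq_head) != NULL) {
                        cur->xopq_head = mp->next;
                        return (mp);
                }
        } while ((cur = cur->next) != ipsq);

        return (NULL);
}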
*/ static mblk_t * ipsq_dq(ipsq_t *ipsq) { + ill_t *illv4, *illv6; mblk_t *mp; + ipsq_t *xopipsq; + ipsq_t *leftipsq = NULL; + ipxop_t *ipx; + phyint_t *phyi = ipsq->ipsq_phyint; + ip_stack_t *ipst = ipsq->ipsq_ipst; + boolean_t emptied = B_FALSE; - ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); + /* + * Grab all the locks we need in the defined order (ill_g_lock -> + * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. + */ + rw_enter(&ipst->ips_ill_g_lock, + ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER); + mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); - mp = ipsq->ipsq_mphead; - if (mp != NULL) { - ipsq->ipsq_mphead = mp->b_next; - if (ipsq->ipsq_mphead == NULL) - ipsq->ipsq_mptail = NULL; - mp->b_next = NULL; - return (mp); + /* + * Dequeue the next message associated with the current exclusive + * operation, if any. + */ + if ((mp = ipx->ipx_mphead) != NULL) { + ipx->ipx_mphead = mp->b_next; + if (ipx->ipx_mphead == NULL) + ipx->ipx_mptail = NULL; + mp->b_next = (void *)ipsq; + goto out; } - if (ipsq->ipsq_current_ipif != NULL) - return (NULL); - mp = ipsq->ipsq_xopq_mphead; - if (mp != NULL) { - ipsq->ipsq_xopq_mphead = mp->b_next; - if (ipsq->ipsq_xopq_mphead == NULL) - ipsq->ipsq_xopq_mptail = NULL; - mp->b_next = NULL; - return (mp); + + if (ipx->ipx_current_ipif != NULL) + goto empty; + + if (ipsq->ipsq_swxop != NULL) { + /* + * The exclusive operation that is now being completed has + * requested a switch to a different xop. This happens + * when an interface joins or leaves an IPMP group. Joins + * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). + * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb + * (phyint_free()), or interface plumb for an ill type + * not in the IPMP group (ip_rput_dlpi_writer()). + * + * Xop switches are not allowed on the IPMP meta-interface. + */ + ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); + ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); + DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); + + if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { + /* + * We're switching back to our own xop, so we have two + * xop's to drain/exit: our own, and the group xop + * that we are leaving. + * + * First, pull ourselves out of the group ipsq list. + * This is safe since we're writer on ill_g_lock. + */ + ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); + + xopipsq = ipx->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq->ipsq_next; + ipsq->ipsq_next = ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, prepare to exit the group xop. The actual + * ipsq_exit() is done at the end of this function + * since we cannot hold any locks across ipsq_exit(). + * Note that although we drop the group's ipx_lock, no + * threads can proceed since we're still ipx_writer. + */ + leftipsq = xopipsq; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our own xop (which was + * inactive and therefore can be entered). + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + ASSERT(ipx->ipx_writer == NULL); + ASSERT(ipx->ipx_current_ipif == NULL); + } else { + /* + * We're switching from our own xop to a group xop. + * The requestor of the switch must ensure that the + * group xop cannot go away (e.g. by ensuring the + * phyint associated with the xop cannot go away). + * + * If we can become writer on our new xop, then we'll + * do the drain. 
Otherwise, the current writer of our + * new xop will do the drain when it exits. + * + * First, splice ourselves into the group IPSQ list. + * This is safe since we're writer on ill_g_lock. + */ + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + + xopipsq = ipsq->ipsq_swxop->ipx_ipsq; + while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) + xopipsq = xopipsq->ipsq_next; + + xopipsq->ipsq_next = ipsq; + ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; + ipsq->ipsq_xop = ipsq->ipsq_swxop; + ipsq->ipsq_swxop = NULL; + + /* + * Second, exit our own xop, since it's now unused. + * This is safe since we've got the only reference. + */ + ASSERT(ipx->ipx_writer == curthread); + ipx->ipx_writer = NULL; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + mutex_exit(&ipx->ipx_lock); + + /* + * Third, set ipx to point to our new xop, and check + * if we can become writer on it. If we cannot, then + * the current writer will drain the IPSQ group when + * it exits. Our ipsq_xop is guaranteed to be stable + * because we're still holding ipsq_lock. + */ + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + if (ipx->ipx_writer != NULL || + ipx->ipx_current_ipif != NULL) { + goto out; + } + } + + /* + * Fourth, become writer on our new ipx before we continue + * with the drain. Note that we never dropped ipsq_lock + * above, so no other thread could've raced with us to + * become writer first. Also, we're holding ipx_lock, so + * no other thread can examine the ipx right now. + */ + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + VERIFY(ipx->ipx_reentry_cnt++ == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; +#ifdef DEBUG + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); +#endif } - return (NULL); + + xopipsq = ipsq; + do { + /* + * So that other operations operate on a consistent and + * complete phyint, a switch message on an IPSQ must be + * handled prior to any other operations on that IPSQ. + */ + if ((mp = xopipsq->ipsq_switch_mp) != NULL) { + xopipsq->ipsq_switch_mp = NULL; + ASSERT(mp->b_next == NULL); + mp->b_next = (void *)xopipsq; + goto out; + } + + if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { + xopipsq->ipsq_xopq_mphead = mp->b_next; + if (xopipsq->ipsq_xopq_mphead == NULL) + xopipsq->ipsq_xopq_mptail = NULL; + mp->b_next = (void *)xopipsq; + goto out; + } + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); +empty: + /* + * There are no messages. Further, we are holding ipx_lock, hence no + * new messages can end up on any IPSQ in the xop. + */ + ipx->ipx_writer = NULL; + ipx->ipx_forced = B_FALSE; + VERIFY(--ipx->ipx_reentry_cnt == 0); + ipx->ipx_ipsq_queued = B_FALSE; + emptied = B_TRUE; +#ifdef DEBUG + ipx->ipx_depth = 0; +#endif +out: + mutex_exit(&ipx->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + + /* + * If we completely emptied the xop, then wake up any threads waiting + * to enter any of the IPSQ's associated with it. + */ + if (emptied) { + xopipsq = ipsq; + do { + if ((phyi = xopipsq->ipsq_phyint) == NULL) + continue; + + illv4 = phyi->phyint_illv4; + illv6 = phyi->phyint_illv6; + + GRAB_ILL_LOCKS(illv4, illv6); + if (illv4 != NULL) + cv_broadcast(&illv4->ill_cv); + if (illv6 != NULL) + cv_broadcast(&illv6->ill_cv); + RELEASE_ILL_LOCKS(illv4, illv6); + } while ((xopipsq = xopipsq->ipsq_next) != ipsq); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Now that all locks are dropped, exit the IPSQ we left. 
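/*
 * The pointer manipulation used above when an IPSQ leaves a group xop is an
 * unlink from a circular, singly-linked ring followed by pointing the node
 * back at itself.  A stand-alone sketch with a hypothetical node type, plus
 * a small check of the result:
 */
#include <assert.h>
#include <stddef.h>

typedef struct ring {
        struct ring *next;
} ring_t;

static void
ring_leave(ring_t *node)
{
        ring_t *prev = node->next;

        /* Walk around the ring until we find the node that points at us. */
        while (prev->next != node)
                prev = prev->next;

        prev->next = node->next;        /* splice ourselves out */
        node->next = node;              /* a ring of one, like a fresh IPSQ */
}

int
main(void)
{
        ring_t a, b, c;

        a.next = &b;
        b.next = &c;
        c.next = &a;

        ring_leave(&b);

        assert(b.next == &b);                   /* b is now on its own */
        assert(a.next == &c && c.next == &a);   /* a and c still form a ring */
        return (0);
}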
+ */ + if (leftipsq != NULL) + ipsq_exit(leftipsq); + + return (mp); } /* * Enter the ipsq corresponding to ill, by waiting synchronously till * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq * will have to drain completely before ipsq_enter returns success. - * ipsq_current_ipif will be set if some exclusive ioctl is in progress, - * and the ipsq_exit logic will start the next enqueued ioctl after - * completion of the current ioctl. If 'force' is used, we don't wait - * for the enqueued ioctls. This is needed when a conn_close wants to + * ipx_current_ipif will be set if some exclusive op is in progress, + * and the ipsq_exit logic will start the next enqueued op after + * completion of the current op. If 'force' is used, we don't wait + * for the enqueued ops. This is needed when a conn_close wants to * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb * of an ill can also use this option. But we dont' use it currently. */ @@ -7449,13 +7769,16 @@ boolean_t ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; + ipxop_t *ipx; boolean_t waited_enough = B_FALSE; /* - * Holding the ill_lock prevents <ill-ipsq> assocs from changing. - * Since the <ill-ipsq> assocs could change while we wait for the - * writer, it is easier to wait on a fixed global rather than try to - * cv_wait on a changing ipsq. + * Note that the relationship between ill and ipsq is fixed as long as + * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the + * relationship between the IPSQ and xop cannot change. However, + * since we cannot hold ipsq_lock across the cv_wait(), it may change + * while we're waiting. We wait on ill_cv and rely on ipsq_exit() + * waking up all ills in the xop when it becomes available. */ mutex_enter(&ill->ill_lock); for (;;) { @@ -7466,34 +7789,35 @@ ipsq_enter(ill_t *ill, boolean_t force, int type) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_writer == NULL && - (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || - waited_enough)) { + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); + + if (ipx->ipx_writer == NULL && (type == CUR_OP || + ipx->ipx_current_ipif == NULL || waited_enough)) break; - } else if (ipsq->ipsq_writer != NULL) { + + if (!force || ipx->ipx_writer != NULL) { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); cv_wait(&ill->ill_cv, &ill->ill_lock); } else { + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); - if (force) { - (void) cv_timedwait(&ill->ill_cv, - &ill->ill_lock, - lbolt + ENTER_SQ_WAIT_TICKS); - waited_enough = B_TRUE; - continue; - } else { - cv_wait(&ill->ill_cv, &ill->ill_lock); - } + (void) cv_timedwait(&ill->ill_cv, + &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); + waited_enough = B_TRUE; } } - ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); - ASSERT(ipsq->ipsq_reentry_cnt == 0); - ipsq->ipsq_writer = curthread; - ipsq->ipsq_reentry_cnt++; + ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); + ASSERT(ipx->ipx_reentry_cnt == 0); + ipx->ipx_writer = curthread; + ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); + ipx->ipx_reentry_cnt++; #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); return (B_TRUE); @@ -7513,14 +7837,13 @@ ill_perim_exit(ill_t *ill) /* * The ipsq_t (ipsq) is the synchronization data structure used to 
serialize - * certain critical operations like plumbing (i.e. most set ioctls), - * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP - * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per - * IPMP group. The ipsq serializes exclusive ioctls issued by applications - * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple - * threads executing in the ipsq. Responses from the driver pertain to the - * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated - * as part of bringing up the interface) and are enqueued in ipsq_mphead. + * certain critical operations like plumbing (i.e. most set ioctls), multicast + * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * serializes exclusive ioctls issued by applications on a per ipsq basis in + * ipsq_xopq_mphead. It also protects against multiple threads executing in + * the ipsq. Responses from the driver pertain to the current ioctl (say a + * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing + * up the interface) and are enqueued in ipx_mphead. * * If a thread does not want to reenter the ipsq when it is already writer, * it must make sure that the specified reentry point to be called later @@ -7528,29 +7851,33 @@ ill_perim_exit(ill_t *ill) * point must never ever try to enter the ipsq again. Otherwise it can lead * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. * When the thread that is currently exclusive finishes, it (ipsq_exit) - * dequeues the requests waiting to become exclusive in ipsq_mphead and calls - * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit + * dequeues the requests waiting to become exclusive in ipx_mphead and calls + * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp_ or the driver or could be waiting for - * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp - * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the + * a response from another module (arp or the driver or could be waiting for + * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp + * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless - * ipsq_current_ipif is clear which happens only on ioctl completion. + * ipx_current_ipif is NULL which happens only once the ioctl is complete and + * all associated DLPI operations have completed. */ /* - * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of - * ipif or ill can be specified). The caller ensures ipif or ill is valid by - * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued - * completion. + * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' + * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ + * on success, or NULL on failure. The caller ensures ipif/ill is valid by + * refholding it as necessary. If the IPSQ cannot be entered and `func' is + * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ + * can be entered. If `func' is NULL, then `q' and `mp' are ignored. 
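/*
 * The admission test that ipsq_try_enter() below spells out, restated over a
 * toy xop: enter when the caller is already the writer and re-entry is
 * allowed, or when there is no writer and either the request is part of the
 * current operation (CUR_OP) or it is a new operation arriving at an idle
 * xop with nothing queued.  The structure and names are placeholders, not
 * the kernel's.
 */
#include <stdbool.h>
#include <pthread.h>

typedef struct toy_xop {
        bool has_writer;
        pthread_t writer;
        bool queued;                    /* anything on the exclusive queue? */
        bool op_in_progress;            /* ipx_current_ipif != NULL analogue */
} toy_xop_t;

enum { TOY_CUR_OP, TOY_NEW_OP };

bool
toy_can_enter(const toy_xop_t *ipx, int type, bool reentry_ok)
{
        if (ipx->has_writer && pthread_equal(ipx->writer, pthread_self()))
                return (reentry_ok);
        if (ipx->has_writer)
                return (false);
        if (type == TOY_CUR_OP)
                return (true);
        return (!ipx->queued && !ipx->op_in_progress);
}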
*/ ipsq_t * ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, boolean_t reentry_ok) { ipsq_t *ipsq; + ipxop_t *ipx; /* Only 1 of ipif or ill can be specified */ ASSERT((ipif != NULL) ^ (ill != NULL)); @@ -7558,13 +7885,15 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; /* - * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock - * ipsq of an ill can't change when ill_lock is held. + * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. + * ipx of an ipsq can't change when ipsq_lock is held. */ GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + ipx = ipsq->ipsq_xop; + mutex_enter(&ipx->ipx_lock); /* * 1. Enter the ipsq if we are already writer and reentry is ok. @@ -7572,30 +7901,32 @@ ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, * 'func' nor any of its callees must ever attempt to enter the ipsq * again. Otherwise it can lead to an infinite loop * 2. Enter the ipsq if there is no current writer and this attempted - * entry is part of the current ioctl or operation + * entry is part of the current operation * 3. Enter the ipsq if there is no current writer and this is a new - * ioctl (or operation) and the ioctl (or operation) queue is - * empty and there is no ioctl (or operation) currently in progress + * operation and the operation queue is empty and there is no + * operation currently in progress */ - if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || - (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && - ipsq->ipsq_current_ipif == NULL))) || - (ipsq->ipsq_writer == curthread && reentry_ok)) { + if ((ipx->ipx_writer == curthread && reentry_ok) || + (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && + !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) { /* Success. */ - ipsq->ipsq_reentry_cnt++; - ipsq->ipsq_writer = curthread; + ipx->ipx_reentry_cnt++; + ipx->ipx_writer = curthread; + ipx->ipx_forced = B_FALSE; + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); #ifdef DEBUG - ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, - IPSQ_STACK_DEPTH); + ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); #endif return (ipsq); } - ipsq_enq(ipsq, q, mp, func, type, ill); + if (func != NULL) + ipsq_enq(ipsq, q, mp, func, type, ill); + mutex_exit(&ipx->ipx_lock); mutex_exit(&ipsq->ipsq_lock); mutex_exit(&ill->ill_lock); RELEASE_CONN_LOCK(q); @@ -7630,188 +7961,58 @@ qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, } /* - * If there are more than ILL_GRP_CNT ills in a group, - * we use kmem alloc'd buffers, else use the stack - */ -#define ILL_GRP_CNT 14 -/* - * Drain the ipsq, if there are messages on it, and then leave the ipsq. - * Called by a thread that is currently exclusive on this ipsq. + * Exit the specified IPSQ. If this is the final exit on it then drain it + * prior to exiting. Caller must be writer on the specified IPSQ. 
*/ void ipsq_exit(ipsq_t *ipsq) { + mblk_t *mp; + ipsq_t *mp_ipsq; queue_t *q; - mblk_t *mp; - ipsq_func_t func; - int next; - ill_t **ill_list = NULL; - size_t ill_list_size = 0; - int cnt = 0; - boolean_t need_ipsq_free = B_FALSE; - ip_stack_t *ipst = ipsq->ipsq_ipst; + phyint_t *phyi; + ipsq_func_t func; ASSERT(IAM_WRITER_IPSQ(ipsq)); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_reentry_cnt >= 1); - if (ipsq->ipsq_reentry_cnt != 1) { - ipsq->ipsq_reentry_cnt--; - mutex_exit(&ipsq->ipsq_lock); + + ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); + if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { + ipsq->ipsq_xop->ipx_reentry_cnt--; return; } - mp = ipsq_dq(ipsq); - while (mp != NULL) { -again: - mutex_exit(&ipsq->ipsq_lock); - func = (ipsq_func_t)mp->b_prev; - q = (queue_t *)mp->b_queue; - mp->b_prev = NULL; - mp->b_queue = NULL; - - /* - * If 'q' is an conn queue, it is valid, since we did a - * a refhold on the connp, at the start of the ioctl. - * If 'q' is an ill queue, it is valid, since close of an - * ill will clean up the 'ipsq'. - */ - (*func)(ipsq, q, mp, NULL); - - mutex_enter(&ipsq->ipsq_lock); + for (;;) { + phyi = ipsq->ipsq_phyint; mp = ipsq_dq(ipsq); - } - - mutex_exit(&ipsq->ipsq_lock); - - /* - * Need to grab the locks in the right order. Need to - * atomically check (under ipsq_lock) that there are no - * messages before relinquishing the ipsq. Also need to - * atomically wakeup waiters on ill_cv while holding ill_lock. - * Holding ill_g_lock ensures that ipsq list of ills is stable. - * If we need to call ill_split_ipsq and change <ill-ipsq> we need - * to grab ill_g_lock as writer. - */ - rw_enter(&ipst->ips_ill_g_lock, - ipsq->ipsq_split ? RW_WRITER : RW_READER); + mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; - /* ipsq_refs can't change while ill_g_lock is held as reader */ - if (ipsq->ipsq_refs != 0) { - /* At most 2 ills v4/v6 per phyint */ - cnt = ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); /* - * If memory allocation fails, we will do the split - * the next time ipsq_exit is called for whatever reason. - * As long as the ipsq_split flag is set the need to - * split is remembered. + * If we've changed to a new IPSQ, and the phyint associated + * with the old one has gone away, free the old IPSQ. Note + * that this cannot happen while the IPSQ is in a group. */ - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list != NULL) - cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); - } - mutex_enter(&ipsq->ipsq_lock); - mp = ipsq_dq(ipsq); - if (mp != NULL) { - /* oops, some message has landed up, we can't get out */ - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != NULL) - kmem_free(ill_list, ill_list_size); - ill_list = NULL; - ill_list_size = 0; - cnt = 0; - goto again; - } + if (mp_ipsq != ipsq && phyi == NULL) { + ASSERT(ipsq->ipsq_next == ipsq); + ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); + ipsq_delete(ipsq); + } - /* - * Split only if no ioctl is pending and if memory alloc succeeded - * above. - */ - if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && - ill_list != NULL) { - /* - * No new ill can join this ipsq since we are holding the - * ill_g_lock. Hence ill_split_ipsq can safely traverse the - * ipsq. ill_split_ipsq may fail due to memory shortage. - * If so we will retry on the next ipsq_exit. 
- */ - ipsq->ipsq_split = ill_split_ipsq(ipsq); - } + if (mp == NULL) + break; - /* - * We are holding the ipsq lock, hence no new messages can - * land up on the ipsq, and there are no messages currently. - * Now safe to get out. Wake up waiters and relinquish ipsq - * atomically while holding ill locks. - */ - ipsq->ipsq_writer = NULL; - ipsq->ipsq_reentry_cnt--; - ASSERT(ipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - ipsq->ipsq_depth = 0; -#endif - mutex_exit(&ipsq->ipsq_lock); - /* - * For IPMP this should wake up all ills in this ipsq. - * We need to hold the ill_lock while waking up waiters to - * avoid missed wakeups. But there is no need to acquire all - * the ill locks and then wakeup. If we have not acquired all - * the locks (due to memory failure above) ill_signal_ipsq_ills - * wakes up ills one at a time after getting the right ill_lock - */ - ill_signal_ipsq_ills(ipsq, ill_list != NULL); - if (ill_list != NULL) - ill_unlock_ills(ill_list, cnt); - if (ipsq->ipsq_refs == 0) - need_ipsq_free = B_TRUE; - rw_exit(&ipst->ips_ill_g_lock); - if (ill_list != 0) - kmem_free(ill_list, ill_list_size); + q = mp->b_queue; + func = (ipsq_func_t)mp->b_prev; + ipsq = mp_ipsq; + mp->b_next = mp->b_prev = NULL; + mp->b_queue = NULL; - if (need_ipsq_free) { /* - * Free the ipsq. ipsq_refs can't increase because ipsq can't be - * looked up. ipsq can be looked up only thru ill or phyint - * and there are no ills/phyint on this ipsq. + * If 'q' is an conn queue, it is valid, since we did a + * a refhold on the conn at the start of the ioctl. + * If 'q' is an ill queue, it is valid, since close of an + * ill will clean up its IPSQ. */ - ipsq_delete(ipsq); - } - - /* - * Now that we're outside the IPSQ, start any IGMP/MLD timers. We - * can't start these inside the IPSQ since e.g. igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for an executing timeout to - * finish) could deadlock with igmp_timeout_handler() -> ipsq_enter() - * (executing the timeout, waiting to get inside the IPSQ). - * - * However, there is one exception to the above: if this thread *is* - * the IGMP/MLD timeout handler thread, then we must not start its - * timer until the current handler is done. 
- */ - mutex_enter(&ipst->ips_igmp_timer_lock); - if (curthread != ipst->ips_igmp_timer_thread) { - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_igmp_timer_lock); - } - - mutex_enter(&ipst->ips_mld_timer_lock); - if (curthread != ipst->ips_mld_timer_thread) { - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - - if (next != INFINITY) - mld_start_timers(next, ipst); - } else { - mutex_exit(&ipst->ips_mld_timer_lock); + (*func)(ipsq, q, mp, NULL); } } @@ -7822,15 +8023,17 @@ again: void ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) { + ipxop_t *ipx = ipsq->ipsq_xop; + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(ipx->ipx_current_ipif == NULL); + ASSERT(ipx->ipx_current_ioctl == 0); - mutex_enter(&ipsq->ipsq_lock); - ASSERT(ipsq->ipsq_current_ipif == NULL); - ASSERT(ipsq->ipsq_current_ioctl == 0); - ipsq->ipsq_current_done = B_FALSE; - ipsq->ipsq_current_ipif = ipif; - ipsq->ipsq_current_ioctl = ioccmd; - mutex_exit(&ipsq->ipsq_lock); + ipx->ipx_current_done = B_FALSE; + ipx->ipx_current_ioctl = ioccmd; + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = ipif; + mutex_exit(&ipx->ipx_lock); } /* @@ -7844,17 +8047,18 @@ ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) void ipsq_current_finish(ipsq_t *ipsq) { - ipif_t *ipif = ipsq->ipsq_current_ipif; + ipxop_t *ipx = ipsq->ipsq_xop; t_uscalar_t dlpi_pending = DL_PRIM_INVAL; + ipif_t *ipif = ipx->ipx_current_ipif; ASSERT(IAM_WRITER_IPSQ(ipsq)); /* - * For SIOCSLIFREMOVEIF, the ipif has been already been blown away + * For SIOCLIFREMOVEIF, the ipif has been already been blown away * (but in that case, IPIF_CHANGING will already be clear and no * pending DLPI messages can remain). */ - if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { + if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { ill_t *ill = ipif->ipif_ill; mutex_enter(&ill->ill_lock); @@ -7863,12 +8067,14 @@ ipsq_current_finish(ipsq_t *ipsq) mutex_exit(&ill->ill_lock); } - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ioctl = 0; - ipsq->ipsq_current_done = B_TRUE; - if (dlpi_pending == DL_PRIM_INVAL) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); + ASSERT(!ipx->ipx_current_done); + ipx->ipx_current_done = B_TRUE; + ipx->ipx_current_ioctl = 0; + if (dlpi_pending == DL_PRIM_INVAL) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } } /* @@ -7884,123 +8090,38 @@ ipsq_flush(ill_t *ill) mblk_t *prev; mblk_t *mp; mblk_t *mp_next; - ipsq_t *ipsq; + ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; ASSERT(IAM_WRITER_ILL(ill)); - ipsq = ill->ill_phyint->phyint_ipsq; + /* * Flush any messages sent up by the driver. 
*/ - mutex_enter(&ipsq->ipsq_lock); - for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { + mutex_enter(&ipx->ipx_lock); + for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { mp_next = mp->b_next; q = mp->b_queue; if (q == ill->ill_rq || q == ill->ill_wq) { - /* Remove the mp from the ipsq */ + /* dequeue mp */ if (prev == NULL) - ipsq->ipsq_mphead = mp->b_next; + ipx->ipx_mphead = mp->b_next; else prev->b_next = mp->b_next; - if (ipsq->ipsq_mptail == mp) { + if (ipx->ipx_mptail == mp) { ASSERT(mp_next == NULL); - ipsq->ipsq_mptail = prev; + ipx->ipx_mptail = prev; } inet_freemsg(mp); } else { prev = mp; } } - mutex_exit(&ipsq->ipsq_lock); + mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); ill_pending_mp_cleanup(ill); } -/* ARGSUSED */ -int -ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ill_t *ill; - struct lifreq *lifr = (struct lifreq *)ifreq; - boolean_t isv6; - conn_t *connp; - ip_stack_t *ipst; - - connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - isv6 = connp->conn_af_isv6; - /* - * Set original index. - * Failover and failback move logical interfaces - * from one physical interface to another. The - * original index indicates the parent of a logical - * interface, in other words, the physical interface - * the logical interface will be moved back to on - * failback. - */ - - /* - * Don't allow the original index to be changed - * for non-failover addresses, autoconfigured - * addresses, or IPv6 link local addresses. - */ - if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || - (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { - return (EINVAL); - } - /* - * The new original index must be in use by some - * physical interface. - */ - ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, - NULL, NULL, ipst); - if (ill == NULL) - return (ENXIO); - ill_refrele(ill); - - ipif->ipif_orig_ifindex = lifr->lifr_index; - /* - * When this ipif gets failed back, don't - * preserve the original id, as it is no - * longer applicable. - */ - ipif->ipif_orig_ipifid = 0; - /* - * For IPv4, change the original index of any - * multicast addresses associated with the - * ipif to the new value. - */ - if (!isv6) { - ilm_t *ilm; - - mutex_enter(&ipif->ipif_ill->ill_lock); - for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - ilm->ilm_orig_ifindex = lifr->lifr_index; - } - } - mutex_exit(&ipif->ipif_ill->ill_lock); - } - return (0); -} - -/* ARGSUSED */ -int -ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) -{ - struct lifreq *lifr = (struct lifreq *)ifreq; - - /* - * Get the original interface index i.e the one - * before FAILOVER if it ever happened. - */ - lifr->lifr_index = ipif->ipif_orig_ifindex; - return (0); -} - /* * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, * refhold and return the associated ipif @@ -8087,8 +8208,6 @@ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, cmd_info_t *ci, ipsq_func_t func) { - sin_t *sin; - sin6_t *sin6; char *name; struct ifreq *ifr; struct lifreq *lifr; @@ -8132,9 +8251,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * be trusted. 
*/ ifr->ifr_name[IFNAMSIZ - 1] = '\0'; - sin = (sin_t *)&ifr->ifr_addr; name = ifr->ifr_name; - ci->ci_sin = sin; + ci->ci_sin = (sin_t *)&ifr->ifr_addr; ci->ci_sin6 = NULL; ci->ci_lifr = (struct lifreq *)ifr; } else { @@ -8148,14 +8266,8 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, */ lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; name = lifr->lifr_name; - sin = (sin_t *)&lifr->lifr_addr; - sin6 = (sin6_t *)&lifr->lifr_addr; - if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { - (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, - LIFNAMSIZ); - } - ci->ci_sin = sin; - ci->ci_sin6 = sin6; + ci->ci_sin = (sin_t *)&lifr->lifr_addr; + ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; ci->ci_lifr = lifr; } @@ -8181,21 +8293,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) { if (err == EINPROGRESS) return (err); - if (ipip->ipi_cmd == SIOCLIFFAILOVER || - ipip->ipi_cmd == SIOCLIFFAILBACK) { - /* - * Need to try both v4 and v6 since this - * ioctl can come down either v4 or v6 - * socket. The lifreq.lifr_family passed - * down by this ioctl is AF_UNSPEC. - */ - ipif = ipif_lookup_on_name(name, - mi_strlen(name), B_FALSE, &exists, !isv6, - zoneid, (connp == NULL) ? q : - CONNP_TO_WQ(connp), mp, func, &err, ipst); - if (err == EINPROGRESS) - return (err); - } err = 0; /* Ensure we don't use it below */ } } @@ -8221,15 +8318,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); - /* - * Allow only GET operations if this ipif has been created - * temporarily due to a MOVE operation. - */ - if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { - ipif_refrele(ipif); - return (EINVAL); - } - ci->ci_ipif = ipif; return (0); } @@ -8247,15 +8335,15 @@ ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); - - while (ill != NULL) { + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_zoneid == zoneid || ipif->ipif_zoneid == ALL_ZONES) numifs++; } - ill = ill_next(&ctx, ill); } rw_exit(&ipst->ips_ill_g_lock); return (numifs); @@ -8283,6 +8371,9 @@ ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) ill = ILL_START_WALK_ALL(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8491,6 +8582,8 @@ ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (zoneid != ipif->ipif_zoneid && @@ -8760,6 +8853,9 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ill_first(list, list, &ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) + continue; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_NOXMIT) && @@ -8795,6 +8891,7 @@ ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); + 
lifr->lifr_type = ill->ill_type; if (ipif->ipif_isv6) { sin6 = (sin6_t *)&lifr->lifr_addr; *sin6 = sin6_null; @@ -8828,23 +8925,6 @@ lif_copydone: return (0); } -/* ARGSUSED */ -int -ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, - queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) -{ - ip_stack_t *ipst; - - if (q->q_next == NULL) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); - - /* Existence of b_cont->b_cont checked in ip_wput_nondata */ - ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; - return (0); -} - static void ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) { @@ -9038,8 +9118,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); } else { src_ipif = ipif_select_source_v6(dst_ill, - daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, - zoneid); + daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); } if (src_ipif == NULL) goto next_dst; @@ -9325,10 +9404,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct arpreq *ar; struct xarpreq *xar; int flags, alength; - char *lladdr; - ip_stack_t *ipst; + uchar_t *lladdr; + ire_t *ire; + ip_stack_t *ipst; ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; boolean_t if_arp_ioctl = B_FALSE; + boolean_t proxyarp = B_FALSE; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); @@ -9340,7 +9423,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = NULL; flags = xar->xarp_flags; - lladdr = LLADDR(&xar->xarp_ha); + lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); /* * Validate against user's link layer address length @@ -9359,7 +9442,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, xar = NULL; flags = ar->arp_flags; - lladdr = ar->arp_ha.sa_data; + lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link * layer type this operation is trying to deal with. By @@ -9379,6 +9462,51 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } + ipaddr = sin->sin_addr.s_addr; + + /* + * IPMP ARP special handling: + * + * 1. Since ARP mappings must appear consistent across the group, + * prohibit changing ARP mappings on the underlying interfaces. + * + * 2. Since ARP mappings for IPMP data addresses are maintained by + * IP itself, prohibit changing them. + * + * 3. For proxy ARP, use a functioning hardware address in the group, + * provided one exists. If one doesn't, just add the entry as-is; + * ipmp_illgrp_refresh_arpent() will refresh it if things change. 
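/*
 * Rule (3) above in miniature: when a set request on the IPMP meta-interface
 * names the hardware address of a group member, the entry is treated as
 * proxy ARP, and if that member is not currently active the hardware address
 * of some active member is borrowed instead.  The toy types and the
 * grp_find_by_lladdr()/grp_next_active() helpers stand in for the kernel's
 * ipmp_illgrp_find_ill()/ipmp_illgrp_next_ill() and are not real interfaces.
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct toy_ill {
        bool active;
        unsigned char phys_addr[6];
} toy_ill_t;

typedef struct toy_grp {
        toy_ill_t *ills;
        size_t nills;
} toy_grp_t;

static toy_ill_t *
grp_find_by_lladdr(toy_grp_t *grp, const unsigned char *lladdr, size_t alen)
{
        size_t i;

        for (i = 0; i < grp->nills; i++) {
                if (alen == sizeof (grp->ills[i].phys_addr) &&
                    memcmp(grp->ills[i].phys_addr, lladdr, alen) == 0)
                        return (&grp->ills[i]);
        }
        return (NULL);
}

static toy_ill_t *
grp_next_active(toy_grp_t *grp)
{
        size_t i;

        for (i = 0; i < grp->nills; i++) {
                if (grp->ills[i].active)
                        return (&grp->ills[i]);
        }
        return (NULL);
}

const unsigned char *
proxy_arp_lladdr(toy_grp_t *grp, const unsigned char *lladdr, size_t alen,
    bool *proxyarp)
{
        toy_ill_t *ill = grp_find_by_lladdr(grp, lladdr, alen);

        *proxyarp = (ill != NULL);
        if (ill != NULL && !ill->active)
                ill = grp_next_active(grp);     /* may be NULL */

        /* With no active member at all, keep the caller's address as-is. */
        return (ill != NULL ? ill->phys_addr : lladdr);
}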
+ */ + if (IS_UNDER_IPMP(ill)) { + if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) + return (EPERM); + } + if (IS_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + switch (ipip->ipi_cmd) { + case SIOCSARP: + case SIOCSXARP: + proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); + if (proxy_ill != NULL) { + proxyarp = B_TRUE; + if (!ipmp_ill_is_active(proxy_ill)) + proxy_ill = ipmp_illgrp_next_ill(illg); + if (proxy_ill != NULL) + lladdr = proxy_ill->ill_phys_addr; + } + /* FALLTHRU */ + case SIOCDARP: + case SIOCDXARP: + ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + if (ire != NULL) { + ire_refrele(ire); + return (EPERM); + } + } + } + /* * We are going to pass up to ARP a packet chain that looks * like: @@ -9400,8 +9528,6 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENOMEM); } - ipaddr = sin->sin_addr.s_addr; - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, (caddr_t)&ipaddr); if (mp2 == NULL) { @@ -9481,6 +9607,30 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, area->area_flags |= ACE_F_AUTHORITY; /* + * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it + * so that IP can update ARP as the active ills in the group change. + */ + if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && + (area->area_flags & ACE_F_PERMANENT)) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); + + /* + * The second part of the conditional below handles a corner + * case: if this is proxy ARP and the IPMP group has no active + * interfaces, we can't send the request to ARP now since it + * won't be able to build an ACE. So we return success and + * notify ARP about the proxy ARP entry once an interface + * becomes active. + */ + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + mp2->b_cont = NULL; + inet_freemsg(mp1); + inet_freemsg(pending_mp); + return (entp == NULL ? ENOMEM : 0); + } + } + + /* * Before sending 'mp' to ARP, we have to clear the b_next * and b_prev. Otherwise if STREAMS encounters such a message * in freemsg(), (because ARP can close any time) it can cause @@ -9497,7 +9647,12 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); /* conn has not yet started closing, hence this can't fail */ - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (ipip->ipi_flags & IPI_WR) { + VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), + pending_mp, 0) != 0); + } else { + VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + } mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); @@ -9506,6 +9661,13 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. */ putnext(ill->ill_rq, mp1); + + /* + * If we created an IPMP ARP entry, mark that we've notified ARP. + */ + if (entp != NULL) + ipmp_illgrp_mark_arpent(ill->ill_grp, entp); + return (EINPROGRESS); } @@ -9564,55 +9726,114 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, mp, func, &err, ipst); if (ipif == NULL) return (err); - if (ipif->ipif_id != 0 || - ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); } } else { /* - * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == - * 0: use the IP address to figure out the ill. 
In the IPMP - * case, a simple forwarding table lookup will return the - * IRE_IF_RESOLVER for the first interface in the group, which - * might not be the interface on which the requested IP - * address was resolved due to the ill selection algorithm - * (see ip_newroute_get_dst_ill()). So we do a cache table - * lookup first: if the IRE cache entry for the IP address is - * still there, it will contain the ill pointer for the right - * interface, so we use that. If the cache entry has been - * flushed, we fall back to the forwarding table lookup. This - * should be rare enough since IRE cache entries have a longer - * life expectancy than ARP cache entries. + * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen + * of 0: use the IP address to find the ipif. If the IP + * address is an IPMP test address, ire_ftable_lookup() will + * find the wrong ill, so we first do an ipif_lookup_addr(). */ - ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, - ipst); - if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || - ((ill = ire_to_ill(ire)) == NULL) || - (ill->ill_net_type != IRE_IF_RESOLVER)) { - if (ire != NULL) - ire_refrele(ire); - ire = ire_ftable_lookup(sin->sin_addr.s_addr, - 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_TYPE, ipst); + ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, + CONNP_TO_WQ(connp), mp, func, &err, ipst); + if (ipif == NULL) { + ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, + IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, + MATCH_IRE_TYPE, ipst); if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { - if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ipif = ill->ill_ipif; + ipif_refhold(ipif); + ire_refrele(ire); } - ASSERT(ire != NULL && ill != NULL); - ipif = ill->ill_ipif; - ipif_refhold(ipif); - ire_refrele(ire); } + + if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + ipif_refrele(ipif); + return (ENXIO); + } + ci->ci_sin = sin; ci->ci_ipif = ipif; return (0); } /* + * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the + * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is + * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it + * up and thus an ill can join that illgrp. + * + * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than + * open()/close() primarily because close() is not allowed to fail or block + * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason + * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure + * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the + * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts + * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent + * state if I_UNLINK didn't occur. + * + * Note that for each plumb/unplumb operation, we may end up here more than + * once because of the way ifconfig works. However, it's OK to link the same + * illgrp more than once, or unlink an illgrp that's already unlinked. 
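The last paragraph of the comment above is the key contract: linking is idempotent and so is unlinking, because ifconfig may drive each plumb or unplumb through this path more than once. A small sketch of that contract follows, using simplified stand-ins (illgrp_t here is not the kernel's ipmp_illgrp_t, and the EBUSY check models the up-ipif test that the real I_PUNLINK path performs).

/* Illustrative sketch only; simplified types, not the kernel's. */
#include <errno.h>
#include <stddef.h>

typedef struct grp grp_t;		/* opaque group object */

typedef struct illgrp {
	grp_t	*ig_grp;		/* NULL while unlinked */
	int	ig_up_count;		/* resources that pin the link */
} illgrp_t;

/* Linking twice to the same group is harmless. */
int
illgrp_link(illgrp_t *ig, grp_t *grp)
{
	ig->ig_grp = grp;
	return (0);
}

/* Unlinking an already-unlinked group is a no-op; busy groups refuse. */
int
illgrp_unlink(illgrp_t *ig)
{
	if (ig->ig_up_count > 0)
		return (EBUSY);
	ig->ig_grp = NULL;
	return (0);
}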
+ */ +static int +ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) +{ + int err; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IS_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + + switch (ioccmd) { + case I_LINK: + return (ENOTSUP); + + case I_PLINK: + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); + rw_exit(&ipst->ips_ipmp_lock); + break; + + case I_PUNLINK: + /* + * Require all UP ipifs be brought down prior to unlinking the + * illgrp so any associated IREs (and other state) is torched. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EBUSY); + + /* + * NOTE: We hold ipmp_lock across the unlink to prevent a race + * with an SIOCSLIFGROUPNAME request from an ill trying to + * join this group. Specifically: ills trying to join grab + * ipmp_lock and bump a "pending join" counter checked by + * ipmp_illgrp_unlink_grp(). During the unlink no new pending + * joins can occur (since we have ipmp_lock). Once we drop + * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not + * find the illgrp (since we unlinked it) and will return + * EAFNOSUPPORT. This will then take them back through the + * IPMP meta-interface plumbing logic in ifconfig, and thus + * back through I_PLINK above. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + err = ipmp_illgrp_unlink_grp(ill->ill_grp); + rw_exit(&ipst->ips_ipmp_lock); + return (err); + default: + break; + } + return (0); +} + +/* * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also * atomically set/clear the muxids. Also complete the ioctl by acking or * naking it. Note that the code is structured such that the link type, @@ -9697,7 +9918,7 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) { ill_refrele(ill); return; @@ -9728,6 +9949,11 @@ ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = EINVAL; goto done; } + + if (IS_IPMP(ill) && + (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + ill->ill_arp_muxid = islink ? li->l_index : 0; } else { /* @@ -9763,6 +9989,7 @@ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct linkblk *li, boolean_t doconsist) { + int err = 0; ill_t *ill; queue_t *ipwq, *dwq; const char *name; @@ -9796,7 +10023,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_TRUE); + NEW_OP, B_FALSE); if (ipsq == NULL) return (EINPROGRESS); entered_ipsq = B_TRUE; @@ -9811,12 +10038,14 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, */ if ((islink && ill->ill_ip_muxid != 0) || (!islink && ill->ill_arp_muxid != 0)) { - if (entered_ipsq) - ipsq_exit(ipsq); - return (EINVAL); + err = EINVAL; + goto done; } } + if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) + goto done; + /* * As part of I_{P}LINKing, stash the number of downstream modules and * the read queue of the module immediately below IP in the ill. 
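Both hunks in this area replace scattered early returns with a single err/goto done exit path (the done: label itself lands in the next hunk), so that an IPSQ entered via ipsq_try_enter() is always exited exactly once. A compact user-space illustration of that single-exit cleanup idiom, with hypothetical names:

/* Illustrative sketch of the single-exit cleanup idiom; not kernel code. */
#include <errno.h>
#include <stdbool.h>

static bool serializer_held;

static void enter_serializer(void) { serializer_held = true; }
static void exit_serializer(void)  { serializer_held = false; }

int
do_ioctl(bool already_serialized, bool bad_request)
{
	int err = 0;
	bool entered = false;

	if (!already_serialized) {
		enter_serializer();
		entered = true;
	}

	if (bad_request) {
		err = EINVAL;
		goto done;		/* cleanup happens in one place */
	}

	/* ... the real work would go here ... */

done:
	if (entered)
		exit_serializer();
	return (err);
}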
@@ -9853,11 +10082,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill_capability_reset(ill, B_FALSE); } ipsq_current_finish(ipsq); - +done: if (entered_ipsq) ipsq_exit(ipsq); - return (0); + return (err); } /* @@ -10124,8 +10353,9 @@ nak: } /* ip_wput hands off ARP IOCTL responses to us */ +/* ARGSUSED3 */ void -ip_sioctl_iocack(queue_t *q, mblk_t *mp) +ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { struct arpreq *ar; struct xarpreq *xar; @@ -10136,7 +10366,6 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) struct iocblk *orig_iocp; ill_t *ill; conn_t *connp = NULL; - uint_t ioc_id; mblk_t *pending_mp; int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; int *flagsp; @@ -10146,6 +10375,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) int err; ip_stack_t *ipst; + ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); ill = q->q_ptr; ASSERT(ill != NULL); ipst = ill->ill_ipst; @@ -10185,10 +10415,14 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) iocp = (struct iocblk *)mp->b_rptr; /* - * Pick out the originating queue based on the ioc_id. + * Find the pending message; if we're exclusive, it'll be on our IPSQ. + * Otherwise, we can find it from our ioc_id. */ - ioc_id = iocp->ioc_id; - pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); + if (ipsq != NULL) + pending_mp = ipsq_pending_mp_get(ipsq, &connp); + else + pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); + if (pending_mp == NULL) { ASSERT(connp == NULL); inet_freemsg(mp); @@ -10271,7 +10505,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) ire_refrele(ire); freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, NULL); + EINVAL, NO_COPYOUT, ipsq); return; } *flagsp |= ATF_COM; @@ -10297,12 +10531,27 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) /* Ditch the internal IOCTL. */ freemsg(mp); ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); return; } } /* + * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE + * on the IPMP meta-interface, ensure any ARP entries added in + * ip_sioctl_arp() are deleted. + */ + if (IS_IPMP(ill) && + ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || + ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { + ipmp_illgrp_t *illg = ill->ill_grp; + ipmp_arpent_t *entp; + + if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + /* * Delete the coresponding IRE_CACHE if any. * Reset the error if there was one (in case there was no entry * in arp.) @@ -10341,7 +10590,7 @@ errack: if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { err = iocp->ioc_error; freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); return; } @@ -10355,7 +10604,7 @@ errack: sizeof (xar->xarp_ha.sdl_data)) { freemsg(mp); ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - NULL); + ipsq); return; } } @@ -10382,7 +10631,7 @@ errack: /* Ditch the internal IOCTL. */ freemsg(mp); /* Complete the original. */ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); + ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10397,7 +10646,7 @@ errack: * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. * - * Executed as a writer on the ill or ill group. + * Executed as a writer on the ill. 
* So no lock is needed to traverse the ipif chain, or examine the * phyint flags. */ @@ -10423,7 +10672,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t found_sep = B_FALSE; conn_t *connp; zoneid_t zoneid; - int orig_ifindex = 0; ip_stack_t *ipst = CONNQ_TO_IPST(q); ASSERT(q->q_next == NULL); @@ -10513,61 +10761,10 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (ipsq == NULL) return (EINPROGRESS); - /* - * If the interface is failed, inactive or offlined, look for a working - * interface in the ill group and create the ipif there. If we can't - * find a good interface, create the ipif anyway so that in.mpathd can - * move it to the first repaired interface. - */ - if ((ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ill->ill_phyint->phyint_groupname_len != 0) { - phyint_t *phyi; - char *groupname = ill->ill_phyint->phyint_groupname; - - /* - * We're looking for a working interface, but it doesn't matter - * if it's up or down; so instead of following the group lists, - * we look at each physical interface and compare the groupname. - * We're only interested in interfaces with IPv4 (resp. IPv6) - * plumbed when we're adding an IPv4 (resp. IPv6) ipif. - * Otherwise we create the ipif on the failed interface. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = avl_first(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list-> - phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && - !(phyi->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : - (phyi->phyint_illv4 != NULL))) { - break; - } - } - rw_exit(&ipst->ips_ill_g_lock); - - if (phyi != NULL) { - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ill = (ill->ill_isv6 ? phyi->phyint_illv6 : - phyi->phyint_illv4); - } - } - - /* - * We are now exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ + /* We are now exclusive on the IPSQ */ ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (found_sep && orig_ifindex == 0) { + if (found_sep) { /* Now see if there is an IPIF with this unit number. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -10580,14 +10777,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * We use IRE_LOCAL for lo0:1 etc. for "receive only" use - * of lo0. We never come here when we plumb lo0:0. It - * happens in ipif_lookup_on_name. - * The specified unit number is ignored when we create the ipif on a - * different interface. However, we save it in ipif_orig_ipifid below so - * that the ipif fails back to the right position. - */ - if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? - id : -1, IRE_LOCAL, B_TRUE)) == NULL) { + * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() + * instead. + */ + if ((ipif = ipif_allocate(ill, found_sep ? 
id : -1, IRE_LOCAL, + B_TRUE, B_TRUE)) == NULL) { err = ENOBUFS; goto done; } @@ -10604,14 +10798,6 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); } - /* Set ifindex and unit number for failback */ - if (err == 0 && orig_ifindex != 0) { - ipif->ipif_orig_ifindex = orig_ifindex; - if (found_sep) { - ipif->ipif_orig_ipifid = id; - } - } - done: ipsq_exit(ipsq); return (err); @@ -10672,7 +10858,6 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_delete(ill); mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); - ASSERT(ill->ill_group == NULL); /* Are any references to this ill active */ if (ill_is_freeable(ill)) { @@ -10693,14 +10878,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - /* - * We are exclusive on the ipsq, so an ill move will be serialized - * before or after us. - */ - ASSERT(ill->ill_move_in_progress == B_FALSE); - if (ipif->ipif_id == 0) { - ipsq_t *ipsq; /* Find based on address */ @@ -10712,35 +10890,15 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, sin6 = (sin6_t *)sin; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. - */ - ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, - NULL, ALL_ZONES, NULL, NULL, NULL, NULL, - ipst); - } + ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, + ipst); } else { - ipaddr_t addr; - if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); - addr = sin->sin_addr.s_addr; /* We are a writer, so we should be able to lookup */ - ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, - NULL, NULL, NULL, ipst); - if (ipif == NULL) { - /* - * Maybe the address in on another interface in - * the same IPMP group? We check this below. - */ - ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst); - } + ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, + ipst); } if (ipif == NULL) { return (EADDRNOTAVAIL); @@ -10750,32 +10908,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * It is possible for a user to send an SIOCLIFREMOVEIF with * lifr_name of the physical interface but with an ip address * lifr_addr of a logical interface plumbed over it. - * So update ipsq_current_ipif once ipif points to the - * correct interface after doing ipif_lookup_addr(). + * So update ipx_current_ipif now that ipif points to the + * correct one. */ ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; - ASSERT(ipsq != NULL); - - mutex_enter(&ipsq->ipsq_lock); - ipsq->ipsq_current_ipif = ipif; - mutex_exit(&ipsq->ipsq_lock); - - /* - * When the address to be removed is hosted on a different - * interface, we check if the interface is in the same IPMP - * group as the specified one; if so we proceed with the - * removal. - * ill->ill_group is NULL when the ill is down, so we have to - * compare the group names instead. 
- */ - if (ipif->ipif_ill != ill && - (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || - ill->ill_phyint->phyint_groupname_len == 0 || - mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, - ill->ill_phyint->phyint_groupname) != 0)) { - ipif_refrele(ipif); - return (EADDRNOTAVAIL); - } + ipsq->ipsq_xop->ipx_current_ipif = ipif; /* This is a writer */ ipif_refrele(ipif); @@ -11072,7 +11209,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -11272,9 +11409,9 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_arp_down(ipif); + ipif_resolver_down(ipif); + return (err); } @@ -11323,144 +11460,8 @@ ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * part of ipmp, make this func return the active/inactive state and - * caller can set once atomically instead of multiple mutex_enter/mutex_exit - */ -/* - * This function either sets or clears the IFF_INACTIVE flag. - * - * As long as there are some addresses or multicast memberships on the - * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we - * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface - * will be used for outbound packets. - * - * Caller needs to verify the validity of setting IFF_INACTIVE. - */ -static void -phyint_inactive(phyint_t *phyi) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - ilm_t *ilm; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - - /* - * No need for a lock while traversing the list since iam - * a writer - */ - if (ill_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v4->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - if (ill_v6 != NULL) { - ill_v6 = phyi->phyint_illv6; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - for (ilm = ill_v6->ill_ilm; ilm != NULL; - ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags &= ~PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); - return; - } - } - } - mutex_enter(&phyi->phyint_lock); - phyi->phyint_flags |= PHYI_INACTIVE; - mutex_exit(&phyi->phyint_lock); -} - -/* - * This function is called only when the phyint flags change. Currently - * called from ip_sioctl_flags. We re-do the broadcast nomination so - * that we can select a good ill. - */ -static void -ip_redo_nomination(phyint_t *phyi) -{ - ill_t *ill_v4; - - ill_v4 = phyi->phyint_illv4; - - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ASSERT(IAM_WRITER_ILL(ill_v4)); - if (ill_v4->ill_group->illgrp_ill_count > 1) - ill_nominate_bcast_rcv(ill_v4->ill_group); - } -} - -/* - * Heuristic to check if ill is INACTIVE. 
- * Checks if ill has an ipif with an usable ip address. - * - * Return values: - * B_TRUE - ill is INACTIVE; has no usable ipif - * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif - */ -static boolean_t -ill_is_inactive(ill_t *ill) -{ - ipif_t *ipif; - - /* Check whether it is in an IPMP group */ - if (ill->ill_phyint->phyint_groupname == NULL) - return (B_FALSE); - - if (ill->ill_ipif_up_count == 0) - return (B_TRUE); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - uint64_t flags = ipif->ipif_flags; - - /* - * This ipif is usable if it is IPIF_UP and not a - * dedicated test address. A dedicated test address - * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED - * (note in particular that V6 test addresses are - * link-local data addresses and thus are marked - * IPIF_NOFAILOVER but not IPIF_DEPRECATED). - */ - if ((flags & IPIF_UP) && - ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != - (IPIF_DEPRECATED|IPIF_NOFAILOVER))) - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Set interface flags. - * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, - * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, - * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. + * Set interface flags. Many flags require special handling (e.g., + * bringing the interface down); see below for details. * * NOTE : We really don't enforce that ipif_id zero should be used * for setting any flags other than IFF_LOGINT_FLAGS. This @@ -11478,17 +11479,16 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { uint64_t turn_on; uint64_t turn_off; - int err; + int err = 0; phyint_t *phyi; ill_t *ill; - uint64_t intf_flags; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; uint64_t flags; struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; boolean_t zero_source = B_FALSE; - ip_stack_t *ipst; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11497,11 +11497,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill = ipif->ipif_ill; phyi = ill->ill_phyint; - ipst = ill->ill_ipst; if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11524,25 +11523,60 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, flags |= intf_flags & ~0xFFFF; /* - * First check which bits will change and then which will - * go on and off + * Explicitly fail attempts to change flags that are always invalid on + * an IPMP meta-interface. */ - turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; - if (!turn_on) + if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) + return (EINVAL); + + /* + * Check which flags will change; silently ignore flags which userland + * is not allowed to control. (Because these flags may change between + * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's + * control, we need to silently ignore them rather than fail.) 
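The flag arithmetic that follows is compact: it computes which bits the request actually changes, masks off the ones userland may not touch, and splits the remainder into bits to set and bits to clear. A self-contained sketch of the same computation (the flag values in main() are arbitrary examples, not real IFF_* bits):

/* Illustrative sketch of the flag-delta arithmetic used below. */
#include <stdint.h>
#include <stdio.h>

static void
flag_delta(uint64_t requested, uint64_t current, uint64_t cantchange,
    uint64_t *turn_on, uint64_t *turn_off)
{
	*turn_on = (requested ^ current) & ~cantchange; /* bits that differ */
	*turn_off = current & *turn_on;			/* ...and are set now */
	*turn_on ^= *turn_off;				/* ...vs. clear now */
}

int
main(void)
{
	uint64_t on, off;

	/* current has 0x5, the request asks for 0x3, bit 0x4 is off-limits */
	flag_delta(0x3, 0x5, 0x4, &on, &off);
	printf("turn_on=%llx turn_off=%llx\n",
	    (unsigned long long)on, (unsigned long long)off);
	/* prints turn_on=2 turn_off=0: only bit 0x2 may be set */
	return (0);
}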
+ */ + cantchange_flags = IFF_CANTCHANGE; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + + turn_on = (flags ^ intf_flags) & ~cantchange_flags; + if (turn_on == 0) return (0); /* No change */ turn_off = intf_flags & turn_on; turn_on ^= turn_off; - err = 0; /* - * Don't allow any bits belonging to the logical interface - * to be set or cleared on the replacement ipif that was - * created temporarily during a MOVE. + * All test addresses must be IFF_DEPRECATED (to ensure source address + * selection avoids them) -- so force IFF_DEPRECATED on, and do not + * allow it to be turned off. */ - if (ipif->ipif_replace_zero && - ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { + if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && + (turn_on|intf_flags) & IFF_NOFAILOVER) return (EINVAL); + + if (turn_on & IFF_NOFAILOVER) { + turn_on |= IFF_DEPRECATED; + flags |= IFF_DEPRECATED; + } + + /* + * On underlying interfaces, only allow applications to manage test + * addresses -- otherwise, they may get confused when the address + * moves as part of being brought up. Likewise, prevent an + * application-managed test address from being converted to a data + * address. To prevent migration of administratively up addresses in + * the kernel, we don't allow them to be converted either. + */ + if (IS_UNDER_IPMP(ill)) { + const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; + + if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) + return (EINVAL); + + if ((turn_off & IFF_NOFAILOVER) && + (flags & (appflags | IFF_UP | IFF_DUPLICATE))) + return (EINVAL); } /* @@ -11583,16 +11617,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. No need to grab ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (turn_on & PHYI_STANDBY && - ipif->ipif_ill->ill_usesrc_grp_next != NULL) { - return (EINVAL); - } - - /* * If we modify physical interface flags, we'll potentially need to * send up two routing socket messages for the changes (one for the * IPv4 ill, and another for the IPv6 ill). Note that here. @@ -11601,98 +11625,44 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, phyint_flags_modified = B_TRUE; /* - * If we are setting or clearing FAILED or STANDBY or OFFLINE, - * we need to flush the IRE_CACHES belonging to this ill. - * We handle this case here without doing the DOWN/UP dance - * like it is done for other flags. If some other flags are - * being turned on/off with FAILED/STANDBY/OFFLINE, the code - * below will handle it by bringing it down and then - * bringing it UP. + * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE + * (otherwise, we'd immediately use them, defeating standby). Also, + * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not + * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already + * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We + * also don't allow PHYI_STANDBY if VNI is enabled since its semantics + * will not be honored. */ - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { - ill_t *ill_v4, *ill_v6; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; - + if (turn_on & PHYI_STANDBY) { /* - * First set the INACTIVE flag if needed. Then delete the ires. - * ire_add will atomically prevent creating new IRE_CACHEs - * unless hidden flag is set. 
- * PHYI_FAILED and PHYI_INACTIVE are exclusive + * No need to grab ill_g_usesrc_lock here; see the + * synchronization notes in ip.c. */ - if ((turn_on & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - !ipst->ips_ipmp_enable_failback)) { - /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } - if ((turn_off & PHYI_FAILED) && - ((intf_flags & PHYI_STANDBY) || - (!ipst->ips_ipmp_enable_failback && - ill_is_inactive(ill)))) { - phyint_inactive(phyi); - } - - if (turn_on & PHYI_STANDBY) { - /* - * We implicitly set INACTIVE only when STANDBY is set. - * INACTIVE is also set on non-STANDBY phyint when user - * disables FAILBACK using configuration file. - * Do not allow STANDBY to be set on such INACTIVE - * phyint - */ - if (phyi->phyint_flags & PHYI_INACTIVE) - return (EINVAL); - if (!(phyi->phyint_flags & PHYI_FAILED)) - phyint_inactive(phyi); - } - if (turn_off & PHYI_STANDBY) { - if (ipst->ips_ipmp_enable_failback) { - /* - * Reset PHYI_INACTIVE. - */ - phyi->phyint_flags &= ~PHYI_INACTIVE; - } else if (ill_is_inactive(ill) && - !(phyi->phyint_flags & PHYI_FAILED)) { - /* - * Need to set INACTIVE, when user sets - * STANDBY on a non-STANDBY phyint and - * later resets STANDBY - */ - phyint_inactive(phyi); - } + if (ill->ill_usesrc_grp_next != NULL || + intf_flags & PHYI_INACTIVE) + return (EINVAL); + if (!(flags & PHYI_FAILED)) { + flags |= PHYI_INACTIVE; + turn_on |= PHYI_INACTIVE; } - /* - * We should always send up a message so that the - * daemons come to know of it. Note that the zeroth - * interface can be down and the check below for IPIF_UP - * will not make sense as we are actually setting - * a phyint flag here. We assume that the ipif used - * is always the zeroth ipif. (ip_rts_ifmsg does not - * send up any message for non-zero ipifs). - */ - phyint_flags_modified = B_TRUE; + } - if (ill_v4 != NULL) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v4, ill_v4); - illgrp_reset_schednext(ill_v4); - } - if (ill_v6 != NULL) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_stq_cache_delete, - (char *)ill_v6, ill_v6); - illgrp_reset_schednext(ill_v6); - } + if (turn_off & PHYI_STANDBY) { + flags &= ~PHYI_INACTIVE; + turn_off |= PHYI_INACTIVE; } /* + * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both + * would end up on. + */ + if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == + (PHYI_FAILED | PHYI_INACTIVE)) + return (EINVAL); + + /* * If ILLF_ROUTER changes, we need to change the ip forwarding - * status of the interface and, if the interface is part of an IPMP - * group, all other interfaces that are part of the same IPMP - * group. + * status of the interface. */ if ((turn_on | turn_off) & ILLF_ROUTER) (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); @@ -11718,33 +11688,31 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_phyint->phyint_lock); /* - * We do the broadcast and nomination here rather - * than waiting for a FAILOVER/FAILBACK to happen. In - * the case of FAILBACK from INACTIVE standby to the - * interface that has been repaired, PHYI_FAILED has not - * been cleared yet. If there are only two interfaces in - * that group, all we have is a FAILED and INACTIVE - * interface. If we do the nomination soon after a failback, - * the broadcast nomination code would select the - * INACTIVE interface for receiving broadcasts as FAILED is - * not yet cleared. 
As we don't want STANDBY/INACTIVE to - * receive broadcast packets, we need to redo nomination - * when the FAILED is cleared here. Thus, in general we - * always do the nomination here for FAILED, STANDBY - * and OFFLINE. + * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the + * same to the kernel: if any of them has been set by + * userland, the interface cannot be used for data traffic. */ - if (((turn_on | turn_off) & - (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { - ip_redo_nomination(phyi); + if ((turn_on|turn_off) & + (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" + * IPMP group rather than a real group. In that case, + * there are no other interfaces in the group and thus + * no need to call ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); } + if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } return (0); @@ -11785,15 +11753,17 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * The only flag changes that we currently take specific action on - * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, - * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and - * IPIF_PREFERRED. This is done by bring the ipif down, changing - * the flags and bringing it back up again. + * The only flag changes that we currently take specific action on are + * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, + * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and + * IPIF_NOFAILOVER. This is done by bring the ipif down, changing the + * flags and bringing it back up again. For IPIF_NOFAILOVER, the act + * of bringing it back up will trigger the address to be moved. 
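As the comment above says, these flags are applied by taking the logical interface down, rewriting the flags, and bringing it back up (which, for IPIF_NOFAILOVER, is also what migrates the address). A minimal sketch of that quiesce/mutate/revive sequencing; the lif_t type, flag values, and helpers are hypothetical, not the kernel's:

/* Illustrative sketch only; hypothetical types and flag values. */
#include <stdint.h>
#include <stdbool.h>

#define	FLAG_UP		0x1
#define	NEEDS_DOWN_UP	0x6	/* stand-in for deprecated/nofailover-style bits */

typedef struct lif {
	uint64_t	lif_flags;
} lif_t;

static void lif_down(lif_t *lif) { lif->lif_flags &= ~FLAG_UP; }
static void lif_up(lif_t *lif)   { lif->lif_flags |= FLAG_UP; }

void
lif_apply_flags(lif_t *lif, uint64_t turn_on, uint64_t turn_off)
{
	bool was_up = (lif->lif_flags & FLAG_UP) != 0;
	bool bounce = ((turn_on | turn_off) & NEEDS_DOWN_UP) != 0;

	if (was_up && bounce)
		lif_down(lif);		/* quiesce before mutating */

	lif->lif_flags |= turn_on;
	lif->lif_flags &= ~turn_off;

	if (was_up && bounce)
		lif_up(lif);		/* republish with the new flags */
}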
*/ if ((turn_on|turn_off) & (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| - ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { + ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| + IPIF_NOFAILOVER)) { /* * Taking this ipif down, make sure we have * valid net and subnet bcast ire's for other @@ -11822,9 +11792,8 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) { ill_t *ill; phyint_t *phyi; - uint64_t turn_on; - uint64_t turn_off; - uint64_t intf_flags; + uint64_t turn_on, turn_off; + uint64_t intf_flags, cantchange_flags; boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; @@ -11839,12 +11808,15 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) phyi = ill->ill_phyint; intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; - turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); + cantchange_flags = IFF_CANTCHANGE | IFF_UP; + if (IS_IPMP(ill)) + cantchange_flags |= IFF_IPMP_CANTCHANGE; + turn_on = (flags ^ intf_flags) & ~cantchange_flags; turn_off = intf_flags & turn_on; turn_on ^= turn_off; - if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) + if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) phyint_flags_modified = B_TRUE; /* @@ -11870,9 +11842,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); - if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) - ip_redo_nomination(phyi); - if (set_linklocal) (void) ipif_setlinklocal(ipif); @@ -11881,12 +11850,29 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) else ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; + /* + * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to + * the kernel: if any of them has been set by userland, the interface + * cannot be used for data traffic. + */ + if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { + ASSERT(!IS_IPMP(ill)); + /* + * It's possible the ill is part of an "anonymous" IPMP group + * rather than a real group. In that case, there are no other + * interfaces in the group and thus no need for us to call + * ipmp_phyint_refresh_active(). + */ + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_refresh_active(phyi); + } + if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { /* * XXX ipif_up really does not know whether a phyint flags * was modified or not. So, it sends up information on * only one routing sockets message. As we don't bring up - * the interface and also set STANDBY/FAILED simultaneously + * the interface and also set PHYI_ flags simultaneously * it should be okay. */ err = ipif_up(ipif, q, mp); @@ -11898,14 +11884,14 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) if (phyint_flags_modified) { if (phyi->phyint_illv4 != NULL) { ip_rts_ifmsg(phyi->phyint_illv4-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } if (phyi->phyint_illv6 != NULL) { ip_rts_ifmsg(phyi->phyint_illv6-> - ill_ipif); + ill_ipif, RTSQ_DEFAULT); } } else { - ip_rts_ifmsg(ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); } /* * Update the flags in SCTP's IPIF list, ipif_up() will do @@ -12101,10 +12087,7 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * broadcast address makes sense. If it does, * there should be an IRE for it already. * Don't match on ipif, only on the ill - * since we are sharing these now. 
Don't use - * MATCH_IRE_ILL_GROUP as we are looking for - * the broadcast ire on this ill and each ill - * in the group has its own broadcast ire. + * since we are sharing these now. */ ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ALL_ZONES, NULL, @@ -12302,9 +12285,16 @@ int ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { - ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * Since no applications should ever be setting metrics on underlying + * interfaces, we explicitly fail to smoke 'em out. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + return (EINVAL); + /* * Set interface metric. We don't use this for * anything but we keep track of it in case it is @@ -12332,6 +12322,7 @@ ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* Get interface metric. */ ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr; @@ -12766,13 +12757,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, nipif->ipif_state_flags |= IPIF_CHANGING; } - mutex_exit(&ill->ill_lock); - if (lir->lir_maxmtu != 0) { ill->ill_max_mtu = lir->lir_maxmtu; - ill->ill_mtu_userspecified = 1; + ill->ill_user_mtu = lir->lir_maxmtu; mtu_walk = B_TRUE; } + mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12821,6 +12811,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); + /* + * Refresh IPMP meta-interface MTU if necessary. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_illgrp_refresh_mtu(ill->ill_grp); + return (0); } @@ -13032,13 +13028,117 @@ ipif_assign_seqid(ipif_t *ipif) } /* + * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are + * administratively down (i.e., no DAD), of the same type, and locked. Note + * that the clone is complete -- including the seqid -- and the expectation is + * that the caller will either free or overwrite `sipif' before it's unlocked. + */ +static void +ipif_clone(const ipif_t *sipif, ipif_t *dipif) +{ + ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); + ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); + ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); + ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); + ASSERT(sipif->ipif_arp_del_mp == NULL); + ASSERT(dipif->ipif_arp_del_mp == NULL); + ASSERT(sipif->ipif_igmp_rpt == NULL); + ASSERT(dipif->ipif_igmp_rpt == NULL); + ASSERT(sipif->ipif_multicast_up == 0); + ASSERT(dipif->ipif_multicast_up == 0); + ASSERT(sipif->ipif_joined_allhosts == 0); + ASSERT(dipif->ipif_joined_allhosts == 0); + + dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; + dipif->ipif_metric = sipif->ipif_metric; + dipif->ipif_zoneid = sipif->ipif_zoneid; + dipif->ipif_v6subnet = sipif->ipif_v6subnet; + dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; + dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; + dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; + dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; + dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; + + /* + * While dipif is down right now, it might've been up before. Since + * it's changing identity, its packet counters need to be reset. 
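ipif_clone() deliberately copies only configuration and zeroes the per-address packet counters, since the destination is taking on a new identity. A tiny sketch of that copy-then-reset split, with a hypothetical struct in place of ipif_t:

/* Illustrative sketch only; simplified stand-in for ipif_t. */
#include <stdint.h>

typedef struct ifaddr {
	uint32_t	ia_addr;	/* configuration: carried over */
	uint64_t	ia_flags;	/* configuration: carried over */
	uint64_t	ia_ib_pkts;	/* statistic: reset on clone */
	uint64_t	ia_ob_pkts;	/* statistic: reset on clone */
} ifaddr_t;

void
ifaddr_clone(const ifaddr_t *src, ifaddr_t *dst)
{
	dst->ia_addr = src->ia_addr;
	dst->ia_flags = src->ia_flags;
	dst->ia_ib_pkts = 0;
	dst->ia_ob_pkts = 0;
}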
+ */ + dipif->ipif_ib_pkt_count = 0; + dipif->ipif_ob_pkt_count = 0; + dipif->ipif_fo_pkt_count = 0; + + /* + * As per the comment atop the function, we assume that these sipif + * fields will be changed before sipif is unlocked. + */ + dipif->ipif_seqid = sipif->ipif_seqid; + dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; + dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; + dipif->ipif_state_flags = sipif->ipif_state_flags; +} + +/* + * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' + * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin + * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then + * transfer the xop to `dipif'. Requires that all ipifs are administratively + * down (i.e., no DAD), of the same type, and unlocked. + */ +static void +ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) +{ + ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; + int ipx_current_ioctl; + + ASSERT(sipif != dipif); + ASSERT(sipif != virgipif); + + /* + * Grab all of the locks that protect the ipif in a defined order. + */ + GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + if (sipif > dipif) { + mutex_enter(&sipif->ipif_saved_ire_lock); + mutex_enter(&dipif->ipif_saved_ire_lock); + } else { + mutex_enter(&dipif->ipif_saved_ire_lock); + mutex_enter(&sipif->ipif_saved_ire_lock); + } + + ipif_clone(sipif, dipif); + if (virgipif != NULL) { + ipif_clone(virgipif, sipif); + mi_free(virgipif); + } + + mutex_exit(&sipif->ipif_saved_ire_lock); + mutex_exit(&dipif->ipif_saved_ire_lock); + RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); + + /* + * Transfer ownership of the current xop, if necessary. + */ + if (ipsq->ipsq_xop->ipx_current_ipif == sipif) { + ASSERT(ipsq->ipsq_xop->ipx_pending_ipif == NULL); + ipx_current_ioctl = ipsq->ipsq_xop->ipx_current_ioctl; + ipsq_current_finish(ipsq); + ipsq_current_start(ipsq, dipif, ipx_current_ioctl); + } + + if (virgipif == NULL) + mi_free(sipif); +} + +/* * Insert the ipif, so that the list of ipifs on the ill will be sorted * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will * be inserted into the first space available in the list. The value of * ipif_id will then be set to the appropriate value for its position. */ static int -ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) +ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) { ill_t *ill; ipif_t *tipif; @@ -13056,12 +13156,11 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) /* * In the case of lo0:0 we already hold the ill_g_lock. * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> - * ipif_insert. Another such caller is ipif_move. + * ipif_insert. 
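The insertion rule described above (keep the list sorted by id, and treat an id of -1 as "assign the smallest unused id") can be sketched on its own; the node type and the max_ids bound are stand-ins for ipif_t and ip_addrs_per_if:

/* Illustrative sketch of sorted insertion with first-free-id assignment. */
#include <stddef.h>

typedef struct node {
	int		n_id;
	struct node	*n_next;
} node_t;

int
sorted_insert(node_t **headp, node_t *np, int max_ids)
{
	node_t **pp = headp;
	int id = np->n_id;

	if (id == -1) {			/* assign the smallest unused id */
		id = 0;
		while (*pp != NULL && (*pp)->n_id == id) {
			pp = &(*pp)->n_next;
			id++;
		}
		if (id >= max_ids)
			return (-1);	/* too many logical interfaces */
		np->n_id = id;
	} else {			/* find the slot for an explicit id */
		while (*pp != NULL && (*pp)->n_id < id)
			pp = &(*pp)->n_next;
		if (*pp != NULL && (*pp)->n_id == id)
			return (-1);	/* that id is already in use */
	}
	np->n_next = *pp;
	*pp = np;
	return (0);
}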
*/ if (acquire_g_lock) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_lock); id = ipif->ipif_id; tipifp = &(ill->ill_ipif); if (id == -1) { /* need to find a real id */ @@ -13075,8 +13174,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) } /* limit number of logical interfaces */ if (id >= ipst->ips_ip_addrs_per_if) { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13091,8 +13189,7 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) tipifp = &(tipif->ipif_next); } } else { - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); return (-1); @@ -13102,25 +13199,22 @@ ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) ipif->ipif_next = tipif; *tipifp = ipif; - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); if (acquire_g_lock) rw_exit(&ipst->ips_ill_g_lock); + return (0); } static void -ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) +ipif_remove(ipif_t *ipif) { ipif_t **ipifp; ill_t *ill = ipif->ipif_ill; ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); - if (acquire_ill_lock) - mutex_enter(&ill->ill_lock); - else - ASSERT(MUTEX_HELD(&ill->ill_lock)); + mutex_enter(&ill->ill_lock); ipifp = &ill->ill_ipif; for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { if (*ipifp == ipif) { @@ -13128,9 +13222,7 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) break; } } - - if (acquire_ill_lock) - mutex_exit(&ill->ill_lock); + mutex_exit(&ill->ill_lock); } /* @@ -13149,10 +13241,12 @@ ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) * second DL_INFO_ACK comes in from the driver. */ static ipif_t * -ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) +ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, + boolean_t insert) { ipif_t *ipif; - phyint_t *phyi; + phyint_t *phyi = ill->ill_phyint; + ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_allocate(%s:%d ill %p)\n", ill->ill_name, id, (void *)ill)); @@ -13175,23 +13269,61 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ipif->ipif_refcnt = 0; ipif->ipif_saved_ire_cnt = 0; - if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { - mi_free(ipif); - return (NULL); + if (insert) { + if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { + mi_free(ipif); + return (NULL); + } + /* -1 id should have been replaced by real id */ + id = ipif->ipif_id; + ASSERT(id >= 0); } - /* -1 id should have been replaced by real id */ - id = ipif->ipif_id; - ASSERT(id >= 0); if (ill->ill_name[0] != '\0') ipif_assign_seqid(ipif); /* - * Keep a copy of original id in ipif_orig_ipifid. Failback - * will attempt to restore the original id. The SIOCSLIFOINDEX - * ioctl sets ipif_orig_ipifid to zero. + * If this is ipif zero, configure ill/phyint-wide information. + * Defer most configuration until we're guaranteed we're attached. */ - ipif->ipif_orig_ipifid = id; + if (id == 0) { + if (ill->ill_mactype == SUNW_DL_IPMP) { + /* + * Set PHYI_IPMP and also set PHYI_FAILED since there + * are no active interfaces. Similarly, PHYI_RUNNING + * isn't set until the group has an active interface. 
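The zeroth-ipif special case above seeds the phyint-wide flags differently for the two kinds of interface; the decision itself is small enough to restate as a sketch (the flag values here are illustrative, not the kernel's PHYI_* constants):

/* Illustrative sketch only; flag values are not the kernel's. */
#include <stdint.h>
#include <stdbool.h>

#define	F_IPMP		0x1	/* interface is an IPMP meta-interface */
#define	F_FAILED	0x2	/* no usable interface (yet) */
#define	F_RUNNING	0x4	/* link is up */

uint64_t
initial_phyint_flags(bool is_ipmp_metainterface)
{
	/*
	 * An IPMP meta-interface starts FAILED because its group has no
	 * active members yet; RUNNING is only set once one appears.  An
	 * ordinary interface starts RUNNING, and later DLPI notifications
	 * may clear it.
	 */
	return (is_ipmp_metainterface ? (F_IPMP | F_FAILED) : F_RUNNING);
}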
+ */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= (PHYI_IPMP | PHYI_FAILED); + mutex_exit(&phyi->phyint_lock); + + /* + * Create the illgrp (which must not exist yet because + * the zeroth ipif is created once per ill). However, + * do not not link it to the ipmp_grp_t until I_PLINK + * is called; see ip_sioctl_plink_ipmp() for details. + */ + if (ipmp_illgrp_create(ill) == NULL) { + if (insert) { + rw_enter(&ipst->ips_ill_g_lock, + RW_WRITER); + ipif_remove(ipif); + rw_exit(&ipst->ips_ill_g_lock); + } + mi_free(ipif); + return (NULL); + } + } else { + /* + * By default, PHYI_RUNNING is set when the zeroth + * ipif is created. For other ipifs, we don't touch + * it since DLPI notifications may have changed it. + */ + mutex_enter(&phyi->phyint_lock); + phyi->phyint_flags |= PHYI_RUNNING; + mutex_exit(&phyi->phyint_lock); + } + } /* * We grab the ill_lock and phyint_lock to protect the flag changes. @@ -13199,18 +13331,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * ioctl completes and the IPIF_CHANGING flag is cleared. */ mutex_enter(&ill->ill_lock); - mutex_enter(&ill->ill_phyint->phyint_lock); - /* - * Set the running flag when logical interface zero is created. - * For subsequent logical interfaces, a DLPI link down - * notification message may have cleared the running flag to - * indicate the link is down, so we shouldn't just blindly set it. - */ - if (id == 0) - ill->ill_phyint->phyint_flags |= PHYI_RUNNING; + mutex_enter(&phyi->phyint_lock); + ipif->ipif_ire_type = ire_type; - phyi = ill->ill_phyint; - ipif->ipif_orig_ifindex = phyi->phyint_ifindex; if (ipif->ipif_isv6) { ill->ill_flags |= ILLF_IPV6; @@ -13238,14 +13361,18 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * Don't set the interface flags etc. now, will do it in * ip_ll_subnet_defaults. */ - if (!initialize) { - mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); - return (ipif); - } + if (!initialize) + goto out; + ipif->ipif_mtu = ill->ill_max_mtu; - if (ill->ill_bcast_addr_length != 0) { + /* + * NOTE: The IPMP meta-interface is special-cased because it starts + * with no underlying interfaces (and thus an unknown broadcast + * address length), but all interfaces that can be placed into an IPMP + * group are required to be broadcast-capable. + */ + if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { /* * Later detect lack of DLPI driver multicast * capability by catching DL_ENABMULTI errors in @@ -13269,8 +13396,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) ill->ill_flags |= ILLF_NOARP; } if (ill->ill_phys_addr_length == 0) { - if (ill->ill_media && - ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { + if (ill->ill_mactype == SUNW_DL_VNI) { ipif->ipif_flags |= IPIF_NOXMIT; phyi->phyint_flags |= PHYI_VIRTUAL; } else { @@ -13285,8 +13411,9 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) } } } +out: + mutex_exit(&phyi->phyint_lock); mutex_exit(&ill->ill_lock); - mutex_exit(&ill->ill_phyint->phyint_lock); return (ipif); } @@ -13300,34 +13427,49 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) * for details. 
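The function below follows a "last one out" rule: every address teardown removes that address's own mapping, but the interface-wide resolver state (the interface-down message and the multicast mappings) is only torn down once no addresses remain up or in duplicate-address probing. A sketch of that decision, with hypothetical names:

/* Illustrative sketch only; hypothetical types and action flags. */
#include <stdint.h>

#define	ACT_DEL_ADDR_MAPPING	0x1	/* always: remove this address */
#define	ACT_IF_DOWN		0x2	/* only when the last address goes */
#define	ACT_DEL_MCAST_MAPPINGS	0x4	/* only when the last address goes */

typedef struct iface {
	int	if_up_count;	/* addresses currently up */
	int	if_dup_count;	/* addresses still being duplicate-probed */
} iface_t;

uint32_t
resolver_down_actions(const iface_t *ifp)
{
	uint32_t acts = ACT_DEL_ADDR_MAPPING;

	if (ifp->if_up_count == 0 && ifp->if_dup_count == 0)
		acts |= ACT_IF_DOWN | ACT_DEL_MCAST_MAPPINGS;
	return (acts);
}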
*/ void -ipif_arp_down(ipif_t *ipif) +ipif_resolver_down(ipif_t *ipif) { mblk_t *mp; ill_t *ill = ipif->ipif_ill; - ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); + if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + return; + /* Delete the mapping for the local address */ mp = ipif->ipif_arp_del_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); ipif->ipif_arp_del_mp = NULL; } /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + /* + * If this was the last ipif on an IPMP interface, purge any + * IPMP ARP entries associated with it. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); /* Send up AR_INTERFACE_DOWN message */ mp = ill->ill_arp_down_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13337,7 +13479,7 @@ ipif_arp_down(ipif_t *ipif) /* Tell ARP to delete the multicast mappings */ mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); @@ -13377,6 +13519,13 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) return (0); /* + * IPMP meta-interfaces don't have any inherent multicast mappings, + * and instead use the ones on the underlying interfaces. + */ + if (IS_IPMP(ill)) + return (0); + + /* * Delete the existing mapping from ARP. Normally ipif_down * -> ipif_arp_down should send this up to ARP. The only * reason we would find this when we are switching from @@ -13473,26 +13622,23 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) } /* - * Get the resolver set up for a new interface address. - * (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, - * though it only sets up the resolver for v6 - * if it's an xresolv interface (one using an external resolver). - * Honors ILLF_NOARP. - * The enumerated value res_act is used to tune the behavior. - * If set to Res_act_initial, then we set up all the resolver - * structures for a new interface. If set to Res_act_move, then - * we just send an AR_ENTRY_ADD message up to ARP for IPv4 - * interfaces; this is called by ip_rput_dlpi_writer() to handle - * asynchronous hardware address change notification. If set to - * Res_act_defend, then we tell ARP that it needs to send a single - * gratuitous message in defense of the address. + * Get the resolver set up for a new IP address. (Always called as writer.) + * Called both for IPv4 and IPv6 interfaces, though it only sets up the + * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * + * The enumerated value res_act tunes the behavior: + * * Res_act_initial: set up all the resolver structures for a new + * IP address. 
+ * * Res_act_defend: tell ARP that it needs to send a single gratuitous + * ARP message in defense of the address. + * * Res_act_rebind: tell ARP to change the hardware address for an IP + * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). + * * Returns error on failure. */ int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - caddr_t addr; mblk_t *arp_up_mp = NULL; mblk_t *arp_down_mp = NULL; mblk_t *arp_add_mp = NULL; @@ -13500,9 +13646,9 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) mblk_t *arp_add_mapping_mp = NULL; mblk_t *arp_del_mapping_mp = NULL; ill_t *ill = ipif->ipif_ill; - uchar_t *area_p = NULL; - uchar_t *ared_p = NULL; int err = ENOMEM; + boolean_t added_ipif = B_FALSE; + boolean_t publish; boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", @@ -13540,11 +13686,7 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * External resolver for IPv6 */ ASSERT(res_act == Res_act_initial); - if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - area_p = (uchar_t *)&ip6_area_template; - ared_p = (uchar_t *)&ip6_ared_template; - } + publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); } else { /* * IPv4 arp case. If the ARP stream has already started @@ -13562,41 +13704,39 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) ill->ill_arp_bringup_pending = 1; mutex_exit(&ill->ill_lock); } - if (ipif->ipif_lcl_addr != INADDR_ANY) { - addr = (caddr_t)&ipif->ipif_lcl_addr; - area_p = (uchar_t *)&ip_area_template; - ared_p = (uchar_t *)&ip_ared_template; + publish = (ipif->ipif_lcl_addr != INADDR_ANY); + } + + if (IS_IPMP(ill) && publish) { + /* + * If we're here via ipif_up(), then the ipif won't be bound + * yet -- add it to the group, which will bind it if possible. + * (We would add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), + * then the ipif has already been added to the group and we + * just need to use the binding. + */ + if (ipmp_ipif_bound_ill(ipif) == NULL) { + if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { + /* + * We couldn't bind the ipif to an ill yet, + * so we have nothing to publish. + */ + publish = B_FALSE; + } + added_ipif = B_TRUE; } } /* * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and the address is not INADDR_ANY. + * is not UNNUMBERED and it is suitable for publishing. */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { - area_t *area; - - /* Now ask ARP to publish our address. */ - arp_add_mp = ill_arp_alloc(ill, area_p, addr); - if (arp_add_mp == NULL) - goto failed; - area = (area_t *)arp_add_mp->b_rptr; - if (res_act != Res_act_initial) { - /* - * Copy the new hardware address and length into - * arp_add_mp to be sent to ARP. - */ - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, - ((char *)area + area->area_hw_addr_offset), - area->area_hw_addr_length); - } - - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | - ACE_F_MYADDR; - + if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { if (res_act == Res_act_defend) { - area->area_flags |= ACE_F_DEFEND; + arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); + if (arp_add_mp == NULL) + goto failed; /* * If we're just defending our address now, then * there's no need to set up ARP multicast mappings. 
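The three res_act values described in the function's comment lead to three different amounts of work, and the defend case just handled above is the smallest. A sketch of the dispatch, with illustrative action flags (not the kernel's ACE_F_* values):

/* Illustrative sketch only; action flags are hypothetical. */
#include <stdint.h>

enum res_action { RES_INITIAL, RES_DEFEND, RES_REBIND };

#define	PUB_ADDRESS	0x1	/* publish (or re-publish) the address */
#define	PUB_DEFEND	0x2	/* single gratuitous ARP in defense */
#define	PUB_MCAST_MAPS	0x4	/* set up multicast mappings */
#define	PUB_BRING_UP	0x8	/* turn the resolver on/off for the ill */

uint32_t
resolver_up_actions(enum res_action act)
{
	switch (act) {
	case RES_DEFEND:
		return (PUB_DEFEND);	/* nothing else needs refreshing */
	case RES_REBIND:
		return (PUB_ADDRESS | PUB_MCAST_MAPS);
	case RES_INITIAL:
	default:
		return (PUB_ADDRESS | PUB_MCAST_MAPS | PUB_BRING_UP);
	}
}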
@@ -13605,17 +13745,18 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) goto done; } - if (res_act != Res_act_initial) - goto arp_setup_multicast; - /* - * Allocate an ARP deletion message so we know we can tell ARP - * when the interface goes down. + * Allocate an ARP add message and an ARP delete message (the + * latter is saved for use when the address goes down). */ - arp_del_mp = ill_arp_alloc(ill, ared_p, addr); - if (arp_del_mp == NULL) + if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) + goto failed; + + if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) goto failed; + if (res_act != Res_act_initial) + goto arp_setup_multicast; } else { if (res_act != Res_act_initial) goto done; @@ -13624,14 +13765,11 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) * Need to bring up ARP or setup multicast mapping only * when the first interface is coming UP. */ - if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || - was_dup) { + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) goto done; - } /* - * Allocate an ARP down message (to be saved) and an ARP up - * message. + * Allocate an ARP down message (to be saved) and an ARP up message. */ arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); if (arp_down_mp == NULL) @@ -13648,33 +13786,21 @@ arp_setup_multicast: /* * Setup the multicast mappings. This function initializes * ill_arp_del_mapping_mp also. This does not need to be done for - * IPv6. + * IPv6, or for the IPMP interface (since it has no link-layer). */ - if (!ill->ill_isv6) { + if (!ill->ill_isv6 && !IS_IPMP(ill)) { err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); if (err != 0) goto failed; ASSERT(ill->ill_arp_del_mapping_mp != NULL); ASSERT(arp_add_mapping_mp != NULL); } - done: - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } if (arp_up_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_up_mp); + arp_up_mp = NULL; } if (arp_add_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", @@ -13686,6 +13812,7 @@ done: if (!ill->ill_arp_extend) ipif->ipif_addr_ready = 1; putnext(ill->ill_rq, arp_add_mp); + arp_add_mp = NULL; } else { ipif->ipif_addr_ready = 1; } @@ -13693,29 +13820,40 @@ done: ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_add_mapping_mp); + arp_add_mapping_mp = NULL; } - if (res_act != Res_act_initial) - return (0); - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); - freemsg(ipif->ipif_arp_del_mp); - freemsg(ill->ill_arp_down_mp); - freemsg(ill->ill_arp_del_mapping_mp); - ipif->ipif_arp_del_mp = NULL; - ill->ill_arp_down_mp = NULL; - ill->ill_arp_del_mapping_mp = NULL; - return (err); + if (res_act == Res_act_initial) { + if (ill->ill_flags & ILLF_NOARP) + err = ill_arp_off(ill); + else + err = ill_arp_on(ill); + if (err != 0) { + ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", + err)); + goto failed; + } } + + if (arp_del_mp != NULL) { + ASSERT(ipif->ipif_arp_del_mp == NULL); + ipif->ipif_arp_del_mp = arp_del_mp; 
+ } + if (arp_down_mp != NULL) { + ASSERT(ill->ill_arp_down_mp == NULL); + ill->ill_arp_down_mp = arp_down_mp; + } + if (arp_del_mapping_mp != NULL) { + ASSERT(ill->ill_arp_del_mapping_mp == NULL); + ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; + } + return ((ill->ill_ipif_up_count != 0 || was_dup || ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); - failed: ip1dbg(("ipif_resolver_up: FAILED\n")); + if (added_ipif) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); freemsg(arp_add_mp); freemsg(arp_del_mp); freemsg(arp_add_mapping_mp); @@ -13734,13 +13872,12 @@ ipif_arp_start_dad(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; - area_t *area; + /* ACE_F_UNVERIFIED restarts DAD */ if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || (ipif->ipif_flags & IPIF_UNNUMBERED) || ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (char *)&ipif->ipif_lcl_addr)) == NULL) { + (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { /* * If we can't contact ARP for some reason, that's not really a * problem. Just send out the routing socket notification that @@ -13752,10 +13889,6 @@ ipif_arp_start_dad(ipif_t *ipif) return; } - /* Setting the 'unverified' flag restarts DAD */ - area = (area_t *)arp_add_mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | - ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); } @@ -13764,7 +13897,8 @@ ipif_ndp_start_dad(ipif_t *ipif) { nce_t *nce; - nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); + nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, + B_FALSE); if (nce == NULL) return; @@ -13805,7 +13939,7 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) */ if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || (!ill->ill_isv6 && !ill->ill_arp_extend)) { - ip_rts_ifmsg(ill->ill_ipif); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } @@ -13838,8 +13972,10 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * we'll handle eventual routing socket * notification via DAD completion.) */ - if (ipif == ill->ill_ipif) - ip_rts_ifmsg(ill->ill_ipif); + if (ipif == ill->ill_ipif) { + ip_rts_ifmsg(ill->ill_ipif, + RTSQ_DEFAULT); + } } } else { /* @@ -13855,285 +13991,30 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If we've torn down links, then notify the user right away. */ if (!went_up) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Wakeup all threads waiting to enter the ipsq, and sleeping - * on any of the ills in this ipsq. 
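The rearranged ipif_resolver_up() above follows an allocate-send-commit pattern: every ARP message is allocated first, messages are sent as soon as they are ready (with the local pointer cleared once ownership passes downstream), and the saved "undo" messages are stashed on the ipif/ill only after ill_arp_on()/ill_arp_off() has succeeded, so the single failed: label can release whatever is still locally owned. The following is a compressed user-level sketch of that idiom; mblk_t, freemsg(), and send_msg() here are simplified stand-ins, not the STREAMS interfaces.

/*
 * Allocate-send-commit error handling: transmit as you go, commit the
 * saved "undo" state only after everything has succeeded, and funnel all
 * failures through one cleanup label.
 */
#include <stdlib.h>

typedef struct mblk { int cmd; } mblk_t;

static mblk_t *
alloc_msg(int cmd)
{
	mblk_t *mp = malloc(sizeof (*mp));

	if (mp != NULL)
		mp->cmd = cmd;
	return (mp);
}

static void
freemsg(mblk_t *mp)		/* NULL is a no-op, as with free() */
{
	free(mp);
}

static int
send_msg(mblk_t *mp)		/* pretend putnext(); always succeeds here */
{
	(void) mp;
	return (0);
}

struct saved_state { mblk_t *del_mp; };

static int
sketch_resolver_up(struct saved_state *st)
{
	mblk_t *add_mp = NULL, *del_mp = NULL;
	int err = -1;				/* ENOMEM analogue */

	if ((add_mp = alloc_msg(1)) == NULL)
		goto failed;
	if ((del_mp = alloc_msg(2)) == NULL)
		goto failed;

	if ((err = send_msg(add_mp)) != 0)
		goto failed;
	add_mp = NULL;				/* downstream owns it now */

	st->del_mp = del_mp;			/* commit only after success */
	return (0);
failed:
	freemsg(add_mp);
	freemsg(del_mp);
	return (err);
}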
The ill_lock of the ill - * must be held so that waiters don't miss wakeups - */ -static void -ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) -{ - phyint_t *phyint; - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv4->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - cv_broadcast(&phyint->phyint_illv4->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv4->ill_lock); - } - if (phyint->phyint_illv6) { - if (!caller_holds_lock) - mutex_enter(&phyint->phyint_illv6->ill_lock); - ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - cv_broadcast(&phyint->phyint_illv6->ill_cv); - if (!caller_holds_lock) - mutex_exit(&phyint->phyint_illv6->ill_lock); - } - phyint = phyint->phyint_ipsq_next; - } -} - -static ipsq_t * -ipsq_create(char *groupname, ip_stack_t *ipst) -{ - ipsq_t *ipsq; - - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); - if (ipsq == NULL) { - return (NULL); - } - - if (groupname != NULL) - (void) strcpy(ipsq->ipsq_name, groupname); - else - ipsq->ipsq_name[0] = '\0'; - - mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); - ipsq->ipsq_flags |= IPSQ_GROUP; - ipsq->ipsq_next = ipst->ips_ipsq_g_head; - ipst->ips_ipsq_g_head = ipsq; - ipsq->ipsq_ipst = ipst; /* No netstack_hold */ - return (ipsq); -} - -/* - * Return an ipsq correspoding to the groupname. If 'create' is true - * allocate a new ipsq if one does not exist. Usually an ipsq is associated - * uniquely with an IPMP group. However during IPMP groupname operations, - * multiple IPMP groups may be associated with a single ipsq. But no - * IPMP group can be associated with more than 1 ipsq at any time. - * For example - * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs - * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 - * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 - * - * Now the command ifconfig hme3 group mpk17-84 results in the temporary - * status shown below during the execution of the above command. - * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 - * - * After the completion of the above groupname command we return to the stable - * state shown below. - * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 - * hme4 mpk17-85 ipsq2 mpk17-85 1 - * - * Because of the above, we don't search based on the ipsq_name since that - * would miss the correct ipsq during certain windows as shown above. - * The ipsq_name is only used during split of an ipsq to return the ipsq to its - * natural state. - */ -static ipsq_t * -ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq, - ip_stack_t *ipst) -{ - ipsq_t *ipsq; - int group_len; - phyint_t *phyint; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - group_len = strlen(groupname); - ASSERT(group_len != 0); - group_len++; - - for (ipsq = ipst->ips_ipsq_g_head; - ipsq != NULL; - ipsq = ipsq->ipsq_next) { - /* - * When an ipsq is being split, and ill_split_ipsq - * calls this function, we exclude it from being considered. - */ - if (ipsq == exclude_ipsq) - continue; - - /* - * Compare against the ipsq_name. The groupname change happens - * in 2 phases. The 1st phase merges the from group into - * the to group's ipsq, by calling ill_merge_groups and restarts - * the ioctl. The 2nd phase then locates the ipsq again thru - * ipsq_name. At this point the phyint_groupname has not been - * updated. 
- */ - if ((group_len == strlen(ipsq->ipsq_name) + 1) && - (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. - */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == - NULL); - return (ipsq); - } - - /* - * Comparison against ipsq_name alone is not sufficient. - * In the case when groups are currently being - * merged, the ipsq could hold other IPMP groups temporarily. - * so we walk the phyint list and compare against the - * phyint_groupname as well. - */ - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if ((group_len == phyint->phyint_groupname_len) && - (bcmp(phyint->phyint_groupname, groupname, - group_len) == 0)) { - /* - * Verify that an ipmp groupname is exactly - * part of 1 ipsq and is not found in any other - * ipsq. - */ - ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, - ipst) == NULL); - return (ipsq); - } - phyint = phyint->phyint_ipsq_next; - } - } - if (create) - ipsq = ipsq_create(groupname, ipst); - return (ipsq); + ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); } static void ipsq_delete(ipsq_t *ipsq) { - ipsq_t *nipsq; - ipsq_t *pipsq = NULL; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * We don't hold the ipsq lock, but we are sure no new - * messages can land up, since the ipsq_refs is zero. - * i.e. this ipsq is unnamed and no phyint or phyint group - * is associated with this ipsq. (Lookups are based on ill_name - * or phyint_groupname) - */ - ASSERT(ipsq->ipsq_refs == 0); - ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); - ASSERT(ipsq->ipsq_pending_mp == NULL); - if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { - /* - * This is not the ipsq of an IPMP group. - */ - ipsq->ipsq_ipst = NULL; - kmem_free(ipsq, sizeof (ipsq_t)); - return; - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - /* - * Locate the ipsq before we can remove it from - * the singly linked list of ipsq's. - */ - for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; - nipsq = nipsq->ipsq_next) { - if (nipsq == ipsq) { - break; - } - pipsq = nipsq; - } - - ASSERT(nipsq == ipsq); + ipxop_t *ipx = ipsq->ipsq_xop; - /* unlink ipsq from the list */ - if (pipsq != NULL) - pipsq->ipsq_next = ipsq->ipsq_next; - else - ipst->ips_ipsq_g_head = ipsq->ipsq_next; ipsq->ipsq_ipst = NULL; + ASSERT(ipsq->ipsq_phyint == NULL); + ASSERT(ipsq->ipsq_xop != NULL); + ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); + ASSERT(ipx->ipx_pending_mp == NULL); kmem_free(ipsq, sizeof (ipsq_t)); - rw_exit(&ipst->ips_ill_g_lock); -} - -static void -ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, - queue_t *q) -{ - ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); - ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); - ASSERT(old_ipsq->ipsq_pending_ipif == NULL); - ASSERT(old_ipsq->ipsq_pending_mp == NULL); - ASSERT(current_mp != NULL); - - ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, - NEW_OP, NULL); - - ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && - new_ipsq->ipsq_xopq_mphead != NULL); - - /* - * move from old ipsq to the new ipsq. 
- */ - new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; - if (old_ipsq->ipsq_xopq_mphead != NULL) - new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; - - old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; } -void -ill_group_cleanup(ill_t *ill) -{ - ill_t *ill_v4; - ill_t *ill_v6; - ipif_t *ipif; - - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v4->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - IPIF_UNMARK_MOVING(ipif); - } - ill_v6->ill_up_ipifs = B_FALSE; - mutex_exit(&ill_v6->ill_lock); - } -} -/* - * This function is called when an ill has had a change in its group status - * to bring up all the ipifs that were up before the change. - */ -int -ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) +static int +ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { + int err; ipif_t *ipif; - ill_t *ill_v4; - ill_t *ill_v6; - ill_t *from_ill; - int err = 0; - ASSERT(IAM_WRITER_ILL(ill)); + if (ill == NULL) + return (0); /* * Except for ipif_state_flags and ill_state_flags the other @@ -14142,389 +14023,86 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) * even an ipif that was already down, in ill_down_ipifs. So we * just blindly clear the IPIF_CHANGING flag here on all ipifs. */ - ill_v4 = ill->ill_phyint->phyint_illv4; - ill_v6 = ill->ill_phyint->phyint_illv6; - if (ill_v4 != NULL) { - ill_v4->ill_up_ipifs = B_TRUE; - for (ipif = ill_v4->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v4->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v4->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? - */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v4->ill_lock); - ill_v4->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v4->ill_lock); - ill_v4->ill_up_ipifs = B_FALSE; - if (ill_v4->ill_move_in_progress) { - ASSERT(ill_v4->ill_move_peer != NULL); - ill_v4->ill_move_in_progress = B_FALSE; - from_ill = ill_v4->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (ill_v6 == NULL) { - if (from_ill->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v4->ill_phyint->phyint_flags & - PHYI_STANDBY) { - phyint_inactive(ill_v4->ill_phyint); - } - } - ill_v4->ill_move_peer = NULL; - } - } + ASSERT(IAM_WRITER_ILL(ill)); - if (ill_v6 != NULL) { - ill_v6->ill_up_ipifs = B_TRUE; - for (ipif = ill_v6->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - mutex_enter(&ill_v6->ill_lock); - ipif->ipif_state_flags &= ~IPIF_CHANGING; - IPIF_UNMARK_MOVING(ipif); - mutex_exit(&ill_v6->ill_lock); - if (ipif->ipif_was_up) { - if (!(ipif->ipif_flags & IPIF_UP)) - err = ipif_up(ipif, q, mp); - ipif->ipif_was_up = B_FALSE; - if (err != 0) { - /* - * Can there be any other error ? 
- */ - ASSERT(err == EINPROGRESS); - return (err); - } - } - } - mutex_enter(&ill_v6->ill_lock); - ill_v6->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill_v6->ill_lock); - ill_v6->ill_up_ipifs = B_FALSE; - if (ill_v6->ill_move_in_progress) { - ASSERT(ill_v6->ill_move_peer != NULL); - ill_v6->ill_move_in_progress = B_FALSE; - from_ill = ill_v6->ill_move_peer; - from_ill->ill_move_in_progress = B_FALSE; - from_ill->ill_move_peer = NULL; - mutex_enter(&from_ill->ill_lock); - from_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&from_ill->ill_lock); - if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(from_ill->ill_phyint); - } - if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { - phyint_inactive(ill_v6->ill_phyint); + ill->ill_up_ipifs = B_TRUE; + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags &= ~IPIF_CHANGING; + mutex_exit(&ill->ill_lock); + if (ipif->ipif_was_up) { + if (!(ipif->ipif_flags & IPIF_UP)) + err = ipif_up(ipif, q, mp); + ipif->ipif_was_up = B_FALSE; + if (err != 0) { + ASSERT(err == EINPROGRESS); + return (err); } - ill_v6->ill_move_peer = NULL; } } + mutex_enter(&ill->ill_lock); + ill->ill_state_flags &= ~ILL_CHANGING; + mutex_exit(&ill->ill_lock); + ill->ill_up_ipifs = B_FALSE; return (0); } /* - * bring down all the approriate ipifs. + * This function is called to bring up all the ipifs that were up before + * bringing the ill down via ill_down_ipifs(). */ -/* ARGSUSED */ -static void -ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) +int +ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) { - ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Except for ipif_state_flags the other fields of the ipif/ill that - * are modified below are protected implicitly since we are a writer - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) - continue; - /* - * Don't bring down the LINK LOCAL addresses as they are tied - * to physical interface and they don't move. Treat them as - * IPIF_NOFAILOVER. - */ - if (chk_nofailover && ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) - continue; - if (index == 0 || index == ipif->ipif_orig_ifindex) { - /* - * We go through the ipif_down logic even if the ipif - * is already down, since routes can be added based - * on down ipifs. Going through ipif_down once again - * will delete any IREs created based on these routes. - */ - if (ipif->ipif_flags & IPIF_UP) - ipif->ipif_was_up = B_TRUE; - /* - * If called with chk_nofailover true ipif is moving. - */ - mutex_enter(&ill->ill_lock); - if (chk_nofailover) { - ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - } else { - ipif->ipif_state_flags |= IPIF_CHANGING; - } - mutex_exit(&ill->ill_lock); - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. 
- */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - (void) ipif_logical_down(ipif, NULL, NULL); - ipif_non_duplicate(ipif); - ipif_down_tail(ipif); - } - } -} - -#define IPSQ_INC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs++; \ -} + err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); + if (err != 0) + return (err); -#define IPSQ_DEC_REF(ipsq, ipst) { \ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ - (ipsq)->ipsq_refs--; \ - if ((ipsq)->ipsq_refs == 0) \ - (ipsq)->ipsq_name[0] = '\0'; \ + return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); } /* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * new_ipsq. + * Bring down any IPIF_UP ipifs on ill. */ static void -ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) +ill_down_ipifs(ill_t *ill) { - phyint_t *phyint; - phyint_t *next_phyint; - - /* - * To change the ipsq of an ill, we need to hold the ill_g_lock as - * writer and the ill_lock of the ill in question. Also the dest - * ipsq can't vanish while we hold the ill_g_lock as writer. - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; - new_ipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(new_ipsq, ipst); - phyint->phyint_ipsq = new_ipsq; - phyint = next_phyint; - } -} - -#define SPLIT_SUCCESS 0 -#define SPLIT_NOT_NEEDED 1 -#define SPLIT_FAILED 2 - -int -ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, - ip_stack_t *ipst) -{ - ipsq_t *newipsq = NULL; - - /* - * Assertions denote pre-requisites for changing the ipsq of - * a phyint - */ - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. See ill_phyint_reinit() - */ - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if ((phyint->phyint_groupname_len != - (strlen(cur_ipsq->ipsq_name) + 1) || - bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, - phyint->phyint_groupname_len) != 0)) { - /* - * Once we fail in creating a new ipsq due to memory shortage, - * don't attempt to create new ipsq again, based on another - * phyint, since we want all phyints belonging to an IPMP group - * to be in the same ipsq even in the event of mem alloc fails. - */ - newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, - cur_ipsq, ipst); - if (newipsq == NULL) { - /* Memory allocation failure */ - return (SPLIT_FAILED); - } else { - /* ipsq_refs protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - phyint->phyint_ipsq = newipsq; - phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; - newipsq->ipsq_phyint_list = phyint; - IPSQ_INC_REF(newipsq, ipst); - return (SPLIT_SUCCESS); - } - } - return (SPLIT_NOT_NEEDED); -} + ipif_t *ipif; -/* - * The ill locks of the phyint and the ill_g_lock (writer) must be held - * to do this split - */ -static int -ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) -{ - ipsq_t *newipsq; + ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); /* - * <ill-phyint> assocs can't change while ill_g_lock - * is held as writer. 
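The new ill_up_ipifs() above collapses the old duplicated IPv4/IPv6 loops into one helper, ill_up_ipifs_on_ill(), invoked once per address family and tolerant of a family that is not plumbed. A small sketch of that shape follows; the fam_* types are illustrative, not kernel structures.

/*
 * Per-address-family helper pattern: run the same bring-up logic against
 * each ill hanging off the phyint, skipping a family that is absent.
 */
#include <stddef.h>

struct fam_ill { int nifs; };

struct fam_phyint {
	struct fam_ill *illv4;	/* may be NULL if v4 is not plumbed */
	struct fam_ill *illv6;	/* may be NULL if v6 is not plumbed */
};

static int
bring_up_on_ill(struct fam_ill *ill)
{
	if (ill == NULL)
		return (0);	/* family not plumbed: nothing to do */
	/* ... walk ill's ipifs and bring up those that were up ... */
	return (0);
}

static int
bring_up_all(struct fam_phyint *phyi)
{
	int err;

	if ((err = bring_up_on_ill(phyi->illv4)) != 0)
		return (err);
	return (bring_up_on_ill(phyi->illv6));
}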
See ill_phyint_reinit() + * Except for ipif_state_flags the other fields of the ipif/ill that + * are modified below are protected implicitly since we are a writer */ - - ASSERT(phyint->phyint_illv4 == NULL || - MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); - ASSERT(phyint->phyint_illv6 == NULL || - MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); - - if (!ipsq_init((phyint->phyint_illv4 != NULL) ? - phyint->phyint_illv4: phyint->phyint_illv6)) { + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* - * ipsq_init failed due to no memory - * caller will use the same ipsq + * We go through the ipif_down logic even if the ipif + * is already down, since routes can be added based + * on down ipifs. Going through ipif_down once again + * will delete any IREs created based on these routes. */ - return (SPLIT_FAILED); - } - - /* ipsq_ref is protected by ill_g_lock (writer) */ - IPSQ_DEC_REF(cur_ipsq, ipst); - - /* - * This is a new ipsq that is unknown to the world. - * So we don't need to hold ipsq_lock, - */ - newipsq = phyint->phyint_ipsq; - newipsq->ipsq_writer = NULL; - newipsq->ipsq_reentry_cnt--; - ASSERT(newipsq->ipsq_reentry_cnt == 0); -#ifdef DEBUG - newipsq->ipsq_depth = 0; -#endif - - return (SPLIT_SUCCESS); -} + if (ipif->ipif_flags & IPIF_UP) + ipif->ipif_was_up = B_TRUE; -/* - * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to - * ipsq's representing their individual groups or themselves. Return - * whether split needs to be retried again later. - */ -static boolean_t -ill_split_ipsq(ipsq_t *cur_ipsq) -{ - phyint_t *phyint; - phyint_t *next_phyint; - int error; - boolean_t need_retry = B_FALSE; - ip_stack_t *ipst = cur_ipsq->ipsq_ipst; + mutex_enter(&ill->ill_lock); + ipif->ipif_state_flags |= IPIF_CHANGING; + mutex_exit(&ill->ill_lock); - phyint = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = NULL; - while (phyint != NULL) { - next_phyint = phyint->phyint_ipsq_next; /* - * 'created' will tell us whether the callee actually - * created an ipsq. Lack of memory may force the callee - * to return without creating an ipsq. + * Need to re-create net/subnet bcast ires if + * they are dependent on ipif. */ - if (phyint->phyint_groupname == NULL) { - error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); - } else { - error = ill_split_to_grp_ipsq(phyint, cur_ipsq, - need_retry, ipst); - } - - switch (error) { - case SPLIT_FAILED: - need_retry = B_TRUE; - /* FALLTHRU */ - case SPLIT_NOT_NEEDED: - /* - * Keep it on the list. - */ - phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; - cur_ipsq->ipsq_phyint_list = phyint; - break; - case SPLIT_SUCCESS: - break; - default: - ASSERT(0); - } - - phyint = next_phyint; - } - return (need_retry); -} - -/* - * given an ipsq 'ipsq' lock all ills associated with this ipsq. - * and return the ills in the list. This list will be - * needed to unlock all the ills later on by the caller. - * The <ill-ipsq> associations could change between the - * lock and unlock. Hence the unlock can't traverse the - * ipsq to get the list of ills. 
- */ -static int -ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) -{ - int cnt = 0; - phyint_t *phyint; - ip_stack_t *ipst = ipsq->ipsq_ipst; - - /* - * The caller holds ill_g_lock to ensure that the ill memberships - * of the ipsq don't change - */ - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - phyint = ipsq->ipsq_phyint_list; - while (phyint != NULL) { - if (phyint->phyint_illv4 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv4; - } - if (phyint->phyint_illv6 != NULL) { - ASSERT(cnt < list_max); - list[cnt++] = phyint->phyint_illv6; - } - phyint = phyint->phyint_ipsq_next; + if (!ipif->ipif_isv6) + ipif_check_bcast_ires(ipif); + (void) ipif_logical_down(ipif, NULL, NULL); + ipif_non_duplicate(ipif); + ipif_down_tail(ipif); } - ill_lock_ills(list, cnt); - return (cnt); } void @@ -14577,3504 +14155,251 @@ ill_unlock_ills(ill_t **list, int cnt) } /* - * Merge all the ills from 1 ipsq group into another ipsq group. - * The source ipsq group is specified by the ipsq associated with - * 'from_ill'. The destination ipsq group is specified by the ipsq - * associated with 'to_ill' or 'groupname' respectively. - * Note that ipsq itself does not have a reference count mechanism - * and functions don't look up an ipsq and pass it around. Instead - * functions pass around an ill or groupname, and the ipsq is looked - * up from the ill or groupname and the required operation performed - * atomically with the lookup on the ipsq. + * Redo source address selection. This is called when a + * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. */ -static int -ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, - queue_t *q) -{ - ipsq_t *old_ipsq; - ipsq_t *new_ipsq; - ill_t **ill_list; - int cnt; - size_t ill_list_size; - boolean_t became_writer_on_new_sq = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); - /* Exactly 1 of 'to_ill' and groupname can be specified. */ - ASSERT((to_ill != NULL) ^ (groupname != NULL)); - - /* - * Need to hold ill_g_lock as writer and also the ill_lock to - * change the <ill-ipsq> assoc of an ill. Need to hold the - * ipsq_lock to prevent new messages from landing on an ipsq. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - old_ipsq = from_ill->ill_phyint->phyint_ipsq; - if (groupname != NULL) - new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); - else { - new_ipsq = to_ill->ill_phyint->phyint_ipsq; - } - - ASSERT(old_ipsq != NULL && new_ipsq != NULL); - - /* - * both groups are on the same ipsq. - */ - if (old_ipsq == new_ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - return (0); - } - - cnt = old_ipsq->ipsq_refs << 1; - ill_list_size = cnt * sizeof (ill_t *); - ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); - if (ill_list == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENOMEM); - } - cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); - - /* Need ipsq lock to enque messages on new ipsq or to become writer */ - mutex_enter(&new_ipsq->ipsq_lock); - if ((new_ipsq->ipsq_writer == NULL && - new_ipsq->ipsq_current_ipif == NULL) || - (new_ipsq->ipsq_writer == curthread)) { - new_ipsq->ipsq_writer = curthread; - new_ipsq->ipsq_reentry_cnt++; - became_writer_on_new_sq = B_TRUE; - } - - /* - * We are holding ill_g_lock as writer and all the ill locks of - * the old ipsq. So the old_ipsq can't be looked up, and hence no new - * message can land up on the old ipsq even though we don't hold the - * ipsq_lock of the old_ipsq. 
Now move all messages to the newipsq. - */ - ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); - - /* - * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. - * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> - * assocs. till we release the ill_g_lock, and hence it can't vanish. - */ - ill_merge_ipsq(old_ipsq, new_ipsq, ipst); - - /* - * Mark the new ipsq as needing a split since it is currently - * being shared by more than 1 IPMP group. The split will - * occur at the end of ipsq_exit - */ - new_ipsq->ipsq_split = B_TRUE; - - /* Now release all the locks */ - mutex_exit(&new_ipsq->ipsq_lock); - ill_unlock_ills(ill_list, cnt); - rw_exit(&ipst->ips_ill_g_lock); - - kmem_free(ill_list, ill_list_size); - - /* - * If we succeeded in becoming writer on the new ipsq, then - * drain the new ipsq and start processing all enqueued messages - * including the current ioctl we are processing which is either - * a set groupname or failover/failback. - */ - if (became_writer_on_new_sq) - ipsq_exit(new_ipsq); - - /* - * syncq has been changed and all the messages have been moved. - */ - mutex_enter(&old_ipsq->ipsq_lock); - old_ipsq->ipsq_current_ipif = NULL; - old_ipsq->ipsq_current_ioctl = 0; - old_ipsq->ipsq_current_done = B_TRUE; - mutex_exit(&old_ipsq->ipsq_lock); - return (EINPROGRESS); -} - -/* - * Delete and add the loopback copy and non-loopback copy of - * the BROADCAST ire corresponding to ill and addr. Used to - * group broadcast ires together when ill becomes part of - * a group. - * - * This function is also called when ill is leaving the group - * so that the ires belonging to the group gets re-grouped. - */ -static void -ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire, *nire, *nire_next, *ire_head = NULL; - ire_t **ire_ptpn = &ire_head; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * The loopback and non-loopback IREs are inserted in the order in which - * they're found, on the basis that they are correctly ordered (loopback - * first). - */ - for (;;) { - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) - break; - - /* - * we are passing in KM_SLEEP because it is not easy to - * go back to a sane state in case of memory failure. - */ - nire = kmem_cache_alloc(ire_cache, KM_SLEEP); - ASSERT(nire != NULL); - bzero(nire, sizeof (ire_t)); - /* - * Don't use ire_max_frag directly since we don't - * hold on to 'ire' until we add the new ire 'nire' and - * we don't want the new ire to have a dangling reference - * to 'ire'. The ire_max_frag of a broadcast ire must - * be in sync with the ipif_mtu of the associate ipif. - * For eg. this happens as a result of SIOCSLIFNAME, - * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by - * the driver. A change in ire_max_frag triggered as - * as a result of path mtu discovery, or due to an - * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a - * route change -mtu command does not apply to broadcast ires. - * - * XXX We need a recovery strategy here if ire_init fails - */ - if (ire_init(nire, - (uchar_t *)&ire->ire_addr, - (uchar_t *)&ire->ire_mask, - (uchar_t *)&ire->ire_src_addr, - (uchar_t *)&ire->ire_gateway_addr, - ire->ire_stq == NULL ? 
&ip_loopback_mtu : - &ire->ire_ipif->ipif_mtu, - ire->ire_nce, - ire->ire_rfq, - ire->ire_stq, - ire->ire_type, - ire->ire_ipif, - ire->ire_cmask, - ire->ire_phandle, - ire->ire_ihandle, - ire->ire_flags, - &ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) { - cmn_err(CE_PANIC, "ire_init() failed"); - } - ire_delete(ire); - ire_refrele(ire); - - /* - * The newly created IREs are inserted at the tail of the list - * starting with ire_head. As we've just allocated them no one - * knows about them so it's safe. - */ - *ire_ptpn = nire; - ire_ptpn = &nire->ire_next; - } - - for (nire = ire_head; nire != NULL; nire = nire_next) { - int error; - ire_t *oire; - /* unlink the IRE from our list before calling ire_add() */ - nire_next = nire->ire_next; - nire->ire_next = NULL; - - /* ire_add adds the ire at the right place in the list */ - oire = nire; - error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(oire == nire); - ire_refrele(nire); /* Held in ire_add */ - } -} - -/* - * This function is usually called when an ill is inserted in - * a group and all the ipifs are already UP. As all the ipifs - * are already UP, the broadcast ires have already been created - * and been inserted. But, ire_add_v4 would not have grouped properly. - * We need to re-group for the benefit of ip_wput_ire which - * expects BROADCAST ires to be grouped properly to avoid sending - * more than one copy of the broadcast packet per group. - * - * NOTE : We don't check for ill_ipif_up_count to be non-zero here - * because when ipif_up_done ends up calling this, ires have - * already been added before illgrp_insert i.e before ill_group - * has been initialized. - */ -static void -ill_group_bcast_for_xmit(ill_t *ill) +void +ill_update_source_selection(ill_t *ill) { - ill_group_t *illgrp; ipif_t *ipif; - ipaddr_t addr; - ipaddr_t net_mask; - ipaddr_t subnet_netmask; - illgrp = ill->ill_group; + ASSERT(IAM_WRITER_ILL(ill)); /* - * This function is called even when an ill is deleted from - * the group. Hence, illgrp could be null. + * Underlying interfaces are only used for test traffic and thus + * should always send with their (deprecated) source addresses. */ - if (illgrp != NULL && illgrp->illgrp_ill_count == 1) + if (IS_UNDER_IPMP(ill)) return; - /* - * Delete all the BROADCAST ires matching this ill and add - * them back. This time, ire_add_v4 should take care of - * grouping them with others because ill is part of the - * group. - */ - ill_bcast_delete_and_add(ill, 0); - ill_bcast_delete_and_add(ill, INADDR_BROADCAST); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~net_mask | addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_bcast_delete_and_add(ill, addr); - ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); - } -} - -/* - * This function is called from illgrp_delete when ill is being deleted - * from the group. - * - * As ill is not there in the group anymore, any address belonging - * to this ill should be cleared of IRE_MARK_NORECV. 
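ill_bcast_delete_and_add(), being retired above along with the rest of the ill-group broadcast machinery, regroups broadcast IREs by cloning each matching entry, deleting the original, and re-adding the clone so that ire_add() places it adjacent to the other members of the group. The sketch below shows the same collect/delete/re-insert idea over a plain singly linked list; where the kernel must clone and delete (other walkers may still hold references to the originals), the sketch simply relinks in place for brevity, and the node type and helpers are illustrative only.

/*
 * Restore a "grouped by key" invariant by pulling out every entry with
 * the given key and re-inserting it through the ordering-aware insert.
 */
#include <stddef.h>

struct entry {
	int		key;
	struct entry	*next;
};

/* Insert so that entries with equal keys stay adjacent (sorted by key). */
static void
grouped_insert(struct entry **head, struct entry *e)
{
	struct entry **pp = head;

	while (*pp != NULL && (*pp)->key < e->key)
		pp = &(*pp)->next;
	e->next = *pp;
	*pp = e;
}

static void
regroup(struct entry **head, int key)
{
	struct entry *pulled = NULL, *e, **pp = head;

	while ((e = *pp) != NULL) {
		if (e->key == key) {
			*pp = e->next;		/* unlink */
			e->next = pulled;	/* stash on a private list */
			pulled = e;
		} else {
			pp = &e->next;
		}
	}
	while ((e = pulled) != NULL) {
		pulled = e->next;
		grouped_insert(head, e);	/* re-add in grouped order */
	}
}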
- */ -static void -ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) -{ - ire_t *ire; - irb_t *irb; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, - ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (ire != NULL) { - /* - * IPMP and plumbing operations are serialized on the ipsq, so - * no one will insert or delete a broadcast ire under our feet. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_READER); - ire_refrele(ire); - - for (; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != addr) - break; - if (ire_to_ill(ire) != ill) - continue; - - ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); - ire->ire_marks &= ~IRE_MARK_NORECV; - } - rw_exit(&irb->irb_lock); - } -} - -ire_t * -irep_insert(ill_group_t *illgrp, ipaddr_t addr, ire_t *ire, ire_t ***pirep) -{ - boolean_t first = B_TRUE; - ire_t *clear_ire = NULL; - ire_t *start_ire = NULL; - uint64_t match_flags; - uint64_t phyi_flags; - boolean_t fallback = B_FALSE; - - /* - * irb_lock must be held by the caller. - * Get to the first ire matching the address and the - * group. If the address does not match we are done - * as we could not find the IRE. If the address matches - * we should get to the first one matching the group. - */ - while (ire != NULL) { - if (ire->ire_addr != addr || - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - break; - } - ire = ire->ire_next; - } - match_flags = PHYI_FAILED | PHYI_INACTIVE; - start_ire = ire; -redo: - while (ire != NULL && ire->ire_addr == addr && - ire->ire_ipif->ipif_ill->ill_group == illgrp) { - /* - * The first ire for any address within a group - * should always be the one with IRE_MARK_NORECV cleared - * so that ip_wput_ire can avoid searching for one. - * Note down the insertion point which will be used - * later. - */ - if (first && (*pirep == NULL)) - *pirep = ire->ire_ptpn; - /* - * PHYI_FAILED is set when the interface fails. - * This interface might have become good, but the - * daemon has not yet detected. We should still - * not receive on this. PHYI_OFFLINE should never - * be picked as this has been offlined and soon - * be removed. - */ - phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; - if (phyi_flags & PHYI_OFFLINE) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - continue; - } - if (phyi_flags & match_flags) { - ire->ire_marks |= IRE_MARK_NORECV; - ire = ire->ire_next; - if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == - PHYI_INACTIVE) { - fallback = B_TRUE; - } - continue; - } - if (first) { - /* - * We will move this to the front of the list later - * on. - */ - clear_ire = ire; - ire->ire_marks &= ~IRE_MARK_NORECV; - } else { - ire->ire_marks |= IRE_MARK_NORECV; - } - first = B_FALSE; - ire = ire->ire_next; - } - /* - * If we never nominated anybody, try nominating at least - * an INACTIVE, if we found one. Do it only once though. - */ - if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && - fallback) { - match_flags = PHYI_FAILED; - ire = start_ire; - *pirep = NULL; - goto redo; - } - return (clear_ire); -} - -/* - * This function must be called only after the broadcast ires - * have been grouped together. For a given address addr, nominate - * only one of the ires whose interface is not FAILED or OFFLINE. - * - * This is also called when an ipif goes down, so that we can nominate - * a different ire with the same address for receiving. 
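irep_insert(), also removed here, chooses which broadcast IRE in a group should receive: the first pass skips FAILED and INACTIVE interfaces (OFFLINE is never eligible), and if nothing was nominated but an INACTIVE candidate was seen, the scan is redone with the weaker exclusion so at least one entry receives. Below is a condensed sketch of that two-pass selection, using illustrative ST_* flags rather than the real PHYI_* values.

/*
 * Two-pass nomination: prefer a healthy candidate, fall back to an
 * INACTIVE one, never pick OFFLINE.
 */
#include <stdbool.h>
#include <stddef.h>

#define	ST_OFFLINE	0x1
#define	ST_FAILED	0x2
#define	ST_INACTIVE	0x4

struct cand {
	unsigned int	flags;
	struct cand	*next;
};

static struct cand *
nominate(struct cand *list)
{
	unsigned int exclude = ST_FAILED | ST_INACTIVE;
	bool saw_inactive = false;
	struct cand *c;

retry:
	for (c = list; c != NULL; c = c->next) {
		if (c->flags & ST_OFFLINE)
			continue;
		if (c->flags & exclude) {
			if ((c->flags & (ST_FAILED | ST_INACTIVE)) ==
			    ST_INACTIVE)
				saw_inactive = true;
			continue;
		}
		return (c);			/* first acceptable wins */
	}
	if (exclude == (ST_FAILED | ST_INACTIVE) && saw_inactive) {
		exclude = ST_FAILED;		/* second pass: allow INACTIVE */
		goto retry;
	}
	return (NULL);
}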
- */ -static void -ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - ire_t *ire1; - ire_t *save_ire; - ire_t **irep = NULL; - ire_t *clear_ire = NULL; - ire_t *new_lb_ire; - ire_t *new_nlb_ire; - boolean_t new_lb_ire_used = B_FALSE; - boolean_t new_nlb_ire_used = B_FALSE; - boolean_t refrele_lb_ire = B_FALSE; - boolean_t refrele_nlb_ire = B_FALSE; - uint_t max_frag; - - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, - NULL, MATCH_IRE_TYPE, ipst); - /* - * We may not be able to find some ires if a previous - * ire_create failed. This happens when an ipif goes - * down and we are unable to create BROADCAST ires due - * to memory failure. Thus, we have to check for NULL - * below. This should handle the case for LOOPBACK, - * POINTOPOINT and interfaces with some POINTOPOINT - * logicals for which there are no BROADCAST ires. - */ - if (ire == NULL) - return; - /* - * Currently IRE_BROADCASTS are deleted when an ipif - * goes down which runs exclusively. Thus, setting - * IRE_MARK_RCVD should not race with ire_delete marking - * IRE_MARK_CONDEMNED. We grab the lock below just to - * be consistent with other parts of the code that walks - * a given bucket. - */ - save_ire = ire; - irb = ire->ire_bucket; - new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_lb_ire == NULL) { - ire_refrele(ire); - return; - } - new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (new_nlb_ire == NULL) { - ire_refrele(ire); - kmem_cache_free(ire_cache, new_lb_ire); - return; - } - IRB_REFHOLD(irb); - rw_enter(&irb->irb_lock, RW_WRITER); - clear_ire = irep_insert(illgrp, addr, ire, &irep); - - /* - * irep non-NULL indicates that we entered the while loop - * above. If clear_ire is at the insertion point, we don't - * have to do anything. clear_ire will be NULL if all the - * interfaces are failed. - * - * We cannot unlink and reinsert the ire at the right place - * in the list since there can be other walkers of this bucket. - * Instead we delete and recreate the ire - */ - if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { - ire_t *clear_ire_stq = NULL; - ire_t *clr_ire = NULL; - ire_t *ire_next = NULL; - - if (clear_ire->ire_stq == NULL) - ire_next = clear_ire->ire_next; - - rw_exit(&irb->irb_lock); - - bzero(new_lb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. */ - if (ire_init(new_lb_ire, - (uchar_t *)&clear_ire->ire_addr, - (uchar_t *)&clear_ire->ire_mask, - (uchar_t *)&clear_ire->ire_src_addr, - (uchar_t *)&clear_ire->ire_gateway_addr, - &clear_ire->ire_max_frag, - NULL, /* let ire_nce_init derive the resolver info */ - clear_ire->ire_rfq, - clear_ire->ire_stq, - clear_ire->ire_type, - clear_ire->ire_ipif, - clear_ire->ire_cmask, - clear_ire->ire_phandle, - clear_ire->ire_ihandle, - clear_ire->ire_flags, - &clear_ire->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_lb_ire = B_TRUE; - - if (ire_next != NULL && - ire_next->ire_stq != NULL && - ire_next->ire_addr == clear_ire->ire_addr && - ire_next->ire_ipif->ipif_ill == - clear_ire->ire_ipif->ipif_ill) { - clear_ire_stq = ire_next; - - bzero(new_nlb_ire, sizeof (ire_t)); - /* XXX We need a recovery strategy here. 
*/ - if (ire_init(new_nlb_ire, - (uchar_t *)&clear_ire_stq->ire_addr, - (uchar_t *)&clear_ire_stq->ire_mask, - (uchar_t *)&clear_ire_stq->ire_src_addr, - (uchar_t *)&clear_ire_stq->ire_gateway_addr, - &clear_ire_stq->ire_max_frag, - NULL, - clear_ire_stq->ire_rfq, - clear_ire_stq->ire_stq, - clear_ire_stq->ire_type, - clear_ire_stq->ire_ipif, - clear_ire_stq->ire_cmask, - clear_ire_stq->ire_phandle, - clear_ire_stq->ire_ihandle, - clear_ire_stq->ire_flags, - &clear_ire_stq->ire_uinfo, - NULL, - NULL, - ipst) == NULL) - cmn_err(CE_PANIC, "ire_init() failed"); - - refrele_nlb_ire = B_TRUE; - } - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * irb_lock was dropped across call to ire_init() due to - * lock ordering issue with ipst->ips_ndp{4,6}->ndp_g_lock - * mutex lock. Therefore irep could have changed. call - * irep_insert() to get the new insertion point (irep) and - * recheck all known conditions. - */ - irep = NULL; - clr_ire = irep_insert(illgrp, addr, save_ire, &irep); - if ((irep != NULL) && (*irep != clear_ire) && - (clr_ire == clear_ire)) { - if ((clear_ire_stq != NULL) && - (clr_ire->ire_next != clear_ire_stq)) - clear_ire_stq = NULL; - /* - * Delete the ire. We can't call ire_delete() since - * we are holding the bucket lock. We can't release the - * bucket lock since we can't allow irep to change. - * So just mark it CONDEMNED. - * The IRB_REFRELE will delete the ire from the list - * and do the refrele. - */ - clear_ire->ire_marks |= IRE_MARK_CONDEMNED; - irb->irb_marks |= IRB_MARK_CONDEMNED; - - if (clear_ire_stq != NULL && - clear_ire_stq->ire_nce != NULL) { - nce_fastpath_list_delete( - clear_ire_stq->ire_nce); - clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; - } - - /* - * Also take care of otherfields like ib/ob pkt count - * etc. Need to dup them. - * ditto in ill_bcast_delete_and_add - */ - - /* Set the max_frag before adding the ire */ - max_frag = *new_lb_ire->ire_max_fragp; - new_lb_ire->ire_max_fragp = NULL; - new_lb_ire->ire_max_frag = max_frag; - - /* Add the new ire's. Insert at *irep */ - new_lb_ire->ire_bucket = clear_ire->ire_bucket; - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_lb_ire->ire_next; - new_lb_ire->ire_next = ire1; - /* Link the new one in. */ - new_lb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_lb_ire; - new_lb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_lb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), - new_lb_ire->ire_ipif, - (char *), "ire", (void *), new_lb_ire); - new_lb_ire->ire_ipif->ipif_ire_cnt++; - - if (clear_ire_stq != NULL) { - ill_t *ire_ill; - /* Set the max_frag before adding the ire */ - max_frag = *new_nlb_ire->ire_max_fragp; - new_nlb_ire->ire_max_fragp = NULL; - new_nlb_ire->ire_max_frag = max_frag; - - new_nlb_ire->ire_bucket = clear_ire->ire_bucket; - irep = &new_lb_ire->ire_next; - /* Add the new ire. Insert at *irep */ - ire1 = *irep; - if (ire1 != NULL) - ire1->ire_ptpn = &new_nlb_ire->ire_next; - new_nlb_ire->ire_next = ire1; - /* Link the new one in. 
*/ - new_nlb_ire->ire_ptpn = irep; - membar_producer(); - *irep = new_nlb_ire; - new_nlb_ire_used = B_TRUE; - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_inserted); - new_nlb_ire->ire_bucket->irb_ire_cnt++; - DTRACE_PROBE3(ipif__incr__cnt, - (ipif_t *), new_nlb_ire->ire_ipif, - (char *), "ire", (void *), new_nlb_ire); - new_nlb_ire->ire_ipif->ipif_ire_cnt++; - DTRACE_PROBE3(ill__incr__cnt, - (ill_t *), new_nlb_ire->ire_stq->q_ptr, - (char *), "ire", (void *), new_nlb_ire); - ire_ill = (ill_t *)new_nlb_ire->ire_stq->q_ptr; - ire_ill->ill_ire_cnt++; - } - } - } - ire_refrele(save_ire); - rw_exit(&irb->irb_lock); - /* - * Since we dropped the irb_lock across call to ire_init() - * and rechecking known conditions, it is possible that - * the checks might fail, therefore undo the work done by - * ire_init() by calling ire_refrele() on the newly created ire. - */ - if (!new_lb_ire_used) { - if (refrele_lb_ire) { - ire_refrele(new_lb_ire); - } else { - kmem_cache_free(ire_cache, new_lb_ire); - } - } - if (!new_nlb_ire_used) { - if (refrele_nlb_ire) { - ire_refrele(new_nlb_ire); - } else { - kmem_cache_free(ire_cache, new_nlb_ire); - } - } - IRB_REFRELE(irb); -} - -/* - * Whenever an ipif goes down we have to renominate a different - * broadcast ire to receive. Whenever an ipif comes up, we need - * to make sure that we have only one nominated to receive. - */ -static void -ipif_renominate_bcast(ipif_t *ipif) -{ - ill_t *ill = ipif->ipif_ill; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ill_group_t *illgrp; - ip_stack_t *ipst = ill->ill_ipst; - - illgrp = ill->ill_group; - /* - * If this is the last ipif going down, it might take - * the ill out of the group. In that case ipif_down -> - * illgrp_delete takes care of doing the nomination. - * ipif_down does not call for this case. - */ - ASSERT(illgrp != NULL); - - /* There could not have been any ires associated with this */ - if (ipif->ipif_subnet == 0) - return; - - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_mark_bcast(illgrp, addr, ipst); - - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); -} - -/* - * Whenever we form or delete ill groups, we need to nominate one set of - * BROADCAST ires for receiving in the group. - * - * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires - * have been added, but ill_ipif_up_count is 0. Thus, we don't assert - * for ill_ipif_up_count to be non-zero. This is the only case where - * ill_ipif_up_count is zero and we would still find the ires. - * - * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one - * ipif is UP and we just have to do the nomination. - * - * 3) When ill_handoff_responsibility calls us, some ill has been removed - * from the group. So, we have to do the nomination. - * - * Because of (3), there could be just one ill in the group. But we have - * to nominate still as IRE_MARK_NORCV may have been marked on this. 
- * Thus, this function does not optimize when there is only one ill as - * it is not correct for (3). - */ -static void -ill_nominate_bcast_rcv(ill_group_t *illgrp) -{ - ill_t *ill; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t prev_subnet_addr = 0; - ipaddr_t net_addr; - ipaddr_t prev_net_addr = 0; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst; - - /* - * When the last memeber is leaving, there is nothing to - * nominate. - */ - if (illgrp->illgrp_ill_count == 0) { - ASSERT(illgrp->illgrp_ill == NULL); - return; - } - - ill = illgrp->illgrp_ill; - ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; - /* - * We assume that ires with same address and belonging to the - * same group, has been grouped together. Nominating a *single* - * ill in the group for sending and receiving broadcast is done - * by making sure that the first BROADCAST ire (which will be - * the one returned by ire_ctable_lookup for ip_rput and the - * one that will be used in ip_wput_ire) will be the one that - * will not have IRE_MARK_NORECV set. - * - * 1) ip_rput checks and discards packets received on ires marked - * with IRE_MARK_NORECV. Thus, we don't send up duplicate - * broadcast packets. We need to clear IRE_MARK_NORECV on the - * first ire in the group for every broadcast address in the group. - * ip_rput will accept packets only on the first ire i.e only - * one copy of the ill. - * - * 2) ip_wput_ire needs to send out just one copy of the broadcast - * packet for the whole group. It needs to send out on the ill - * whose ire has not been marked with IRE_MARK_NORECV. If it sends - * on the one marked with IRE_MARK_NORECV, ip_rput will accept - * the copy echoed back on other port where the ire is not marked - * with IRE_MARK_NORECV. - * - * Note that we just need to have the first IRE either loopback or - * non-loopback (either of them may not exist if ire_create failed - * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will - * always hit the first one and hence will always accept one copy. - * - * We have a broadcast ire per ill for all the unique prefixes - * hosted on that ill. As we don't have a way of knowing the - * unique prefixes on a given ill and hence in the whole group, - * we just call ill_mark_bcast on all the prefixes that exist - * in the group. For the common case of one prefix, the code - * below optimizes by remebering the last address used for - * markng. In the case of multiple prefixes, this will still - * optimize depending the order of prefixes. - * - * The only unique address across the whole group is 0.0.0.0 and - * 255.255.255.255 and thus we call only once. ill_mark_bcast enables - * the first ire in the bucket for receiving and disables the - * others. 
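The invariant that ill_mark_bcast() maintains for each broadcast address is: exactly one IRE in the group has IRE_MARK_NORECV cleared (so ip_rput accepts one copy and ip_wput_ire transmits one copy), and every other IRE for that address is marked. A stripped-down sketch of that marking rule follows, omitting the FAILED/INACTIVE policy shown in the earlier nomination sketch; the bcast_ire type here is an illustrative stand-in.

/*
 * Ensure exactly one entry per broadcast address is the receiver; all
 * duplicates for the same address are suppressed.
 */
#include <stdbool.h>
#include <stddef.h>

struct bcast_ire {
	unsigned int		addr;
	bool			norecv;	/* mirrors IRE_MARK_NORECV */
	struct bcast_ire	*next;
};

static void
mark_single_receiver(struct bcast_ire *list, unsigned int addr)
{
	bool chosen = false;
	struct bcast_ire *ire;

	for (ire = list; ire != NULL; ire = ire->next) {
		if (ire->addr != addr)
			continue;
		if (!chosen) {
			ire->norecv = false;	/* the nominated receiver */
			chosen = true;
		} else {
			ire->norecv = true;	/* duplicates are suppressed */
		}
	}
}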
- */ - ill_mark_bcast(illgrp, 0, ipst); - ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); - for (; ill != NULL; ill = ill->ill_group_next) { - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - if (prev_net_addr == 0 || prev_net_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - net_addr = ~net_mask | addr; - ill_mark_bcast(illgrp, net_addr, ipst); - } - prev_net_addr = addr; - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - if (prev_subnet_addr == 0 || - prev_subnet_addr != addr) { - ill_mark_bcast(illgrp, addr, ipst); - subnet_addr = ~subnet_netmask | addr; - ill_mark_bcast(illgrp, subnet_addr, ipst); - } - prev_subnet_addr = addr; - } - } -} - -/* - * This function is called while forming ill groups. - * - * Currently, we handle only allmulti groups. We want to join - * allmulti on only one of the ills in the groups. In future, - * when we have link aggregation, we may have to join normal - * multicast groups on multiple ills as switch does inbound load - * balancing. Following are the functions that calls this - * function : - * - * 1) ill_recover_multicast : Interface is coming back UP. - * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 - * will call ill_recover_multicast to recover all the multicast - * groups. We need to make sure that only one member is joined - * in the ill group. - * - * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. - * Somebody is joining allmulti. We need to make sure that only one - * member is joined in the group. - * - * 3) illgrp_insert : If allmulti has already joined, we need to make - * sure that only one member is joined in the group. - * - * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving - * allmulti who we have nominated. We need to pick someother ill. - * - * 5) illgrp_delete : The ill we nominated is leaving the group, - * we need to pick a new ill to join the group. - * - * For (1), (2), (5) - we just have to check whether there is - * a good ill joined in the group. If we could not find any ills - * joined the group, we should join. - * - * For (4), the one that was nominated to receive, left the group. - * There could be nobody joined in the group when this function is - * called. - * - * For (3) - we need to explicitly check whether there are multiple - * ills joined in the group. - * - * For simplicity, we don't differentiate any of the above cases. We - * just leave the group if it is joined on any of them and join on - * the first good ill. - */ -int -ill_nominate_mcast_rcv(ill_group_t *illgrp) -{ - ilm_t *ilm; - ill_t *ill; - ill_t *fallback_inactive_ill = NULL; - ill_t *fallback_failed_ill = NULL; - int ret = 0; - - /* - * Leave the allmulti on all the ills and start fresh. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - } - - /* - * Choose a good ill. Fallback to inactive or failed if - * none available. We need to fallback to FAILED in the - * case where we have 2 interfaces in a group - where - * one of them is failed and another is a good one and - * the good one (not marked inactive) is leaving the group. 
- */ - for (ill = illgrp->illgrp_ill; ill != NULL; ill = ill->ill_group_next) { - if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) - continue; - if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { - fallback_failed_ill = ill; - continue; - } - if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { - fallback_inactive_ill = ill; - continue; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - ret = ill_join_allmulti(ill); - /* - * ill_join_allmulti() can fail because of - * memory failures so make sure we join at - * least on one ill. - */ - if (ill->ill_join_allmulti) - return (0); - } - } - } - if (ret != 0) { - /* - * If we tried nominating above and failed to do so, - * return error. We might have tried multiple times. - * But, return the latest error. - */ - return (ret); - } - if ((ill = fallback_inactive_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } else if ((ill = fallback_failed_ill) != NULL) { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) - return (ill_join_allmulti(ill)); - } - } - return (0); -} - -/* - * This function is called from illgrp_delete after it is - * deleted from the group to reschedule responsibilities - * to a different ill. - */ -static void -ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) -{ - ilm_t *ilm; - ipif_t *ipif; - ipaddr_t subnet_addr; - ipaddr_t net_addr; - ipaddr_t net_mask = 0; - ipaddr_t subnet_netmask; - ipaddr_t addr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill->ill_group == NULL); - /* - * Broadcast Responsibility: - * - * 1. If this ill has been nominated for receiving broadcast - * packets, we need to find a new one. Before we find a new - * one, we need to re-group the ires that are part of this new - * group (assumed by ill_nominate_bcast_rcv). We do this by - * calling ill_group_bcast_for_xmit(ill) which will do the right - * thing for us. - * - * 2. If this ill was not nominated for receiving broadcast - * packets, we need to clear the IRE_MARK_NORECV flag - * so that we continue to send up broadcast packets. - */ - if (!ill->ill_isv6) { - /* - * Case 1 above : No optimization here. Just redo the - * nomination. - */ - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - - /* - * Case 2 above : Lookup and clear IRE_MARK_NORECV. - */ - ill_clear_bcast_mark(ill, 0); - ill_clear_bcast_mark(ill, INADDR_BROADCAST); - - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (!(ipif->ipif_flags & IPIF_UP) || - ipif->ipif_subnet == 0) { - continue; - } - if ((ipif->ipif_lcl_addr != INADDR_ANY) && - !(ipif->ipif_flags & IPIF_NOLOCAL)) { - net_mask = ip_net_mask(ipif->ipif_lcl_addr); - } else { - net_mask = htonl(IN_CLASSA_NET); - } - addr = net_mask & ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - net_addr = ~net_mask | addr; - ill_clear_bcast_mark(ill, net_addr); - - subnet_netmask = ipif->ipif_net_mask; - addr = ipif->ipif_subnet; - ill_clear_bcast_mark(ill, addr); - - subnet_addr = ~subnet_netmask | addr; - ill_clear_bcast_mark(ill, subnet_addr); - } - } - - /* - * Multicast Responsibility. - * - * If we have joined allmulti on this one, find a new member - * in the group to join allmulti. As this ill is already part - * of allmulti, we don't have to join on this one. 
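ill_nominate_mcast_rcv(), also being retired, keeps allmulti joined on exactly one member of the group: it first leaves allmulti everywhere, then joins on the first healthy ill, falling back to an INACTIVE ill and finally a FAILED one; OFFLINE interfaces are never chosen. The sketch below condenses that fallback order; the MC_* flags and mc_ill type are illustrative, and the join stub always succeeds even though the real ill_join_allmulti() can fail under memory pressure.

/*
 * Allmulti nomination with fallback: healthy > INACTIVE > FAILED,
 * never OFFLINE.
 */
#include <stdbool.h>
#include <stddef.h>

#define	MC_OFFLINE	0x1
#define	MC_FAILED	0x2
#define	MC_INACTIVE	0x4

struct mc_ill {
	unsigned int	flags;
	bool		joined;		/* analogue of ill_join_allmulti */
	struct mc_ill	*next;
};

static bool
join_allmulti(struct mc_ill *ill)	/* stub: the real call can fail */
{
	ill->joined = true;
	return (true);
}

static bool
nominate_allmulti(struct mc_ill *list)
{
	struct mc_ill *ill, *inactive = NULL, *failed = NULL;

	for (ill = list; ill != NULL; ill = ill->next)
		ill->joined = false;		/* leave allmulti everywhere */

	for (ill = list; ill != NULL; ill = ill->next) {
		if (ill->flags & MC_OFFLINE)
			continue;
		if (ill->flags & MC_FAILED) {
			failed = ill;
			continue;
		}
		if (ill->flags & MC_INACTIVE) {
			inactive = ill;
			continue;
		}
		if (join_allmulti(ill))
			return (true);		/* healthy ill wins */
	}
	if (inactive != NULL)
		return (join_allmulti(inactive));
	if (failed != NULL)
		return (join_allmulti(failed));
	return (false);
}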
- * - * If we have not joined allmulti on this one, there is no - * responsibility to handoff. But we need to take new - * responsibility i.e, join allmulti on this one if we need - * to. - */ - if (ill->ill_join_allmulti) { - (void) ill_nominate_mcast_rcv(illgrp); - } else { - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - (void) ill_join_allmulti(ill); - break; - } - } - } - - /* - * We intentionally do the flushing of IRE_CACHES only matching - * on the ill and not on groups. Note that we are already deleted - * from the group. - * - * This will make sure that all IRE_CACHES whose stq is pointing - * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get - * deleted and IRE_CACHES that are not pointing at this ill will - * be left alone. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - illgrp_cache_delete, ill, ill); - - /* - * Some conn may have cached one of the IREs deleted above. By removing - * the ire reference, we clean up the extra reference to the ill held in - * ire->ire_stq. - */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - - /* - * Re-do source address selection for all the members in the - * group, if they borrowed source address from one of the ipifs - * in this ill. - */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) { - ipif_update_other_ipifs_v6(ipif, illgrp); - } else { - ipif_update_other_ipifs(ipif, illgrp); - } + if (ill->ill_isv6) + ipif_recreate_interface_routes_v6(NULL, ipif); + else + ipif_recreate_interface_routes(NULL, ipif); } } /* - * Delete the ill from the group. The caller makes sure that it is - * in a group and it okay to delete from the group. So, we always - * delete here. + * Finish the group join started in ip_sioctl_groupname(). */ +/* ARGSUSED */ static void -illgrp_delete(ill_t *ill) -{ - ill_group_t *illgrp; - ill_group_t *tmpg; - ill_t *tmp_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Reset illgrp_ill_schednext if it was pointing at us. - * We need to do this before we set ill_group to NULL. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - illgrp_reset_schednext(ill); - - illgrp = ill->ill_group; - - /* Delete the ill from illgrp. */ - if (illgrp->illgrp_ill == ill) { - illgrp->illgrp_ill = ill->ill_group_next; - } else { - tmp_ill = illgrp->illgrp_ill; - while (tmp_ill->ill_group_next != ill) { - tmp_ill = tmp_ill->ill_group_next; - ASSERT(tmp_ill != NULL); - } - tmp_ill->ill_group_next = ill->ill_group_next; - } - ill->ill_group = NULL; - ill->ill_group_next = NULL; - - illgrp->illgrp_ill_count--; - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * As this ill is leaving the group, we need to hand off - * the responsibilities to the other ills in the group, if - * this ill had some responsibilities. 
- */ - - ill_handoff_responsibility(ill, illgrp); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - if (illgrp->illgrp_ill_count == 0) { - - ASSERT(illgrp->illgrp_ill == NULL); - if (ill->ill_isv6) { - if (illgrp == ipst->ips_illgrp_head_v6) { - ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v6; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } else { - if (illgrp == ipst->ips_illgrp_head_v4) { - ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; - } else { - tmpg = ipst->ips_illgrp_head_v4; - while (tmpg->illgrp_next != illgrp) { - tmpg = tmpg->illgrp_next; - ASSERT(tmpg != NULL); - } - tmpg->illgrp_next = illgrp->illgrp_next; - } - } - mutex_destroy(&illgrp->illgrp_lock); - mi_free(illgrp); - } - rw_exit(&ipst->ips_ill_g_lock); - - /* - * Even though the ill is out of the group its not necessary - * to set ipsq_split as TRUE as the ipifs could be down temporarily - * We will split the ipsq when phyint_groupname is set to NULL. - */ - - /* - * Send a routing sockets message if we are deleting from - * groups with names. - */ - if (ill->ill_phyint->phyint_groupname_len != 0) - ip_rts_ifmsg(ill->ill_ipif); -} - -/* - * Re-do source address selection. This is normally called when - * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST - * ipif comes up. - */ -void -ill_update_source_selection(ill_t *ill) +ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } - } -} - -/* - * Insert ill in a group headed by illgrp_head. The caller can either - * pass a groupname in which case we search for a group with the - * same name to insert in or pass a group to insert in. This function - * would only search groups with names. - * - * NOTE : The caller should make sure that there is at least one ipif - * UP on this ill so that illgrp_scheduler can pick this ill - * for outbound packets. If ill_ipif_up_count is zero, we have - * already sent a DL_UNBIND to the driver and we don't want to - * send anymore packets. We don't assert for ipif_up_count - * to be greater than zero, because ipif_up_done wants to call - * this function before bumping up the ipif_up_count. See - * ipif_up_done() for details. - */ -int -illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, - ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) -{ - ill_group_t *illgrp; - ill_t *prev_ill; - phyint_t *phyi; + ill_t *ill = q->q_ptr; + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_group == NULL); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - mutex_enter(&ill->ill_lock); - - if (groupname != NULL) { - /* - * Look for a group with a matching groupname to insert. - */ - for (illgrp = *illgrp_head; illgrp != NULL; - illgrp = illgrp->illgrp_next) { - - ill_t *tmp_ill; - - /* - * If we have an ill_group_t in the list which has - * no ill_t assigned then we must be in the process of - * removing this group. We skip this as illgrp_delete() - * will remove it from the list. 
- */ - if ((tmp_ill = illgrp->illgrp_ill) == NULL) { - ASSERT(illgrp->illgrp_ill_count == 0); - continue; - } - - ASSERT(tmp_ill->ill_phyint != NULL); - phyi = tmp_ill->ill_phyint; - /* - * Look at groups which has names only. - */ - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Names are stored in the phyint common to both - * IPv4 and IPv6. - */ - if (mi_strcmp(phyi->phyint_groupname, - groupname) == 0) { - break; - } - } - } else { - /* - * If the caller passes in a NULL "grp_to_insert", we - * allocate one below and insert this singleton. - */ - illgrp = grp_to_insert; - } - - ill->ill_group_next = NULL; - - if (illgrp == NULL) { - illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); - if (illgrp == NULL) { - return (ENOMEM); - } - illgrp->illgrp_next = *illgrp_head; - *illgrp_head = illgrp; - illgrp->illgrp_ill = ill; - illgrp->illgrp_ill_count = 1; - ill->ill_group = illgrp; - /* - * Used in illgrp_scheduler to protect multiple threads - * from traversing the list. - */ - mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); - } else { - ASSERT(ill->ill_net_type == - illgrp->illgrp_ill->ill_net_type); - ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); - - /* Insert ill at tail of this group */ - prev_ill = illgrp->illgrp_ill; - while (prev_ill->ill_group_next != NULL) - prev_ill = prev_ill->ill_group_next; - prev_ill->ill_group_next = ill; - ill->ill_group = illgrp; - illgrp->illgrp_ill_count++; - /* - * Inherit group properties. Currently only forwarding - * is the property we try to keep the same with all the - * ills. When there are more, we will abstract this into - * a function. - */ - ill->ill_flags &= ~ILLF_ROUTER; - ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); - } - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * 1) When ipif_up_done() calls this function, ipif_up_count - * may be zero as it has not yet been bumped. But the ires - * have already been added. So, we do the nomination here - * itself. But, when ip_sioctl_groupname calls this, it checks - * for ill_ipif_up_count != 0. Thus we don't check for - * ill_ipif_up_count here while nominating broadcast ires for - * receive. - * - * 2) Similarly, we need to call ill_group_bcast_for_xmit here - * to group them properly as ire_add() has already happened - * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert - * case, we need to do it here anyway. - */ - if (!ill->ill_isv6) { - ill_group_bcast_for_xmit(ill); - ill_nominate_bcast_rcv(illgrp); - } - - if (!ipif_is_coming_up) { - /* - * When ipif_up_done() calls this function, the multicast - * groups have not been joined yet. So, there is no point in - * nomination. ill_join_allmulti() will handle groups when - * ill_recover_multicast() is called from ipif_up_done() later. - */ - (void) ill_nominate_mcast_rcv(illgrp); - /* - * ipif_up_done calls ill_update_source_selection - * anyway. Moreover, we don't want to re-create - * interface routes while ipif_up_done() still has reference - * to them. Refer to ipif_up_done() for more details. - */ - ill_update_source_selection(ill); - } - - /* - * Send a routing sockets message if we are inserting into - * groups with names. - */ - if (groupname != NULL) - ip_rts_ifmsg(ill->ill_ipif); - return (0); -} - -/* - * Return the first phyint matching the groupname. There could - * be more than one when there are ill groups. - * - * If 'usable' is set, then we exclude ones that are marked with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). 
- * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo - * emulation of ipmp. - */ -phyint_t * -phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - /* - * Group names are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - if (phyi->phyint_groupname_len == 0) - continue; - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. - */ - if (usable && (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) - continue; + /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ + ASSERT(!IS_IPMP(ill) && grp != NULL); + ASSERT(IAM_WRITER_IPSQ(ipsq)); - ASSERT(phyi->phyint_groupname != NULL); - if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) - return (phyi); + if (phyi->phyint_illv4 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv4-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); } - return (NULL); -} - - -/* - * Return the first usable phyint matching the group index. By 'usable' - * we exclude ones that are marked ununsable with any of - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). - * - * Used only for the ipmp/netinfo emulation of ipmp. - */ -phyint_t * -phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) -{ - phyint_t *phyi; - - ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); - - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - /* - * Group indicies are stored in the phyint - a common structure - * to both IPv4 and IPv6. - */ - phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); - for (; phyi != NULL; - phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi, AVL_AFTER)) { - /* Ignore the ones that do not have a group */ - if (phyi->phyint_groupname_len == 0) - continue; - - ASSERT(phyi->phyint_group_ifindex != 0); - /* - * Skip the ones that should not be used since the callers - * sometime use this for sending packets. - */ - if (phyi->phyint_flags & - (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) - continue; - if (phyi->phyint_group_ifindex == group_ifindex) - return (phyi); + if (phyi->phyint_illv6 != NULL) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + VERIFY(grp->gr_pendv6-- > 0); + rw_exit(&ipst->ips_ipmp_lock); + ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); } - return (NULL); + freemsg(mp); } /* - * MT notes on creation and deletion of IPMP groups - * - * Creation and deletion of IPMP groups introduce the need to merge or - * split the associated serialization objects i.e the ipsq's. Normally all - * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled - * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during - * the execution of the SIOCSLIFGROUPNAME command the picture changes. There - * is a need to change the <ill-ipsq> association and we have to operate on both - * the source and destination IPMP groups. For eg. attempting to set the - * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to - * handle 2 IPMP groups and 2 ipsqs. 
All the ills belonging to either of the - * source or destination IPMP group are mapped to a single ipsq for executing - * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. - * The <ill-ipsq> mapping is restored back to normal at a later point. This is - * termed as a split of the ipsq. The converse of the merge i.e. a split of the - * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname - * occurred on the ipsq, then the ipsq_split flag is set. This indicates the - * ipsq has to be examined for redoing the <ill-ipsq> associations. - * - * In the above example the ioctl handling code locates the current ipsq of hme0 - * which is ipsq(mpk17-84). It then enters the above ipsq immediately or - * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates - * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into - * the destination ipsq. If the destination ipsq is not busy, it also enters - * the destination ipsq exclusively. Now the actual groupname setting operation - * can proceed. If the destination ipsq is busy, the operation is enqueued - * on the destination (merged) ipsq and will be handled in the unwind from - * ipsq_exit. - * - * To prevent other threads accessing the ill while the group name change is - * in progres, we bring down the ipifs which also removes the ill from the - * group. The group is changed in phyint and when the first ipif on the ill - * is brought up, the ill is inserted into the right IPMP group by - * illgrp_insert. + * Process an SIOCSLIFGROUPNAME request. */ /* ARGSUSED */ int ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { - int i; - char *tmp; - int namelen; - ill_t *ill = ipif->ipif_ill; - ill_t *ill_v4, *ill_v6; - int err = 0; - phyint_t *phyi; - phyint_t *phyi_tmp; - struct lifreq *lifr; - mblk_t *mp1; - char *groupname; - ipsq_t *ipsq; + struct lifreq *lifr = ifreq; + ill_t *ill = ipif->ipif_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* Existance verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - groupname = lifr->lifr_groupname; - - if (ipif->ipif_id != 0) - return (EINVAL); - - phyi = ill->ill_phyint; - ASSERT(phyi != NULL); - - if (phyi->phyint_flags & PHYI_VIRTUAL) - return (EINVAL); - - tmp = groupname; - for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) - ; - - if (i == LIFNAMSIZ) { - /* no null termination */ - return (EINVAL); - } + phyint_t *phyi = ill->ill_phyint; + ipmp_grp_t *grp = phyi->phyint_grp; + mblk_t *ipsq_mp; + int err = 0; /* - * Calculate the namelen exclusive of the null - * termination character. + * Note that phyint_grp can only change here, where we're exclusive. */ - namelen = tmp - groupname; - - ill_v4 = phyi->phyint_illv4; - ill_v6 = phyi->phyint_illv6; + ASSERT(IAM_WRITER_ILL(ill)); - /* - * ILL cannot be part of a usesrc group and and IPMP group at the - * same time. No need to grab the ill_g_usesrc_lock here, see - * synchronization notes in ip.c - */ - if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { + if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || + (phyi->phyint_flags & PHYI_VIRTUAL)) return (EINVAL); - } - - /* - * mark the ill as changing. - * this should queue all new requests on the syncq. 
- */ - GRAB_ILL_LOCKS(ill_v4, ill_v6); - - if (ill_v4 != NULL) - ill_v4->ill_state_flags |= ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - - if (namelen == 0) { - /* - * Null string means remove this interface from the - * existing group. - */ - if (phyi->phyint_groupname_len == 0) { - /* - * Never was in a group. - */ - err = 0; - goto done; - } - - /* - * IPv4 or IPv6 may be temporarily out of the group when all - * the ipifs are down. Thus, we need to check for ill_group to - * be non-NULL. - */ - if (ill_v4 != NULL && ill_v4->ill_group != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - /* - * ipsq_pending_mp_add will not fail since - * connp is NULL - */ - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL && ill_v6->ill_group != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - phyi->phyint_groupname = NULL; - phyi->phyint_groupname_len = 0; - - /* Restore the ifindex used to be the per interface one */ - phyi->phyint_group_ifindex = 0; - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - err = ill_up_ipifs(ill, q, mp); - /* - * set the split flag so that the ipsq can be split - */ - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); + lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; - } else { - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - /* Are we inserting in the same group ? */ - if (mi_strcmp(groupname, - phyi->phyint_groupname) == 0) { - err = 0; - goto done; - } - } + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - /* - * Merge ipsq for the group's. - * This check is here as multiple groups/ills might be - * sharing the same ipsq. - * If we have to merege than the operation is restarted - * on the new ipsq. - */ - ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); - if (phyi->phyint_ipsq != ipsq) { - rw_exit(&ipst->ips_ill_g_lock); - err = ill_merge_groups(ill, NULL, groupname, mp, q); - goto done; - } - /* - * Running exclusive on new ipsq. - */ - - ASSERT(ipsq != NULL); - ASSERT(ipsq->ipsq_writer == curthread); - - /* - * Check whether the ill_type and ill_net_type matches before - * we allocate any memory so that the cleanup is easier. - * - * We can't group dissimilar ones as we can't load spread - * packets across the group because of potential link-level - * header differences. 
- */ - phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); - if (phyi_tmp != NULL) { - if ((ill_v4 != NULL && - phyi_tmp->phyint_illv4 != NULL) && - ((ill_v4->ill_net_type != - phyi_tmp->phyint_illv4->ill_net_type) || - (ill_v4->ill_type != - phyi_tmp->phyint_illv4->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - if ((ill_v6 != NULL && - phyi_tmp->phyint_illv6 != NULL) && - ((ill_v6->ill_net_type != - phyi_tmp->phyint_illv6->ill_net_type) || - (ill_v6->ill_type != - phyi_tmp->phyint_illv6->ill_type))) { - mutex_enter(&phyi->phyint_ipsq->ipsq_lock); - phyi->phyint_ipsq->ipsq_split = B_TRUE; - mutex_exit(&phyi->phyint_ipsq->ipsq_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (EINVAL); - } - } - - rw_exit(&ipst->ips_ill_g_lock); - - /* - * bring down all v4 ipifs. - */ - if (ill_v4 != NULL) { - ill_down_ipifs(ill_v4, mp, 0, B_FALSE); - } - - /* - * bring down all v6 ipifs. - */ - if (ill_v6 != NULL) { - ill_down_ipifs(ill_v6, mp, 0, B_FALSE); - } - - /* - * make sure all ipifs are down and there are no active - * references. Call to ipsq_pending_mp_add will not fail - * since connp is NULL. - */ - if (ill_v4 != NULL) { - mutex_enter(&ill_v4->ill_lock); - if (!ill_is_quiescent(ill_v4)) { - (void) ipsq_pending_mp_add(NULL, - ill_v4->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v4->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v4->ill_lock); - } - - if (ill_v6 != NULL) { - mutex_enter(&ill_v6->ill_lock); - if (!ill_is_quiescent(ill_v6)) { - (void) ipsq_pending_mp_add(NULL, - ill_v6->ill_ipif, q, mp, ILL_DOWN); - mutex_exit(&ill_v6->ill_lock); - err = EINPROGRESS; - goto done; - } - mutex_exit(&ill_v6->ill_lock); - } - - /* - * allocate including space for null terminator - * before we insert. - */ - tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); - if (tmp == NULL) - return (ENOMEM); - - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(ill_v4, ill_v6); - mutex_enter(&phyi->phyint_lock); - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } - - /* - * setup the new group name. - */ - phyi->phyint_groupname = tmp; - bcopy(groupname, phyi->phyint_groupname, namelen + 1); - phyi->phyint_groupname_len = namelen + 1; - - if (ipst->ips_ipmp_hook_emulation) { - /* - * If the group already exists we use the existing - * group_ifindex, otherwise we pick a new index here. - */ - if (phyi_tmp != NULL) { - phyi->phyint_group_ifindex = - phyi_tmp->phyint_group_ifindex; - } else { - /* XXX We need a recovery strategy here. */ - if (!ip_assign_ifindex( - &phyi->phyint_group_ifindex, ipst)) - cmn_err(CE_PANIC, - "ip_assign_ifindex() failed"); - } - } - /* - * Select whether the netinfo and hook use the per-interface - * or per-group ifindex. - */ - if (ipst->ips_ipmp_hook_emulation) - phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; - else - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - - if (ipst->ips_ipmp_hook_emulation && - phyi_tmp != NULL) { - /* First phyint in group - group PLUMB event */ - ill_nic_event_plumb(ill, B_TRUE); - } - mutex_exit(&phyi->phyint_lock); - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - rw_exit(&ipst->ips_ill_g_lock); - - err = ill_up_ipifs(ill, q, mp); - } - -done: /* - * normally ILL_CHANGING is cleared in ill_up_ipifs. + * If the name hasn't changed, there's nothing to do. 
*/ - if (err != EINPROGRESS) { - GRAB_ILL_LOCKS(ill_v4, ill_v6); - if (ill_v4 != NULL) - ill_v4->ill_state_flags &= ~ILL_CHANGING; - if (ill_v6 != NULL) - ill_v6->ill_state_flags &= ~ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_v4, ill_v6); - } - return (err); -} + if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) + goto unlock; -/* ARGSUSED */ -int -ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, - mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) -{ - ill_t *ill; - phyint_t *phyi; - struct lifreq *lifr; - mblk_t *mp1; - - /* Existence verified in ip_wput_nondata */ - mp1 = mp->b_cont->b_cont; - lifr = (struct lifreq *)mp1->b_rptr; - ill = ipif->ipif_ill; - phyi = ill->ill_phyint; - - lifr->lifr_groupname[0] = '\0'; /* - * ill_group may be null if all the interfaces - * are down. But still, the phyint should always - * hold the name. - */ - if (phyi->phyint_groupname_len != 0) { - bcopy(phyi->phyint_groupname, lifr->lifr_groupname, - phyi->phyint_groupname_len); - } - - return (0); -} - - -typedef struct conn_move_s { - ill_t *cm_from_ill; - ill_t *cm_to_ill; - int cm_ifindex; -} conn_move_t; - -/* - * ipcl_walk function for moving conn_multicast_ill for a given ill. - */ -static void -conn_move(conn_t *connp, caddr_t arg) -{ - conn_move_t *connm; - int ifindex; - int i; - ill_t *from_ill; - ill_t *to_ill; - ilg_t *ilg; - ilm_t *ret_ilm; - - connm = (conn_move_t *)arg; - ifindex = connm->cm_ifindex; - from_ill = connm->cm_from_ill; - to_ill = connm->cm_to_ill; - - /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ - - /* All multicast fields protected by conn_lock */ - mutex_enter(&connp->conn_lock); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); - if ((connp->conn_outgoing_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { - connp->conn_outgoing_ill = to_ill; - connp->conn_incoming_ill = to_ill; - } - - /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ - - if ((connp->conn_multicast_ill == from_ill) && - (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { - connp->conn_multicast_ill = connm->cm_to_ill; - } - - /* - * Change the ilg_ill to point to the new one. This assumes - * ilm_move_v6 has moved the ilms to new_ill and the driver - * has been told to receive packets on this interface. - * ilm_move_v6 FAILBACKS all the ilms successfully always. - * But when doing a FAILOVER, it might fail with ENOMEM and so - * some ilms may not have moved. We check to see whether - * the ilms have moved to to_ill. We can't check on from_ill - * as in the process of moving, we could have split an ilm - * in to two - which has the same orig_ifindex and v6group. + * Handle requests to rename an IPMP meta-interface. * - * For IPv4, ilg_ipif moves implicitly. The code below really - * does not do anything for IPv4 as ilg_ill is NULL for IPv4. - */ - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == from_ill) && - (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { - /* ifindex != 0 indicates failback */ - if (ifindex != 0) { - connp->conn_ilg[i].ilg_ill = to_ill; - continue; - } - - mutex_enter(&to_ill->ill_lock); - ret_ilm = ilm_lookup_ill_index_v6(to_ill, - &ilg->ilg_v6group, ilg->ilg_orig_ifindex, - connp->conn_zoneid); - mutex_exit(&to_ill->ill_lock); - - if (ret_ilm != NULL) - connp->conn_ilg[i].ilg_ill = to_ill; - } + * Note that creation of the IPMP meta-interface is handled in + * userland through the standard plumbing sequence. 
As part of the + * plumbing the IPMP meta-interface, its initial groupname is set to + * the name of the interface (see ipif_set_values_tail()). + */ + if (IS_IPMP(ill)) { + err = ipmp_grp_rename(grp, lifr->lifr_groupname); + goto unlock; } - mutex_exit(&connp->conn_lock); -} - -static void -conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - conn_move_t connm; - ip_stack_t *ipst = from_ill->ill_ipst; - - connm.cm_from_ill = from_ill; - connm.cm_to_ill = to_ill; - connm.cm_ifindex = ifindex; - - ipcl_walk(conn_move, (caddr_t)&connm, ipst); -} - -/* - * ilm has been moved from from_ill to to_ill. - * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. - * appropriately. - * - * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because - * the code there de-references ipif_ill to get the ill to - * send multicast requests. It does not work as ipif is on its - * move and already moved when this function is called. - * Thus, we need to use from_ill and to_ill send down multicast - * requests. - */ -static void -ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) -{ - ipif_t *ipif; - ilm_t *ilm; /* - * See whether we need to send down DL_ENABMULTI_REQ on - * to_ill as ilm has just been added. + * Handle requests to add or remove an IP interface from a group. */ - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_ILL(from_ill)); - - ILM_WALKER_HOLD(to_ill); - for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - - if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) - continue; - /* - * no locks held, ill/ipif cannot dissappear as long - * as we are writer. - */ - ipif = to_ill->ill_ipif; + if (lifr->lifr_groupname[0] != '\0') { /* add */ /* - * No need to hold any lock as we are the writer and this - * can only be changed by a writer. + * Moves are handled by first removing the interface from + * its existing group, and then adding it to another group. + * So, fail if it's already in a group. */ - ilm->ilm_is_new = B_FALSE; - - if (to_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: to_ill not " - "resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "to_ill MULTI_BCAST\n")); - goto from; + if (IS_UNDER_IPMP(ill)) { + err = EALREADY; + goto unlock; } - if (to_ill->ill_isv6) - mld_joingroup(ilm); - else - igmp_joingroup(ilm); - - if (to_ill->ill_ipif_up_count == 0) { - /* - * Nobody there. All multicast addresses will be - * re-joined when we get the DL_BIND_ACK bringing the - * interface up. - */ - ilm->ilm_notify_driver = B_FALSE; - ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); - goto from; + grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); + if (grp == NULL) { + err = ENOENT; + goto unlock; } /* - * For allmulti address, we want to join on only one interface. - * Checking for ilm_numentries_v6 is not correct as you may - * find an ilm with zero address on to_ill, but we may not - * have nominated to_ill for receiving. Thus, if we have - * nominated from_ill (ill_join_allmulti is set), nominate - * only if to_ill is not already nominated (to_ill normally - * should not have been nominated if "from_ill" has already - * been nominated. As we don't prevent failovers from happening - * across groups, we don't assert). + * Check if the phyint and its ills are suitable for + * inclusion into the group. 
*/ - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - /* - * There is no need to hold ill locks as we are - * writer on both ills and when ill_join_allmulti() - * is called the thread is always a writer. - */ - if (from_ill->ill_join_allmulti && - !to_ill->ill_join_allmulti) { - (void) ill_join_allmulti(to_ill); - } - } else if (ilm->ilm_notify_driver) { - - /* - * This is a newly moved ilm so we need to tell the - * driver about the new group. There can be more than - * one ilm's for the same group in the list each with a - * different orig_ifindex. We have to inform the driver - * once. In ilm_move_v[4,6] we only set the flag - * ilm_notify_driver for the first ilm. - */ - - (void) ip_ll_send_enabmulti_req(to_ill, - &ilm->ilm_v6addr); - } - - ilm->ilm_notify_driver = B_FALSE; + if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) + goto unlock; /* - * See whether we need to send down DL_DISABMULTI_REQ on - * from_ill as ilm has just been removed. + * Checks pass; join the group, and enqueue the remaining + * illgrp joins for when we've become part of the group xop + * and are exclusive across its IPSQs. Since qwriter_ip() + * requires an mblk_t to scribble on, and since `mp' will be + * freed as part of completing the ioctl, allocate another. */ -from: - ipif = from_ill->ill_ipif; - if (from_ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill not resolver\n")); - continue; /* Must be IRE_IF_NORESOLVER */ - } - - if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ilm_send_multicast_reqs: " - "from_ill MULTI_BCAST\n")); - continue; - } - - if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (from_ill->ill_join_allmulti) - ill_leave_allmulti(from_ill); - } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { - (void) ip_ll_send_disabmulti_req(from_ill, - &ilm->ilm_v6addr); + if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { + err = ENOMEM; + goto unlock; } - } - ILM_WALKER_RELE(to_ill); -} - -/* - * This function is called when all multicast memberships needs - * to be moved from "from_ill" to "to_ill" for IPv6. This function is - * called only once unlike the IPv4 counterpart where it is called after - * every logical interface is moved. The reason is due to multicast - * memberships are joined using an interface address in IPv4 while in - * IPv6, interface index is used. - */ -static void -ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - int count; - char buf[INET6_ADDRSTRLEN]; - in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - if (ifindex == 0) { /* - * Form the solicited node mcast address which is used later. + * Before we drop ipmp_lock, bump gr_pend* to ensure that the + * IPMP meta-interface ills needed by `phyi' cannot go away + * before ip_join_illgrps() is called back. See the comments + * in ip_sioctl_plink_ipmp() for more. 
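The gr_pendv4/gr_pendv6 counters implement a small but important pattern: deferred work (the illgrp joins done by ip_join_illgrps()) is counted under ipmp_lock before the lock is dropped, the callback decrements the count under the same lock (ip_join_illgrps() does this with VERIFY(grp->gr_pendv4-- > 0)), and the paths that could otherwise make the IPMP meta-interface ills disappear consult the count in the meantime (the specifics live in ip_sioctl_plink_ipmp(), per the comment above). The sketch below is only a userland analogy of that shape using pthreads; grp_t, grp_hold_pending(), grp_rele_pending() and grp_try_teardown() are invented names, and refusing teardown with a busy indication is just one way the check can be acted on.

#include <pthread.h>
#include <assert.h>

typedef struct grp {
        pthread_mutex_t g_lock;
        int             g_pending;      /* callbacks scheduled but not yet run */
} grp_t;

#define GRP_INITIALIZER { PTHREAD_MUTEX_INITIALIZER, 0 }

/* Count the deferred callback before dropping the lock that found `g'. */
static void
grp_hold_pending(grp_t *g)
{
        pthread_mutex_lock(&g->g_lock);
        g->g_pending++;
        pthread_mutex_unlock(&g->g_lock);
}

/* Called from the deferred callback once its work is done. */
static void
grp_rele_pending(grp_t *g)
{
        pthread_mutex_lock(&g->g_lock);
        assert(g->g_pending > 0);
        g->g_pending--;
        pthread_mutex_unlock(&g->g_lock);
}

/* Teardown is refused while any callback remains outstanding. */
static int
grp_try_teardown(grp_t *g)
{
        pthread_mutex_lock(&g->g_lock);
        if (g->g_pending > 0) {
                pthread_mutex_unlock(&g->g_lock);
                return (-1);            /* caller maps this to EBUSY */
        }
        /* ... safe to dismantle the group's resources here ... */
        pthread_mutex_unlock(&g->g_lock);
        return (0);
}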
*/ - ipif_t *ipif; - - ipif = from_ill->ill_ipif; - ASSERT(ipif->ipif_id == 0); - - ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; - } - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } + if (phyi->phyint_illv4 != NULL) + grp->gr_pendv4++; + if (phyi->phyint_illv6 != NULL) + grp->gr_pendv6++; - new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, - ilm->ilm_orig_ifindex, ilm->ilm_zoneid); - ASSERT(ilm->ilm_orig_ifindex != 0); - if (ilm->ilm_orig_ifindex == ifindex) { - /* - * We are failing back multicast memberships. - * If the same ilm exists in to_ill, it means somebody - * has joined the same group there e.g. ff02::1 - * is joined within the kernel when the interfaces - * came UP. - */ - ASSERT(ilm->ilm_ipif == NULL); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - /* - * check if we can just move the ilm - */ - if (from_ill->ill_ilm_walker_cnt != 0) { - /* - * We have walkers we cannot move - * the ilm, so allocate a new ilm, - * this (old) ilm will be marked - * ILM_DELETED at the end of the loop - * and will be freed when the - * last walker exits. - */ - new_ilm = (ilm_t *)mi_zalloc - (sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILBACK of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - /* - * we don't want new_ilm linked to - * ilm's filter list. - */ - new_ilm->ilm_filter = NULL; - } else { - /* - * No walkers we can move the ilm. - * lets take it out of the list. - */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } + rw_exit(&ipst->ips_ipmp_lock); - /* - * if this is the first ilm for the group - * set ilm_notify_driver so that we notify the - * driver in ilm_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - new_ilm->ilm_ill = to_ill; - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - /* - * set the flag so that mld_joingroup is - * called in ilm_send_multicast_reqs(). - */ - new_ilm->ilm_is_new = B_TRUE; - } - goto bottom; - } else if (ifindex != 0) { - /* - * If this is FAILBACK (ifindex != 0) and the ifindex - * has not matched above, look at the next ilm. - */ - ilmp = &ilm->ilm_next; - continue; - } - /* - * If we are here, it means ifindex is 0. Failover - * everything. - * - * We need to handle solicited node mcast address - * and all_nodes mcast address differently as they - * are joined witin the kenrel (ipif_multicast_up) - * and potentially from the userland. We are called - * after the ipifs of from_ill has been moved. - * If we still find ilms on ill with solicited node - * mcast address or all_nodes mcast address, it must - * belong to the UP interface that has not moved e.g. 
- * ipif_id 0 with the link local prefix does not move. - * We join this on the new ill accounting for all the - * userland memberships so that applications don't - * see any failure. - * - * We need to make sure that we account only for the - * solicited node and all node multicast addresses - * that was brought UP on these. In the case of - * a failover from A to B, we might have ilms belonging - * to A (ilm_orig_ifindex pointing at A) on B accounting - * for the membership from the userland. If we are failing - * over from B to C now, we will find the ones belonging - * to A on B. These don't account for the ill_ipif_up_count. - * They just move from B to C. The check below on - * ilm_orig_ifindex ensures that. - */ - if ((ilm->ilm_orig_ifindex == - from_ill->ill_phyint->phyint_ifindex) && - (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || - IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, - &ilm->ilm_v6addr))) { - ASSERT(ilm->ilm_refcnt > 0); - count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; - /* - * For indentation reasons, we are not using a - * "else" here. - */ - if (count == 0) { - ilmp = &ilm->ilm_next; - continue; - } - ilm->ilm_refcnt -= count; - if (new_ilm != NULL) { - /* - * Can find one with the same - * ilm_orig_ifindex, if we are failing - * over to a STANDBY. This happens - * when somebody wants to join a group - * on a STANDBY interface and we - * internally join on a different one. - * If we had joined on from_ill then, a - * failover now will find a new ilm - * with this index. - */ - ip1dbg(("ilm_move_v6: FAILOVER, found" - " new ilm on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += count; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: FAILOVER of IPv6" - " multicast address %s : from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), from_ill->ill_name, - to_ill->ill_name)); - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - new_ilm->ilm_refcnt = count; - new_ilm->ilm_timer = INFINITY; - new_ilm->ilm_rtx.rtx_timer = INFINITY; - new_ilm->ilm_is_new = B_TRUE; - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(new_ilm->ilm_ipif == NULL); - } - if (ilm->ilm_refcnt == 0) { - goto bottom; - } else { - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ilmp = &ilm->ilm_next; - } - continue; - } else { - /* - * ifindex = 0 means, move everything pointing at - * from_ill. We are doing this becuase ill has - * either FAILED or became INACTIVE. - * - * As we would like to move things later back to - * from_ill, we want to retain the identity of this - * ilm. Thus, we don't blindly increment the reference - * count on the ilms matching the address alone. We - * need to match on the ilm_orig_index also. new_ilm - * was obtained by matching ilm_orig_index also. 
- */ - if (new_ilm != NULL) { - /* - * This is possible only if a previous restore - * was incomplete i.e restore to - * ilm_orig_ifindex left some ilms because - * of some failures. Thus when we are failing - * again, we might find our old friends there. - */ - ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" - " on %s, group address %s\n", - to_ill->ill_name, - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)))); - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || - !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { - new_ilm->ilm_is_new = B_TRUE; - } - } else { - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *) - mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - ip0dbg(("ilm_move_v6: " - "FAILOVER of IPv6" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET6, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - new_ilm->ilm_filter = NULL; - } else { - *ilmp = ilm->ilm_next; - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - new_ilm = ilm; - } - /* - * If the to_ill has not joined this - * group we need to tell the driver in - * ill_send_multicast_reqs. - */ - if (ilm_lookup_ill_v6(to_ill, - &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - ASSERT(ilm->ilm_ipif == NULL); - new_ilm->ilm_ill = to_ill; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), to_ill, - (char *), "ilm", (void *), new_ilm); - to_ill->ill_ilm_cnt++; - new_ilm->ilm_is_new = B_TRUE; - } - - } - -bottom: - /* - * Revert multicast filter state to (EXCLUDE, NULL). - * new_ilm->ilm_is_new should already be set if needed. - */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); + ipmp_phyint_join_grp(phyi, grp); + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, + SWITCH_OP, B_FALSE); + return (0); + } else { /* - * We allocated/got a new ilm, free the old one. + * Request to remove the interface from a group. If the + * interface is not in a group, this trivially succeeds. */ - if (new_ilm != ilm) { - if (from_ill->ill_ilm_walker_cnt == 0) { - *ilmp = ilm->ilm_next; - - ASSERT(ilm->ilm_ipif == NULL); /* ipv6 */ - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), - from_ill, (char *), "ilm", (void *), ilm); - ASSERT(from_ill->ill_ilm_cnt > 0); - from_ill->ill_ilm_cnt--; - - ilm_inactive(ilm); /* frees this ilm */ - - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } + rw_exit(&ipst->ips_ipmp_lock); + if (IS_UNDER_IPMP(ill)) + ipmp_phyint_leave_grp(phyi); + return (0); } +unlock: + rw_exit(&ipst->ips_ipmp_lock); + return (err); } /* - * Move all the multicast memberships to to_ill. Called when - * an ipif moves from "from_ill" to "to_ill". This function is slightly - * different from IPv6 counterpart as multicast memberships are associated - * with ills in IPv6. This function is called after every ipif is moved - * unlike IPv6, where it is moved only once. + * Process an SIOCGLIFBINDING request. 
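Taken together, the new ip_sioctl_groupname() gives SIOCSLIFGROUPNAME straightforward semantics: a non-empty lifr_groupname asks to join an existing IPMP group (EALREADY if the interface is already in one, ENOENT if no such group has been plumbed), an empty name asks to leave (trivially succeeding if it was never a member), and on the IPMP meta-interface itself the request renames the group. From userland the ioctl is issued through a struct lifreq; the short sketch below shows the calling convention on an illumos system (set_ipmp_group() is an invented helper and error handling is left to the caller):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

/*
 * Place `ifname' into IPMP group `grname'; pass "" to remove it from
 * its current group.  Returns the ioctl result (0, or -1 with errno).
 */
static int
set_ipmp_group(int s, const char *ifname, const char *grname)
{
        struct lifreq lifr;

        bzero(&lifr, sizeof (lifr));
        (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name));
        (void) strlcpy(lifr.lifr_groupname, grname,
            sizeof (lifr.lifr_groupname));
        return (ioctl(s, SIOCSLIFGROUPNAME, &lifr));
}

For example, after opening an AF_INET SOCK_DGRAM socket, set_ipmp_group(s, "e1000g0", "ipmp0") joins e1000g0 to the group whose meta-interface ipmp0 has already been plumbed; the same call fails with ENOENT if no such group exists, mirroring the ipmp_grp_lookup() check above.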
*/ -static void -ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) -{ - ilm_t *ilm; - ilm_t *ilm_next; - ilm_t *new_ilm; - ilm_t **ilmp; - ip_stack_t *ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - ilmp = &from_ill->ill_ilm; - for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { - ilm_next = ilm->ilm_next; - - if (ilm->ilm_flags & ILM_DELETED) { - ilmp = &ilm->ilm_next; - continue; - } - - ASSERT(ilm->ilm_ipif != NULL); - - if (ilm->ilm_ipif != ipif) { - ilmp = &ilm->ilm_next; - continue; - } - - if (V4_PART_OF_V6(ilm->ilm_v6addr) == - htonl(INADDR_ALLHOSTS_GROUP)) { - new_ilm = ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)); - if (new_ilm != NULL) { - new_ilm->ilm_refcnt += ilm->ilm_refcnt; - /* - * We still need to deal with the from_ill. - */ - new_ilm->ilm_is_new = B_TRUE; - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - ASSERT(ilm->ilm_ipif == ipif); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - if (from_ill->ill_ilm_walker_cnt == 0) { - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), from_ill, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - } - goto delete_ilm; - } - /* - * If we could not find one e.g. ipif is - * still down on to_ill, we add this ilm - * on ill_new to preserve the reference - * count. - */ - } - /* - * When ipifs move, ilms always move with it - * to the NEW ill. Thus we should never be - * able to find ilm till we really move it here. - */ - ASSERT(ilm_lookup_ipif(ipif, - V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); - - if (from_ill->ill_ilm_walker_cnt != 0) { - new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); - if (new_ilm == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip0dbg(("ilm_move_v4: FAILBACK of IPv4" - " multicast address %s : " - "from %s to" - " %s failed : ENOMEM \n", - inet_ntop(AF_INET, - &ilm->ilm_v6addr, buf, - sizeof (buf)), - from_ill->ill_name, - to_ill->ill_name)); - - ilmp = &ilm->ilm_next; - continue; - } - *new_ilm = *ilm; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - new_ilm->ilm_ipif->ipif_ilm_cnt++; - /* We don't want new_ilm linked to ilm's filter list */ - new_ilm->ilm_filter = NULL; - } else { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - new_ilm = ilm; - } - - /* - * If we have never joined this group on the to_ill - * make sure we tell the driver. - */ - if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, - ALL_ZONES) == NULL) - new_ilm->ilm_notify_driver = B_TRUE; - - /* Add to the to_ill's list */ - new_ilm->ilm_next = to_ill->ill_ilm; - to_ill->ill_ilm = new_ilm; - new_ilm->ilm_is_new = B_TRUE; - - /* - * Revert multicast filter state to (EXCLUDE, NULL) - */ - new_ilm->ilm_fmode = MODE_IS_EXCLUDE; - CLEAR_SLIST(new_ilm->ilm_filter); - - /* - * Delete only if we have allocated a new ilm. 
- */ - if (new_ilm != ilm) { -delete_ilm: - if (from_ill->ill_ilm_walker_cnt == 0) { - /* Remove from the list */ - *ilmp = ilm->ilm_next; - ilm->ilm_next = NULL; - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ASSERT(ilm->ilm_ipif->ipif_ilm_cnt > 0); - ilm->ilm_ipif->ipif_ilm_cnt--; - ilm_inactive(ilm); - } else { - ilm->ilm_flags |= ILM_DELETED; - from_ill->ill_ilm_cleanup_reqd = 1; - ilmp = &ilm->ilm_next; - } - } - } -} - -static uint_t -ipif_get_id(ill_t *ill, uint_t id) -{ - uint_t unit; - ipif_t *tipif; - boolean_t found = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * During failback, we want to go back to the same id - * instead of the smallest id so that the original - * configuration is maintained. id is non-zero in that - * case. - */ - if (id != 0) { - /* - * While failing back, if we still have an ipif with - * MAX_ADDRS_PER_IF, it means this will be replaced - * as soon as we return from this function. It was - * to set to MAX_ADDRS_PER_IF by the caller so that - * we can choose the smallest id. Thus we return zero - * in that case ignoring the hint. - */ - if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) - return (0); - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == id) { - found = B_TRUE; - break; - } - } - /* - * If somebody already plumbed another logical - * with the same id, we won't be able to find it. - */ - if (!found) - return (id); - } - for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { - found = B_FALSE; - for (tipif = ill->ill_ipif; tipif != NULL; - tipif = tipif->ipif_next) { - if (tipif->ipif_id == unit) { - found = B_TRUE; - break; - } - } - if (!found) - break; - } - return (unit); -} - /* ARGSUSED */ -static int -ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, - ipif_t **rep_ipif_ptr) +int +ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - ill_t *from_ill; - ipif_t *rep_ipif; - uint_t unit; - int err = 0; - ipif_t *to_ipif; - struct iocblk *iocp; - boolean_t failback_cmd; - boolean_t remove_ipif; - int rc; - ip_stack_t *ipst; - - ASSERT(IAM_WRITER_ILL(to_ill)); - ASSERT(IAM_WRITER_IPIF(ipif)); - - iocp = (struct iocblk *)mp->b_rptr; - failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); - remove_ipif = B_FALSE; - - from_ill = ipif->ipif_ill; - ipst = from_ill->ill_ipst; - - ASSERT(MUTEX_HELD(&to_ill->ill_lock)); - ASSERT(MUTEX_HELD(&from_ill->ill_lock)); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - - /* - * Don't move LINK LOCAL addresses as they are tied to - * physical interface. - */ - if (from_ill->ill_isv6 && - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (0); - } - - /* - * We set the ipif_id to maximum so that the search for - * ipif_id will pick the lowest number i.e 0 in the - * following 2 cases : - * - * 1) We have a replacement ipif at the head of to_ill. - * We can't remove it yet as we can exceed ip_addrs_per_if - * on to_ill and hence the MOVE might fail. We want to - * remove it only if we could move the ipif. Thus, by - * setting it to the MAX value, we make the search in - * ipif_get_id return the zeroth id. - * - * 2) When DR pulls out the NIC and re-plumbs the interface, - * we might just have a zero address plumbed on the ipif - * with zero id in the case of IPv4. We remove that while - * doing the failback. We want to remove it only if we - * could move the ipif. 
Thus, by setting it to the MAX - * value, we make the search in ipif_get_id return the - * zeroth id. - * - * Both (1) and (2) are done only when when we are moving - * an ipif (either due to failover/failback) which originally - * belonged to this interface i.e the ipif_orig_ifindex is - * the same as to_ill's ifindex. This is needed so that - * FAILOVER from A -> B ( A failed) followed by FAILOVER - * from B -> A (B is being removed from the group) and - * FAILBACK from A -> B restores the original configuration. - * Without the check for orig_ifindex, the second FAILOVER - * could make the ipif belonging to B replace the A's zeroth - * ipif and the subsequent failback re-creating the replacement - * ipif again. - * - * NOTE : We created the replacement ipif when we did a - * FAILOVER (See below). We could check for FAILBACK and - * then look for replacement ipif to be removed. But we don't - * want to do that because we wan't to allow the possibility - * of a FAILOVER from A -> B (which creates the replacement ipif), - * followed by a *FAILOVER* from B -> A instead of a FAILBACK - * from B -> A. - */ - to_ipif = to_ill->ill_ipif; - if ((to_ill->ill_phyint->phyint_ifindex == - ipif->ipif_orig_ifindex) && - to_ipif->ipif_replace_zero) { - ASSERT(to_ipif->ipif_id == 0); - remove_ipif = B_TRUE; - to_ipif->ipif_id = MAX_ADDRS_PER_IF; - } - /* - * Find the lowest logical unit number on the to_ill. - * If we are failing back, try to get the original id - * rather than the lowest one so that the original - * configuration is maintained. - * - * XXX need a better scheme for this. - */ - if (failback_cmd) { - unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); - } else { - unit = ipif_get_id(to_ill, 0); - } - - /* Reset back to zero in case we fail below */ - if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) - to_ipif->ipif_id = 0; + ill_t *bound_ill; + struct lifreq *lifr = ifreq; - if (unit == ipst->ips_ip_addrs_per_if) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); + if (!IS_IPMP(ipif->ipif_ill)) return (EINVAL); - } - - /* - * ipif is ready to move from "from_ill" to "to_ill". - * - * 1) If we are moving ipif with id zero, create a - * replacement ipif for this ipif on from_ill. If this fails - * fail the MOVE operation. - * - * 2) Remove the replacement ipif on to_ill if any. - * We could remove the replacement ipif when we are moving - * the ipif with id zero. But what if somebody already - * unplumbed it ? Thus we always remove it if it is present. - * We want to do it only if we are sure we are going to - * move the ipif to to_ill which is why there are no - * returns due to error till ipif is linked to to_ill. - * Note that the first ipif that we failback will always - * be zero if it is present. - */ - if (ipif->ipif_id == 0) { - ipaddr_t inaddr_any = INADDR_ANY; - rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); - if (rep_ipif == NULL) { - ipif->ipif_was_up = B_FALSE; - IPIF_UNMARK_MOVING(ipif); - return (ENOMEM); - } - *rep_ipif = ipif_zero; - /* - * Before we put the ipif on the list, store the addresses - * as mapped addresses as some of the ioctls e.g SIOCGIFADDR - * assumes so. This logic is not any different from what - * ipif_allocate does. 
- */ - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6subnet); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6net_mask); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6brd_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &rep_ipif->ipif_v6pp_dst_addr); - /* - * We mark IPIF_NOFAILOVER so that this can never - * move. - */ - rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; - rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; - rep_ipif->ipif_replace_zero = B_TRUE; - mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, - MUTEX_DEFAULT, NULL); - rep_ipif->ipif_id = 0; - rep_ipif->ipif_ire_type = ipif->ipif_ire_type; - rep_ipif->ipif_ill = from_ill; - rep_ipif->ipif_orig_ifindex = - from_ill->ill_phyint->phyint_ifindex; - /* Insert at head */ - rep_ipif->ipif_next = from_ill->ill_ipif; - from_ill->ill_ipif = rep_ipif; - /* - * We don't really care to let apps know about - * this interface. - */ - } - - if (remove_ipif) { - /* - * We set to a max value above for this case to get - * id zero. ASSERT that we did get one. - */ - ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); - rep_ipif = to_ipif; - to_ill->ill_ipif = rep_ipif->ipif_next; - rep_ipif->ipif_next = NULL; - /* - * If some apps scanned and find this interface, - * it is time to let them know, so that they can - * delete it. - */ - - *rep_ipif_ptr = rep_ipif; - } - - /* Get it out of the ILL interface list. */ - ipif_remove(ipif, B_FALSE); - - /* Assign the new ill */ - ipif->ipif_ill = to_ill; - ipif->ipif_id = unit; - /* id has already been checked */ - rc = ipif_insert(ipif, B_FALSE, B_FALSE); - ASSERT(rc == 0); - /* Let SCTP update its list */ - sctp_move_ipif(ipif, from_ill, to_ill); - /* - * Handle the failover and failback of ipif_t between - * ill_t that have differing maximum mtu values. - */ - if (ipif->ipif_mtu > to_ill->ill_max_mtu) { - if (ipif->ipif_saved_mtu == 0) { - /* - * As this ipif_t is moving to an ill_t - * that has a lower ill_max_mtu, its - * ipif_mtu needs to be saved so it can - * be restored during failback or during - * failover to an ill_t which has a - * higher ill_max_mtu. - */ - ipif->ipif_saved_mtu = ipif->ipif_mtu; - ipif->ipif_mtu = to_ill->ill_max_mtu; - } else { - /* - * The ipif_t is, once again, moving to - * an ill_t that has a lower maximum mtu - * value. - */ - ipif->ipif_mtu = to_ill->ill_max_mtu; - } - } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && - ipif->ipif_saved_mtu != 0) { - /* - * The mtu of this ipif_t had to be reduced - * during an earlier failover; this is an - * opportunity for it to be increased (either as - * part of another failover or a failback). - */ - if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { - ipif->ipif_mtu = ipif->ipif_saved_mtu; - ipif->ipif_saved_mtu = 0; - } else { - ipif->ipif_mtu = to_ill->ill_max_mtu; - } + if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { + lifr->lifr_binding[0] = '\0'; + return (0); } - /* - * We preserve all the other fields of the ipif including - * ipif_saved_ire_mp. The routes that are saved here will - * be recreated on the new interface and back on the old - * interface when we move back. 
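The MTU handling in the removed ipif_move() follows a simple save-and-restore rule: the first time an address moves to an ill with a smaller ill_max_mtu, its current MTU is stashed in ipif_saved_mtu and clamped; a later move to an ill with more headroom restores as much of the saved value as fits, and the saved copy is cleared only once it has been restored in full. Below is a compact standalone restatement of that rule; move_adjust_mtu() is an invented name, and the real code operates on the ipif_t fields directly.

#include <sys/types.h>

/*
 * Clamp *mtu to dst_max, remembering the pre-clamp value in *saved so a
 * later move to a larger-MTU ill can restore it.
 */
static void
move_adjust_mtu(uint_t *mtu, uint_t *saved, uint_t dst_max)
{
        if (*mtu > dst_max) {
                if (*saved == 0)
                        *saved = *mtu;          /* first reduction: remember */
                *mtu = dst_max;
        } else if (*mtu < dst_max && *saved != 0) {
                if (*saved <= dst_max) {
                        *mtu = *saved;          /* fully restored */
                        *saved = 0;
                } else {
                        *mtu = dst_max;         /* partial restore */
                }
        }
}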
- */ - ASSERT(ipif->ipif_arp_del_mp == NULL); - - return (err); -} - -static int -ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, - int ifindex, ipif_t **rep_ipif_ptr) -{ - ipif_t *mipif; - ipif_t *ipif_next; - int err; - - /* - * We don't really try to MOVE back things if some of the - * operations fail. The daemon will take care of moving again - * later on. - */ - for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { - ipif_next = mipif->ipif_next; - if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && - (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { - - err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); - - /* - * When the MOVE fails, it is the job of the - * application to take care of this properly - * i.e try again if it is ENOMEM. - */ - if (mipif->ipif_ill != from_ill) { - /* - * ipif has moved. - * - * Move the multicast memberships associated - * with this ipif to the new ill. For IPv6, we - * do it once after all the ipifs are moved - * (in ill_move) as they are not associated - * with ipifs. - * - * We need to move the ilms as the ipif has - * already been moved to a new ill even - * in the case of errors. Neither - * ilm_free(ipif) will find the ilm - * when somebody unplumbs this ipif nor - * ilm_delete(ilm) will be able to find the - * ilm, if we don't move now. - */ - if (!from_ill->ill_isv6) - ilm_move_v4(from_ill, to_ill, mipif); - } - - if (err != 0) - return (err); - } - } + (void) strlcpy(lifr->lifr_binding, bound_ill->ill_name, LIFNAMSIZ); + ill_refrele(bound_ill); return (0); } -static int -ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) -{ - int ifindex; - int err; - struct iocblk *iocp; - ipif_t *ipif; - ipif_t *rep_ipif_ptr = NULL; - ipif_t *from_ipif = NULL; - boolean_t check_rep_if = B_FALSE; - ip_stack_t *ipst = from_ill->ill_ipst; - - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == SIOCLIFFAILOVER) { - /* - * Move everything pointing at from_ill to to_ill. - * We acheive this by passing in 0 as ifindex. - */ - ifindex = 0; - } else { - /* - * Move everything pointing at from_ill whose original - * ifindex of connp, ipif, ilm points at to_ill->ill_index. - * We acheive this by passing in ifindex rather than 0. - * Multicast vifs, ilgs move implicitly because ipifs move. - */ - ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); - ifindex = to_ill->ill_phyint->phyint_ifindex; - } - - /* - * Determine if there is at least one ipif that would move from - * 'from_ill' to 'to_ill'. If so, it is possible that the replacement - * ipif (if it exists) on the to_ill would be consumed as a result of - * the move, in which case we need to quiesce the replacement ipif also. 
- */ - for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; - from_ipif = from_ipif->ipif_next) { - if (((ifindex == 0) || - (ifindex == from_ipif->ipif_orig_ifindex)) && - !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { - check_rep_if = B_TRUE; - break; - } - } - - ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); - - GRAB_ILL_LOCKS(from_ill, to_ill); - if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - - /* Check if the replacement ipif is quiescent to delete */ - if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, - (iocp->ioc_cmd == SIOCLIFFAILBACK))) { - to_ill->ill_ipif->ipif_state_flags |= - IPIF_MOVING | IPIF_CHANGING; - if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { - (void) ipsq_pending_mp_add(NULL, ipif, q, - mp, ILL_MOVE_OK); - RELEASE_ILL_LOCKS(from_ill, to_ill); - return (EINPROGRESS); - } - } - RELEASE_ILL_LOCKS(from_ill, to_ill); - - ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); - rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - GRAB_ILL_LOCKS(from_ill, to_ill); - err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); - - /* ilm_move is done inside ipif_move for IPv4 */ - if (err == 0 && from_ill->ill_isv6) - ilm_move_v6(from_ill, to_ill, ifindex); - - RELEASE_ILL_LOCKS(from_ill, to_ill); - rw_exit(&ipst->ips_ill_g_lock); - - /* - * send rts messages and multicast messages. - */ - if (rep_ipif_ptr != NULL) { - if (rep_ipif_ptr->ipif_recovery_id != 0) { - (void) untimeout(rep_ipif_ptr->ipif_recovery_id); - rep_ipif_ptr->ipif_recovery_id = 0; - } - ip_rts_ifmsg(rep_ipif_ptr); - ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); -#ifdef DEBUG - ipif_trace_cleanup(rep_ipif_ptr); -#endif - mi_free(rep_ipif_ptr); - } - - conn_move_ill(from_ill, to_ill, ifindex); - - return (err); -} - /* - * Used to extract arguments for FAILOVER/FAILBACK ioctls. - * Also checks for the validity of the arguments. - * Note: We are already exclusive inside the from group. - * It is upto the caller to release refcnt on the to_ill's. + * Process an SIOCGLIFGROUPNAME request. */ -static int -ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, - ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) +/* ARGSUSED */ +int +ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *ifreq) { - int dst_index; - ipif_t *ipif_v4, *ipif_v6; - struct lifreq *lifr; - mblk_t *mp1; - boolean_t exists; - sin_t *sin; - int err = 0; - ip_stack_t *ipst; + ipmp_grp_t *grp; + struct lifreq *lifr = ifreq; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) + lifr->lifr_groupname[0] = '\0'; else - ipst = ILLQ_TO_IPST(q); - - if ((mp1 = mp->b_cont) == NULL) - return (EPROTO); - - if ((mp1 = mp1->b_cont) == NULL) - return (EPROTO); - - lifr = (struct lifreq *)mp1->b_rptr; - sin = (sin_t *)&lifr->lifr_addr; - - /* - * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 - * specific operations. - */ - if (sin->sin_family != AF_UNSPEC) - return (EINVAL); - - /* - * Get ipif with id 0. We are writer on the from ill. So we can pass - * NULLs for the last 4 args and we know the lookup won't fail - * with EINPROGRESS. 
- */ - ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, - mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); - - if (ipif_v4 == NULL && ipif_v6 == NULL) - return (ENXIO); - - if (ipif_v4 != NULL) { - ASSERT(ipif_v4->ipif_refcnt != 0); - if (ipif_v4->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v4)); - *ill_from_v4 = ipif_v4->ipif_ill; - } - - if (ipif_v6 != NULL) { - ASSERT(ipif_v6->ipif_refcnt != 0); - if (ipif_v6->ipif_id != 0) { - err = EINVAL; - goto done; - } - - ASSERT(IAM_WRITER_IPIF(ipif_v6)); - *ill_from_v6 = ipif_v6->ipif_ill; - } - - err = 0; - dst_index = lifr->lifr_movetoindex; - *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - * - * ENXIO will usually be because there is only v6 on the ill, - * that's not treated as an error unless an ENXIO is also - * seen when looking up the v6 "to" ill. - * - * If EINPROGRESS, the mp has been enqueued and can not be - * used to look up the v6 "to" ill, but a preemptive clean - * up of changes to the v6 "from" ipsq is done. - */ - if (err == EINPROGRESS) { - if (*ill_from_v4 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v4->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - err = 0; - } - - *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, - q, mp, ip_process_ioctl, &err, ipst); - if (err != 0) { - /* - * A move may be in progress, EINPROGRESS looking up the "to" - * ill means changes already done to the "from" ipsq need to - * be undone to avoid potential deadlocks. - */ - if (err == EINPROGRESS) { - if (*ill_from_v6 != NULL) { - ill_t *from_ill; - ipsq_t *from_ipsq; - - from_ill = ipif_v6->ipif_ill; - from_ipsq = from_ill->ill_phyint->phyint_ipsq; - - mutex_enter(&from_ipsq->ipsq_lock); - from_ipsq->ipsq_current_ipif = NULL; - mutex_exit(&from_ipsq->ipsq_lock); - } - goto done; - } - ASSERT(err == ENXIO); - - /* Both v4 and v6 lookup failed */ - if (*ill_to_v4 == NULL) { - err = ENXIO; - goto done; - } - err = 0; - } - - /* - * If we have something to MOVE i.e "from" not NULL, - * "to" should be non-NULL. - */ - if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || - (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { - err = EINVAL; - } - -done: - if (ipif_v4 != NULL) - ipif_refrele(ipif_v4); - if (ipif_v6 != NULL) - ipif_refrele(ipif_v6); - return (err); + (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } /* - * FAILOVER and FAILBACK are modelled as MOVE operations. 
- * - * We don't check whether the MOVE is within the same group or - * not, because this ioctl can be used as a generic mechanism - * to failover from interface A to B, though things will function - * only if they are really part of the same group. Moreover, - * all ipifs may be down and hence temporarily out of the group. - * - * ipif's that need to be moved are first brought down; V4 ipifs are brought - * down first and then V6. For each we wait for the ipif's to become quiescent. - * Bringing down the ipifs ensures that all ires pointing to these ipifs's - * have been deleted and there are no active references. Once quiescent the - * ipif's are moved and brought up on the new ill. - * - * Normally the source ill and destination ill belong to the same IPMP group - * and hence the same ipsq_t. In the event they don't belong to the same - * same group the two ipsq's are first merged into one ipsq - that of the - * to_ill. The multicast memberships on the source and destination ill cannot - * change during the move operation since multicast joins/leaves also have to - * execute on the same ipsq and are hence serialized. + * Process an SIOCGLIFGROUPINFO request. */ /* ARGSUSED */ int -ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) +ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, + ip_ioctl_cmd_t *ipip, void *dummy) { - ill_t *ill_to_v4 = NULL; - ill_t *ill_to_v6 = NULL; - ill_t *ill_from_v4 = NULL; - ill_t *ill_from_v6 = NULL; - int err = 0; - - /* - * setup from and to ill's, we can get EINPROGRESS only for - * to_ill's. - */ - err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, - &ill_to_v4, &ill_to_v6); - - if (err != 0) { - ip0dbg(("ip_sioctl_move: extract args failed\n")); - goto done; - } - - /* - * nothing to do. - */ - if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { - goto done; - } - - /* - * nothing to do. - */ - if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { - goto done; - } - - /* - * Mark the ill as changing. - * ILL_CHANGING flag is cleared when the ipif's are brought up - * in ill_up_ipifs in case of error they are cleared below. - */ - - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - if (ill_from_v4 != NULL) - ill_from_v4->ill_state_flags |= ILL_CHANGING; - if (ill_from_v6 != NULL) - ill_from_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - - /* - * Make sure that both src and dst are - * in the same syncq group. If not make it happen. - * We are not holding any locks because we are the writer - * on the from_ipsq and we will hold locks in ill_merge_groups - * to protect to_ipsq against changing. - */ - if (ill_from_v4 != NULL) { - if (ill_from_v4->ill_phyint->phyint_ipsq != - ill_to_v4->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v4, ill_to_v4, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); - } else { - - if (ill_from_v6->ill_phyint->phyint_ipsq != - ill_to_v6->ill_phyint->phyint_ipsq) { - err = ill_merge_groups(ill_from_v6, ill_to_v6, - NULL, mp, q); - goto err_ret; - - } - ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); - } - - /* - * Now that the ipsq's have been merged and we are the writer - * lets mark to_ill as changing as well. 
- */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - if (ill_to_v4 != NULL) - ill_to_v4->ill_state_flags |= ILL_CHANGING; - if (ill_to_v6 != NULL) - ill_to_v6->ill_state_flags |= ILL_CHANGING; - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - - /* - * Its ok for us to proceed with the move even if - * ill_pending_mp is non null on one of the from ill's as the reply - * should not be looking at the ipif, it should only care about the - * ill itself. - */ - - /* - * lets move ipv4 first. - */ - if (ill_from_v4 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v4)); - ill_from_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_in_progress = B_TRUE; - ill_to_v4->ill_move_peer = ill_from_v4; - ill_from_v4->ill_move_peer = ill_to_v4; - err = ill_move(ill_from_v4, ill_to_v4, q, mp); - } - - /* - * Now lets move ipv6. - */ - if (err == 0 && ill_from_v6 != NULL) { - ASSERT(IAM_WRITER_ILL(ill_to_v6)); - ill_from_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_in_progress = B_TRUE; - ill_to_v6->ill_move_peer = ill_from_v6; - ill_from_v6->ill_move_peer = ill_to_v6; - err = ill_move(ill_from_v6, ill_to_v6, q, mp); - } - -err_ret: - /* - * EINPROGRESS means we are waiting for the ipif's that need to be - * moved to become quiescent. - */ - if (err == EINPROGRESS) { - goto done; - } - - /* - * if err is set ill_up_ipifs will not be called - * lets clear the flags. - */ - - GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); - GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); - /* - * Some of the clearing may be redundant. But it is simple - * not making any extra checks. - */ - if (ill_from_v6 != NULL) { - ill_from_v6->ill_move_in_progress = B_FALSE; - ill_from_v6->ill_move_peer = NULL; - ill_from_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_from_v4 != NULL) { - ill_from_v4->ill_move_in_progress = B_FALSE; - ill_from_v4->ill_move_peer = NULL; - ill_from_v4->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v6 != NULL) { - ill_to_v6->ill_move_in_progress = B_FALSE; - ill_to_v6->ill_move_peer = NULL; - ill_to_v6->ill_state_flags &= ~ILL_CHANGING; - } - if (ill_to_v4 != NULL) { - ill_to_v4->ill_move_in_progress = B_FALSE; - ill_to_v4->ill_move_peer = NULL; - ill_to_v4->ill_state_flags &= ~ILL_CHANGING; - } - - /* - * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. - * Do this always to maintain proper state i.e even in case of errors. - * As phyint_inactive looks at both v4 and v6 interfaces, - * we need not call on both v4 and v6 interfaces. - */ - if (ill_from_v4 != NULL) { - if ((ill_from_v4->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v4->ill_phyint); - } - } else if (ill_from_v6 != NULL) { - if ((ill_from_v6->ill_phyint->phyint_flags & - (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { - phyint_inactive(ill_from_v6->ill_phyint); - } - } - - if (ill_to_v4 != NULL) { - if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } else if (ill_to_v6 != NULL) { - if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { - ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; - } - } - - RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); - RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); - -no_err: - /* - * lets bring the interfaces up on the to_ill. - */ - if (err == 0) { - err = ill_up_ipifs(ill_to_v4 == NULL ? 
ill_to_v6:ill_to_v4, - q, mp); - } - - if (err == 0) { - if (ill_from_v4 != NULL && ill_to_v4 != NULL) - ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); + lifgroupinfo_t *lifgr; + ipmp_grp_t *grp; + ip_stack_t *ipst = CONNQ_TO_IPST(q); - if (ill_from_v6 != NULL && ill_to_v6 != NULL) - ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); - } -done: + /* ip_wput_nondata() verified mp->b_cont->b_cont */ + lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr; + lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0'; - if (ill_to_v4 != NULL) { - ill_refrele(ill_to_v4); - } - if (ill_to_v6 != NULL) { - ill_refrele(ill_to_v6); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOENT); } - - return (err); + ipmp_grp_info(grp, lifgr); + rw_exit(&ipst->ips_ipmp_lock); + return (0); } static void @@ -18167,10 +14492,9 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) * we only wait for the ACK of the DL_UNBIND_REQ. */ mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_CONDEMNED) || - (prim == DL_UNBIND_REQ)) { + if (!(ill->ill_state_flags & ILL_CONDEMNED) || (prim == DL_UNBIND_REQ)) ill->ill_dlpi_pending = prim; - } + mutex_exit(&ill->ill_lock); putnext(ill->ill_wq, mp); } @@ -18324,6 +14648,7 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) { mblk_t *mp; ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; + ipxop_t *ipx = ipsq->ipsq_xop; ASSERT(IAM_WRITER_IPSQ(ipsq)); mutex_enter(&ill->ill_lock); @@ -18336,12 +14661,11 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) if ((mp = ill->ill_dlpi_deferred) == NULL) { ill->ill_dlpi_pending = DL_PRIM_INVAL; - - mutex_enter(&ipsq->ipsq_lock); - if (ipsq->ipsq_current_done) - ipsq->ipsq_current_ipif = NULL; - mutex_exit(&ipsq->ipsq_lock); - + if (ipx->ipx_current_done) { + mutex_enter(&ipx->ipx_lock); + ipx->ipx_current_ipif = NULL; + mutex_exit(&ipx->ipx_lock); + } cv_signal(&ill->ill_cv); mutex_exit(&ill->ill_lock); return; @@ -18379,7 +14703,7 @@ conn_delete_ire(conn_t *connp, caddr_t arg) } /* - * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number + * Some operations (e.g., ipif_down()) conditionally delete a number * of IREs. Those IREs may have been previously cached in the conn structure. * This ipcl_walk() walker function releases all references to such IREs based * on the condemned flag. @@ -18403,7 +14727,6 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) /* * Take down a specific interface, but don't lose any information about it. - * Also delete interface from its interface group (ifgrp). * (Always called as writer.) * This function goes through the down sequence even if the interface is * already down. There are 2 reasons. @@ -18501,7 +14824,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * For eg. bind, and route operations (Eg. route add / delete) cannot return * failure if the ipif is currently undergoing an exclusive operation, and * hence pass the flag. The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the currently exclusive ioctl completes. + * is restarted by ipsq_exit() when the current exclusive operation completes. * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. 
Before dropping the ill_lock we acquire @@ -18522,7 +14845,6 @@ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { ill_t *ill = ipif->ipif_ill; - phyint_t *phyi; conn_t *connp; boolean_t success; boolean_t ipif_was_up = B_FALSE; @@ -18569,20 +14891,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Before we delete the ill from the group (if any), we need - * to make sure that we delete all the routes dependent on - * this and also any ipifs dependent on this ipif for - * source address. We need to do before we delete from - * the group because - * - * 1) ipif_down_delete_ire de-references ill->ill_group. - * - * 2) ipif_update_other_ipifs needs to walk the whole group - * for re-doing source address selection. Note that - * ipif_select_source[_v6] called from - * ipif_update_other_ipifs[_v6] will not pick this ipif - * because we have already marked down here i.e cleared - * IPIF_UP. + * Delete all IREs pointing at this ipif or its source address. */ if (ipif->ipif_isv6) { ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, @@ -18592,6 +14901,17 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ipst); } + if (ipif_was_up && ill->ill_ipif_up_count == 0) { + /* + * Since the interface is now down, it may have just become + * inactive. Note that this needs to be done even for an + * ill_logical_down(), or ARP entries will not get correctly + * restored when the interface comes back up. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + /* * Cleaning up the conn_ire_cache or conns must be done only after the * ires have been deleted above. Otherwise a thread could end up @@ -18609,53 +14929,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) * entries for such ipifs. */ if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif, ill->ill_group); + ipif_update_other_ipifs_v6(ipif); else - ipif_update_other_ipifs(ipif, ill->ill_group); - - if (ipif_was_up) { - /* - * Check whether it is last ipif to leave this group. - * If this is the last ipif to leave, we should remove - * this ill from the group as ipif_select_source will not - * be able to find any useful ipifs if this ill is selected - * for load balancing. - * - * For nameless groups, we should call ifgrp_delete if this - * belongs to some group. As this ipif is going down, we may - * need to reconstruct groups. - */ - phyi = ill->ill_phyint; - /* - * If the phyint_groupname_len is 0, it may or may not - * be in the nameless group. If the phyint_groupname_len is - * not 0, then this ill should be part of some group. - * As we always insert this ill in the group if - * phyint_groupname_len is not zero when the first ipif - * comes up (in ipif_up_done), it should be in a group - * when the namelen is not 0. - * - * NOTE : When we delete the ill from the group,it will - * blow away all the IRE_CACHES pointing either at this ipif or - * ill_wq (illgrp_cache_delete does this). Thus, no IRES - * should be pointing at this ill. - */ - ASSERT(phyi->phyint_groupname_len == 0 || - (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); - - if (phyi->phyint_groupname_len != 0) { - if (ill->ill_ipif_up_count == 0) - illgrp_delete(ill); - } - - /* - * If we have deleted some of the broadcast ires associated - * with this ipif, we need to re-nominate somebody else if - * the ires that we deleted were the nominated ones. - */ - if (ill->ill_group != NULL && !ill->ill_isv6) - ipif_renominate_bcast(ipif); - } + ipif_update_other_ipifs(ipif); /* * neighbor-discovery or arp entries for this interface.
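The ipmp_ill_refresh_active() call added above is the hook that keeps an IPMP group's view of its usable members current when an underlying interface loses its last up address. Below is a minimal userland sketch of that idea only, assuming hypothetical iface_t/group_t types and a group_refresh_active() stand-in; the activity test here is deliberately far simpler than the real kernel check, which weighs considerably more interface state.

#include <stdio.h>
#include <stdbool.h>

#define	GRP_MAX	4

typedef struct iface {
	const char *name;
	int up_addr_cnt;	/* loosely analogous to ill_ipif_up_count */
	bool failed;		/* loosely analogous to a FAILED flag */
	bool active;		/* may carry data traffic for the group */
} iface_t;

typedef struct group {
	iface_t *members[GRP_MAX];
	int nmembers;
} group_t;

/*
 * Recompute the "active" flag for every member of the group; the real
 * kernel check considers much more state than this.
 */
static void
group_refresh_active(group_t *grp)
{
	for (int i = 0; i < grp->nmembers; i++) {
		iface_t *ifp = grp->members[i];

		ifp->active = (ifp->up_addr_cnt > 0 && !ifp->failed);
	}
}

int
main(void)
{
	iface_t net0 = { "net0", 1, false, true };
	iface_t net1 = { "net1", 1, false, true };
	group_t grp = { { &net0, &net1 }, 2 };

	net0.up_addr_cnt = 0;		/* last address on net0 went down */
	group_refresh_active(&grp);	/* cf. ipmp_ill_refresh_active() */

	for (int i = 0; i < grp.nmembers; i++) {
		printf("%s active=%d\n", grp.members[i]->name,
		    grp.members[i]->active);
	}
	return (0);
}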
@@ -18734,17 +15010,12 @@ ipif_down_tail(ipif_t *ipif) ill->ill_logical_down = 0; /* - * Have to be after removing the routes in ipif_down_delete_ire. + * Has to be after removing the routes in ipif_down_delete_ire. */ - if (ipif->ipif_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) - ipif_arp_down(ipif); - } else { - ipif_arp_down(ipif); - } + ipif_resolver_down(ipif); - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); } /* @@ -18804,39 +15075,11 @@ static void ipif_down_delete_ire(ire_t *ire, char *ipif_arg) { ipif_t *ipif = (ipif_t *)ipif_arg; - ill_t *ire_ill; - ill_t *ipif_ill; ASSERT(IAM_WRITER_IPIF(ipif)); if (ire->ire_ipif == NULL) return; - /* - * For IPv4, we derive source addresses for an IRE from ipif's - * belonging to the same IPMP group as the IRE's outgoing - * interface. If an IRE's outgoing interface isn't in the - * same IPMP group as a particular ipif, then that ipif - * couldn't have been used as a source address for this IRE. - * - * For IPv6, source addresses are only restricted to the IPMP group - * if the IRE is for a link-local address or a multicast address. - * Otherwise, source addresses for an IRE can be chosen from - * interfaces other than the the outgoing interface for that IRE. - * - * For source address selection details, see ipif_select_source() - * and ipif_select_source_v6(). - */ - if (ire->ire_ipversion == IPV4_VERSION || - IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - ire_ill = ire->ire_ipif->ipif_ill; - ipif_ill = ipif->ipif_ill; - - if (ire_ill->ill_group != ipif_ill->ill_group) { - return; - } - } - if (ire->ire_ipif != ipif) { /* * Look for a matching source address. @@ -18875,83 +15118,53 @@ void ill_ipif_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ipif_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_ipif matches ill. - * We are only interested in IRE_CACHES that has borrowed - * the source address from ill_arg e.g. ipif_up_done[_v6] - * for which we need to look at ire_ipif->ipif_ill match - * with ill. + * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_ipif matches. */ ASSERT(ire->ire_ipif != NULL); - ipif_ill = ire->ire_ipif->ipif_ill; - if (ipif_ill == ill || (ill->ill_group != NULL && - ipif_ill->ill_group == ill->ill_group)) { + if (ill == ire->ire_ipif->ipif_ill) ire_delete(ire); - } } /* - * Delete all the ire whose stq references ill_arg. + * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this + * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references + * the IPMP ill. */ -static void +void ill_stq_cache_delete(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; - ill_t *ire_ill; ASSERT(IAM_WRITER_ILL(ill)); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ ASSERT(ire->ire_type == IRE_CACHE); /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are only interested in IRE_CACHES that - * has ire_stq->q_ptr pointing at ill_arg. Thus we do the - * filtering here. 
+ * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches + * ill, but we only want to delete the IRE if ire_stq matches. */ - ire_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire_ill == ill) + if (ire->ire_stq->q_ptr == ill_arg) ire_delete(ire); } /* - * This is called when an ill leaves the group. We want to delete - * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is - * pointing at ill. + * Delete all broadcast IREs with a source address on `ill_arg'. */ static void -illgrp_cache_delete(ire_t *ire, char *ill_arg) +ill_broadcast_delete(ire_t *ire, char *ill_arg) { - ill_t *ill = (ill_t *)ill_arg; + ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_group == NULL); - /* - * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. - * Hence this should be IRE_CACHE. - */ - ASSERT(ire->ire_type == IRE_CACHE); - /* - * We are called for IRE_CACHES whose ire_stq and ire_ipif - * matches ill. We are interested in both. - */ - ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || - (ire->ire_ipif->ipif_ill == ill)); + ASSERT(ire->ire_type == IRE_BROADCAST); - ire_delete(ire); + if (ire->ire_ipif->ipif_ill == ill) + ire_delete(ire); } /* @@ -18997,13 +15210,12 @@ ipif_free(ipif_t *ipif) rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); /* Remove pointers to this ill in the multicast routing tables */ reset_mrt_vif_ipif(ipif); + /* If necessary, clear the cached source ipif rotor. */ + if (ipif->ipif_ill->ill_src_ipif == ipif) + ipif->ipif_ill->ill_src_ipif = NULL; rw_exit(&ipst->ips_ill_g_lock); } -/* - * Warning: this is not the only function that calls mi_free on an ipif_t. See - * also ill_move(). - */ static void ipif_free_tail(ipif_t *ipif) { @@ -19036,7 +15248,7 @@ ipif_free_tail(ipif_t *ipif) sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); /* Get it out of the ILL interface list. */ - ipif_remove(ipif, B_TRUE); + ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); mutex_destroy(&ipif->ipif_saved_ire_lock); @@ -19208,8 +15420,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, } else if (IPIF_CAN_WAIT(ipif, q)) { ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ill->ill_lock); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); RELEASE_CONN_LOCK(q); ill_refrele(ill); @@ -19244,7 +15458,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ire_type = IRE_LOOPBACK; else ire_type = IRE_LOCAL; - ipif = ipif_allocate(ill, id, ire_type, B_TRUE); + ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); else if (error != NULL) @@ -19342,65 +15556,62 @@ ill_mtu_change(ire_t *ire, char *ill_arg) void ipif_multicast_up(ipif_t *ipif) { - int err, index; + int err; ill_t *ill; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; - index = ill->ill_phyint->phyint_ifindex; ip1dbg(("ipif_multicast_up\n")); if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) return; if (ipif->ipif_isv6) { + in6_addr_t v6allmc = ipv6_all_hosts_mcast; + in6_addr_t v6solmc = ipv6_solicited_node_mcast; + + v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) return; - /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); + /* - * Passing B_TRUE means we have to join the multicast - * membership on this interface even though this is - * FAILED. 
If we join on a different one in the group, - * we will not be able to delete the membership later - * as we currently don't track where we join when we - * join within the kernel unlike applications where - * we have ilg/ilg_orig_index. See ip_addmulti_v6 - * for more on this. + * Join the all hosts multicast address. We skip this for + * underlying IPMP interfaces since they should be invisible. */ - err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { - ip0dbg(("ipif_multicast_up: " - "all_hosts_mcast failed %d\n", - err)); - return; + if (!IS_UNDER_IPMP(ill)) { + err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + if (err != 0) { + ip0dbg(("ipif_multicast_up: " + "all_hosts_mcast failed %d\n", err)); + return; + } + ipif->ipif_joined_allhosts = 1; } + /* * Enable multicast for the solicited node multicast address */ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_addmulti_v6(&ipv6_multi, ill, index, - ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, - NULL); + err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); if (err != 0) { ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, - ill, ill->ill_phyint->phyint_ifindex, - ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (ipif->ipif_joined_allhosts) { + (void) ip_delmulti_v6(&v6allmc, ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + ipif->ipif_joined_allhosts = 0; + } return; } } } else { - if (ipif->ipif_lcl_addr == INADDR_ANY) + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ @@ -19420,7 +15631,7 @@ ipif_multicast_up(ipif_t *ipif) * (Explicit memberships are blown away in ill_leave_multicast() when the * ill is brought down.) */ -static void +void ipif_multicast_down(ipif_t *ipif) { int err; @@ -19444,19 +15655,18 @@ ipif_multicast_down(ipif_t *ipif) } /* - * Leave the all hosts multicast address. Similar to ip_addmulti_v6, - * we should look for ilms on this ill rather than the ones that have - * been failed over here. They are here temporarily. As - * ipif_multicast_up has joined on this ill, we should delete only - * from this ill. + * Leave the all-hosts multicast address. */ - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, - B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", - err)); + if (ipif->ipif_joined_allhosts) { + err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, + ipif->ipif_zoneid, B_TRUE, B_TRUE); + if (err != 0) { + ip0dbg(("ipif_multicast_down: all_hosts_mcast " + "failed %d\n", err)); + } + ipif->ipif_joined_allhosts = 0; } + /* * Disable multicast for the solicited node multicast address */ @@ -19467,9 +15677,7 @@ ipif_multicast_down(ipif_t *ipif) ipif->ipif_v6lcl_addr.s6_addr32[3]; err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { ip0dbg(("ipif_multicast_down: sol MC failed %d\n", err)); @@ -19683,9 +15891,8 @@ ipif_set_default(ipif_t *ipif) * Return 0 if this address can be used as local address without causing * duplicate address problems. 
Otherwise, return EADDRNOTAVAIL if the address * is already up on a different ill, and EADDRINUSE if it's up on the same ill. - * Special checks are needed to allow the same IPv6 link-local address - * on different ills. - * TODO: allowing the same site-local address on different ill's. + * Note that the same IPv6 link-local address is allowed as long as the ills + * are not on the same link. */ int ip_addr_availability_check(ipif_t *new_ipif) @@ -19717,30 +15924,26 @@ ip_addr_availability_check(ipif_t *new_ipif) ipif = ipif->ipif_next) { if ((ipif == new_ipif) || !(ipif->ipif_flags & IPIF_UP) || - (ipif->ipif_flags & IPIF_UNNUMBERED)) + (ipif->ipif_flags & IPIF_UNNUMBERED) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + &our_v6addr)) continue; - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &our_v6addr)) { - if (new_ipif->ipif_flags & IPIF_POINTOPOINT) - new_ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (ipif->ipif_flags & IPIF_POINTOPOINT) - ipif->ipif_flags |= IPIF_UNNUMBERED; - else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && - new_ipif->ipif_ill != ill) - continue; - else if (new_ipif->ipif_zoneid != - ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES && - IS_LOOPBACK(ill)) - continue; - else if (new_ipif->ipif_ill == ill) - return (EADDRINUSE); - else - return (EADDRNOTAVAIL); - } + + if (new_ipif->ipif_flags & IPIF_POINTOPOINT) + new_ipif->ipif_flags |= IPIF_UNNUMBERED; + else if (ipif->ipif_flags & IPIF_POINTOPOINT) + ipif->ipif_flags |= IPIF_UNNUMBERED; + else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || + IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && + !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) + continue; + else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && + ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) + continue; + else if (new_ipif->ipif_ill == ill) + return (EADDRINUSE); + else + return (EADDRNOTAVAIL); } } @@ -19753,13 +15956,15 @@ ip_addr_availability_check(ipif_t *new_ipif) * When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. */ -static int +int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) { - ill_t *ill = ipif->ipif_ill; - boolean_t isv6 = ipif->ipif_isv6; - int err = 0; - boolean_t success; + ill_t *ill = ipif->ipif_ill; + boolean_t isv6 = ipif->ipif_isv6; + int err = 0; + boolean_t success; + uint_t ipif_orig_id; + ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_IPIF(ipif)); @@ -19769,6 +15974,123 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) if (ipif->ipif_flags & IPIF_UP) return (EALREADY); + /* + * If this is a request to bring up a data address on an interface + * under IPMP, then move the address to its IPMP meta-interface and + * try to bring it up. One complication is that the zeroth ipif for + * an ill is special, in that every ill always has one, and that code + * throughout IP dereferences ill->ill_ipif without holding any locks. + */ + if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && + (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { + ipif_t *stubipif = NULL, *moveipif = NULL; + ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + + /* + * The ipif being brought up should be quiesced. If it's not, + * something has gone amiss and we need to bail out. (If it's + * quiesced, we know it will remain so via IPIF_CHANGING.)
+ */ + mutex_enter(&ill->ill_lock); + if (!ipif_is_quiescent(ipif)) { + mutex_exit(&ill->ill_lock); + return (EINVAL); + } + mutex_exit(&ill->ill_lock); + + /* + * If we're going to need to allocate ipifs, do it prior + * to starting the move (and grabbing locks). + */ + if (ipif->ipif_id == 0) { + moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, + B_FALSE); + if (moveipif == NULL || stubipif == NULL) { + mi_free(moveipif); + mi_free(stubipif); + return (ENOMEM); + } + } + + /* + * Grab or transfer the ipif to move. During the move, keep + * ill_g_lock held to prevent any ill walker threads from + * seeing things in an inconsistent state. + */ + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ipif->ipif_id != 0) { + ipif_remove(ipif); + } else { + ipif_transfer(ipif, moveipif, stubipif); + ipif = moveipif; + } + + /* + * Place the ipif on the IPMP ill. If the zeroth ipif on + * the IPMP ill is a stub (0.0.0.0 down address) then we + * replace that one. Otherwise, pick the next available slot. + */ + ipif->ipif_ill = ipmp_ill; + ipif_orig_id = ipif->ipif_id; + + if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { + ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); + ipif = ipmp_ill->ill_ipif; + } else { + ipif->ipif_id = -1; + if (ipif_insert(ipif, B_FALSE) != 0) { + /* + * No more available ipif_id's -- put it back + * on the original ill and fail the operation. + * Since we're writer on the ill, we can be + * sure our old slot is still available. + */ + ipif->ipif_id = ipif_orig_id; + ipif->ipif_ill = ill; + if (ipif_orig_id == 0) { + ipif_transfer(ipif, ill->ill_ipif, + NULL); + } else { + VERIFY(ipif_insert(ipif, B_FALSE) == 0); + } + rw_exit(&ipst->ips_ill_g_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Tell SCTP that the ipif has moved. Note that even if we + * had to allocate a new ipif, the original sequence id was + * preserved and therefore SCTP won't know. + */ + sctp_move_ipif(ipif, ill, ipmp_ill); + + /* + * If the ipif being brought up was on slot zero, then we + * first need to bring up the placeholder we stuck there. In + * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call + * to ipif_up() itself, if we successfully bring up the + * placeholder, we'll check ill_move_ipif and bring it up too. + */ + if (ipif_orig_id == 0) { + ASSERT(ill->ill_move_ipif == NULL); + ill->ill_move_ipif = ipif; + if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) + ASSERT(ill->ill_move_ipif == NULL); + if (err != EINPROGRESS) + ill->ill_move_ipif = NULL; + return (err); + } + + /* + * Bring it up on the IPMP ill. + */ + return (ipif_up(ipif, q, mp)); + } + /* Skip arp/ndp for any loopback interface. */ if (ill->ill_wq != NULL) { conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; @@ -19798,7 +16120,6 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(connp != NULL || !CONN_Q(q)); - ASSERT(ipsq->ipsq_pending_mp == NULL); if (connp != NULL) mutex_enter(&connp->conn_lock); mutex_enter(&ill->ill_lock); @@ -19810,27 +16131,25 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up IPv6 neighbor discovery - * Unlike ARP, this should complete when - * ipif_ndp_up returns. However, for - * ILLF_XRESOLV interfaces we also send a - * AR_INTERFACE_UP to the external resolver. - * That ioctl will complete in ip_rput. + * Crank up the resolver. 
For IPv6, this cranks up the + * external resolver if one is configured, but even if an + * external resolver isn't configured, it must be called to + * reset DAD state. For IPv6, if an external resolver is not + * being used, ipif_resolver_up() will never return + * EINPROGRESS, so we can always call ipif_ndp_up() here. + * Note that if an external resolver is being used, there's no + * need to call ipif_ndp_up() since it will do nothing. */ - if (isv6) { - err = ipif_ndp_up(ipif); - if (err != 0) { - if (err != EINPROGRESS) - mp = ipsq_pending_mp_get(ipsq, &connp); - return (err); - } - } - /* Now, ARP */ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done */ + /* We will complete it in ip_arp_done() */ return (err); } + + if (isv6 && err == 0) + err = ipif_ndp_up(ipif, B_TRUE); + + ASSERT(err != EINPROGRESS); mp = ipsq_pending_mp_get(ipsq, &connp); ASSERT(mp != NULL); if (err != 0) @@ -19843,7 +16162,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; } - return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); + + err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); + if (err == 0 && ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ill->ill_move_ipif = NULL; + return (ipif_up(ipif, q, mp)); + } + return (err); } /* @@ -19939,13 +16265,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) return (EINPROGRESS); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); - /* - * We don't have to check for possible removal from illgrp - * as we have not yet inserted in illgrp. For groups - * without names, this ipif is still not UP and hence - * this could not have possibly had any influence in forming - * groups. - */ freemsg(bind_mp); freemsg(unbind_mp); @@ -19974,12 +16293,10 @@ ipif_up_done(ipif_t *ipif) ipif_t *tmp_ipif; boolean_t flush_ire_cache = B_TRUE; int err = 0; - phyint_t *phyi; ire_t **ipif_saved_irep = NULL; int ipif_saved_ire_cnt; int cnt; boolean_t src_ipif_held = B_FALSE; - boolean_t ire_added = B_FALSE; boolean_t loopback = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -20010,7 +16327,7 @@ ipif_up_done(ipif_t *ipif) break; } if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); /* @@ -20044,7 +16361,9 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOCAL; } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || + ((ipif->ipif_flags & IPIF_DEPRECATED) && + !(ipif->ipif_flags & IPIF_NOFAILOVER))) { /* * Can't use our source address. Select a different * source address for the IRE_INTERFACE and IRE_LOCAL @@ -20189,11 +16508,9 @@ ipif_up_done(ipif_t *ipif) } /* - * Need to atomically check for ip_addr_availablity_check - * under ip_addr_avail_lock, and if it fails got bad, and remove - * from group also.The ill_g_lock is grabbed as reader - * just to make sure no new ills or new ipifs are being added - * to the system while we are checking the uniqueness of addresses. + * Need to atomically check for IP address availability under + * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new + * ills or new ipifs can be added while we are checking availability. 
*/ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&ipst->ips_ip_addr_avail_lock); @@ -20227,13 +16544,6 @@ ipif_up_done(ipif_t *ipif) /* * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. - * We want to add before we call ifgrp_insert which wants - * to know whether IRE_IF_RESOLVER exists or not. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -20243,44 +16553,6 @@ ipif_up_done(ipif_t *ipif) */ (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); } - ire_added = B_TRUE; - /* - * Form groups if possible. - * - * If we are supposed to be in a ill_group with a name, insert it - * now as we know that at least one ipif is UP. Otherwise form - * nameless groups. - * - * If ip_enable_group_ifs is set and ipif address is not 0, insert - * this ipif into the appropriate interface group, or create a - * new one. If this is already in a nameless group, we try to form - * a bigger group looking at other ills potentially sharing this - * ipif's prefix. - */ - phyi = ill->ill_phyint; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - if (ill->ill_ipif_up_count == 1) { - ASSERT(ill->ill_group == NULL); - err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, - phyi->phyint_groupname, NULL, B_TRUE); - if (err != 0) { - ip1dbg(("ipif_up_done: illgrp allocation " - "failed, error %d\n", err)); - goto bad; - } - } - ASSERT(ill->ill_group != NULL); - } - - /* - * When this is part of group, we need to make sure that - * any broadcast ires created because of this ipif coming - * UP gets marked/cleared with IRE_MARK_NORECV appropriately - * so that we don't receive duplicate broadcast packets. - */ - if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) - ipif_renominate_bcast(ipif); /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; @@ -20331,19 +16603,30 @@ ipif_up_done(ipif_t *ipif) */ ill_recover_multicast(ill); } - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - if (!loopback) { + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + /* - * See whether anybody else would benefit from the - * new ipif that we added. We call this always rather - * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST - * ipif is for the benefit of illgrp_insert (done above) - * which does not do source address selection as it does - * not want to re-create interface routes that we are - * having reference to it here. + * If this is an IPMP interface, we may now be able to + * establish ARP entries. */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + /* + * See if anybody else would benefit from our new ipif. + */ + if (!loopback && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { ill_update_source_selection(ill); } @@ -20386,27 +16669,11 @@ ipif_up_done(ipif_t *ipif) bad: ip1dbg(("ipif_up_done: FAILED \n")); - /* - * We don't have to bother removing from ill groups because - * - * 1) For groups with names, we insert only when the first ipif - * comes up. In that case if it fails, it will not be in any - * group. 
So, we need not try to remove for that case. - * - * 2) For groups without names, either we tried to insert ipif_ill - * in a group as singleton or found some other group to become - * a bigger group. For the former, if it fails we don't have - * anything to do as ipif_ill is not in the group and for the - * latter, there are no failures in illgrp_insert/illgrp_delete - * (ENOMEM can't occur for this. Check ifgrp_insert). - */ + while (irep > ire_array) { irep--; - if (*irep != NULL) { + if (*irep != NULL) ire_delete(*irep); - if (ire_added) - ire_refrele(*irep); - } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); @@ -20417,7 +16684,7 @@ bad: if (src_ipif_held) ipif_refrele(src_ipif); - ipif_arp_down(ipif); + ipif_resolver_down(ipif); return (err); } @@ -20493,119 +16760,6 @@ ill_arp_on(ill_t *ill) } /* - * Called after either deleting ill from the group or when setting - * FAILED or STANDBY on the interface. - */ -static void -illgrp_reset_schednext(ill_t *ill) -{ - ill_group_t *illgrp; - ill_t *save_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - /* - * When called from illgrp_delete, ill_group will be non-NULL. - * But when called from ip_sioctl_flags, it could be NULL if - * somebody is setting FAILED/INACTIVE on some interface which - * is not part of a group. - */ - illgrp = ill->ill_group; - if (illgrp == NULL) - return; - if (illgrp->illgrp_ill_schednext != ill) - return; - - illgrp->illgrp_ill_schednext = NULL; - save_ill = ill; - /* - * Choose a good ill to be the next one for - * outbound traffic. As the flags FAILED/STANDBY is - * not yet marked when called from ip_sioctl_flags, - * we check for ill separately. - */ - for (ill = illgrp->illgrp_ill; ill != NULL; - ill = ill->ill_group_next) { - if ((ill != save_ill) && - !(ill->ill_phyint->phyint_flags & - (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { - illgrp->illgrp_ill_schednext = ill; - return; - } - } -} - -/* - * Given an ill, find the next ill in the group to be scheduled. - * (This should be called by ip_newroute() before ire_create().) - * The passed in ill may be pulled out of the group, after we have picked - * up a different outgoing ill from the same group. However ire add will - * atomically check this. - */ -ill_t * -illgrp_scheduler(ill_t *ill) -{ - ill_t *retill; - ill_group_t *illgrp; - int illcnt; - int i; - uint64_t flags; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We don't use a lock to check for the ill_group. If this ill - * is currently being inserted we may end up just returning this - * ill itself. That is ok. - */ - if (ill->ill_group == NULL) { - ill_refhold(ill); - return (ill); - } - - /* - * Grab the ill_g_lock as reader to make sure we are dealing with - * a set of stable ills. No ill can be added or deleted or change - * group while we hold the reader lock. - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if ((illgrp = ill->ill_group) == NULL) { - rw_exit(&ipst->ips_ill_g_lock); - ill_refhold(ill); - return (ill); - } - - illcnt = illgrp->illgrp_ill_count; - mutex_enter(&illgrp->illgrp_lock); - retill = illgrp->illgrp_ill_schednext; - - if (retill == NULL) - retill = illgrp->illgrp_ill; - - /* - * We do a circular search beginning at illgrp_ill_schednext - * or illgrp_ill. We don't check the flags against the ill lock - * since it can change anytime. The ire creation will be atomic - * and will fail if the ill is FAILED or OFFLINE. 
- */ - for (i = 0; i < illcnt; i++) { - flags = retill->ill_phyint->phyint_flags; - - if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && - ILL_CAN_LOOKUP(retill)) { - illgrp->illgrp_ill_schednext = retill->ill_group_next; - ill_refhold(retill); - break; - } - retill = retill->ill_group_next; - if (retill == NULL) - retill = illgrp->illgrp_ill; - } - mutex_exit(&illgrp->illgrp_lock); - rw_exit(&ipst->ips_ill_g_lock); - - return (i == illcnt ? NULL : retill); -} - -/* * Checks for availbility of a usable source address (if there is one) when the * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note * this selection is done regardless of the destination. @@ -20654,11 +16808,26 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) } /* - * Determine the best source address given a destination address and an ill. - * Prefers non-deprecated over deprecated but will return a deprecated - * address if there is no other choice. If there is a usable source address - * on the interface pointed to by ill_usesrc_ifindex then that is given - * first preference. + * IP source address type, sorted from worst to best. For a given type, + * always prefer IP addresses on the same subnet. All-zones addresses are + * suboptimal because they pose problems with unlabeled destinations. + */ +typedef enum { + IPIF_NONE, + IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ + IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ + IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ + IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ + IPIF_DIFFNET, /* normal and different subnet */ + IPIF_SAMENET /* normal and same subnet */ +} ipif_type_t; + +/* + * Pick the optimal ipif on `ill' for sending to destination `dst' from zone + * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t + * enumeration, and return the highest-rated ipif. If there's a tie, we pick + * the first one, unless IPMP is used in which case we round-robin among them; + * see below for more. * * Returns NULL if there is no suitable source address for the ill. * This only occurs when there is no valid source address for the ill. @@ -20666,17 +16835,13 @@ ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) ipif_t * ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) { - ipif_t *ipif; - ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ - ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; - int index = 0; - boolean_t wrapped = B_FALSE; - boolean_t same_subnet_only = B_FALSE; - boolean_t ipif_same_found, ipif_other_found; - boolean_t specific_found; - ill_t *till, *usill = NULL; + ill_t *usill = NULL; + ill_t *ipmp_ill = NULL; + ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; + ipif_type_t type, best_type; tsol_tpc_t *src_rhtp, *dst_rhtp; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t samenet; if (ill->ill_usesrc_ifindex != 0) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, @@ -20688,6 +16853,17 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* + * Test addresses should never be used for source address selection, + * so if we were passed one, switch to the IPMP meta-interface. 
+ */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) + ill = ipmp_ill; /* Select source from IPMP ill */ + else + return (NULL); + } + + /* * If we're dealing with an unlabeled destination on a labeled system, * make sure that we ignore source addresses that are incompatible with * the destination's default label. That destination's default label @@ -20705,7 +16881,7 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) } /* - * Holds the ill_g_lock as reader. This makes sure that no ipif/ill + * Hold the ill_g_lock as reader. This makes sure that no ipif/ill * can be deleted. But an ipif/ill can get CONDEMNED any time. * After selecting the right ipif, under ill_lock make sure ipif is * not condemned, and increment refcnt. If ipif is CONDEMNED, @@ -20713,190 +16889,117 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) * but not under a lock. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); - retry: - till = ill; - ipif_arr[0] = NULL; + /* + * For source address selection, we treat the ipif list as circular + * and continue until we get back to where we started. This allows + * IPMP to vary source address selection (which improves inbound load + * spreading) by caching its last ending point and starting from + * there. NOTE: we don't have to worry about ill_src_ipif changing + * ills since that can't happen on the IPMP ill. + */ + start_ipif = ill->ill_ipif; + if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) + start_ipif = ill->ill_src_ipif; - if (till->ill_group != NULL) - till = till->ill_group->illgrp_ill; + ipif = start_ipif; + best_ipif = NULL; + best_type = IPIF_NONE; + do { + if ((next_ipif = ipif->ipif_next) == NULL) + next_ipif = ill->ill_ipif; - /* - * Choose one good source address from each ill across the group. - * If possible choose a source address in the same subnet as - * the destination address. - * - * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE - * This is okay because of the following. - * - * If PHYI_FAILED is set and we still have non-deprecated - * addresses, it means the addresses have not yet been - * failed over to a different interface. We potentially - * select them to create IRE_CACHES, which will be later - * flushed when the addresses move over. - * - * If PHYI_INACTIVE is set and we still have non-deprecated - * addresses, it means either the user has configured them - * or PHYI_INACTIVE has not been cleared after the addresses - * been moved over. For the former, in.mpathd does a failover - * when the interface becomes INACTIVE and hence we should - * not find them. Once INACTIVE is set, we don't allow them - * to create logical interfaces anymore. For the latter, a - * flush will happen when INACTIVE is cleared which will - * flush the IRE_CACHES. - * - * If PHYI_OFFLINE is set, all the addresses will be failed - * over soon. We potentially select them to create IRE_CACHEs, - * which will be later flushed when the addresses move over. - * - * NOTE : As ipif_select_source is called to borrow source address - * for an ipif that is part of a group, source address selection - * will be re-done whenever the group changes i.e either an - * insertion/deletion in the group. - * - * Fill ipif_arr[] with source addresses, using these rules: - * - * 1. At most one source address from a given ill ends up - * in ipif_arr[] -- that is, at most one of the ipif's - * associated with a given ill ends up in ipif_arr[]. - * - * 2. 
If there is at least one non-deprecated ipif in the - * IPMP group with a source address on the same subnet as - * our destination, then fill ipif_arr[] only with - * source addresses on the same subnet as our destination. - * Note that because of (1), only the first - * non-deprecated ipif found with a source address - * matching the destination ends up in ipif_arr[]. - * - * 3. Otherwise, fill ipif_arr[] with non-deprecated source - * addresses not in the same subnet as our destination. - * Again, because of (1), only the first off-subnet source - * address will be chosen. - * - * 4. If there are no non-deprecated ipifs, then just use - * the source address associated with the last deprecated - * one we find that happens to be on the same subnet, - * otherwise the first one not in the same subnet. - */ - specific_found = B_FALSE; - for (; till != NULL; till = till->ill_group_next) { - ipif_same_found = B_FALSE; - ipif_other_found = B_FALSE; - for (ipif = till->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - /* Always skip NOLOCAL and ANYCAST interfaces */ - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP) || - !ipif->ipif_addr_ready) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Interfaces with 0.0.0.0 address are allowed to be UP, - * but are not valid as source addresses. - */ - if (ipif->ipif_lcl_addr == INADDR_ANY) - continue; + if (!IPIF_CAN_LOOKUP(ipif)) + continue; + /* Always skip NOLOCAL and ANYCAST interfaces */ + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; - /* - * Check compatibility of local address for - * destination's default label if we're on a labeled - * system. Incompatible addresses can't be used at - * all. - */ - if (dst_rhtp != NULL) { - boolean_t incompat; + /* + * Interfaces with 0.0.0.0 address are allowed to be UP, but + * are not valid as source addresses. + */ + if (ipif->ipif_lcl_addr == INADDR_ANY) + continue; - src_rhtp = find_tpc(&ipif->ipif_lcl_addr, - IPV4_VERSION, B_FALSE); - if (src_rhtp == NULL) - continue; - incompat = - src_rhtp->tpc_tp.host_type != SUN_CIPSO || - src_rhtp->tpc_tp.tp_doi != - dst_rhtp->tpc_tp.tp_doi || - (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, - &src_rhtp->tpc_tp.tp_sl_range_cipso) && - !blinlset(&dst_rhtp->tpc_tp.tp_def_label, - src_rhtp->tpc_tp.tp_sl_set_cipso)); - TPC_RELE(src_rhtp); - if (incompat) - continue; - } + /* + * Check compatibility of local address for destination's + * default label if we're on a labeled system. Incompatible + * addresses can't be used at all. + */ + if (dst_rhtp != NULL) { + boolean_t incompat; - /* - * We prefer not to use all all-zones addresses, if we - * can avoid it, as they pose problems with unlabeled - * destinations. 
- */ - if (ipif->ipif_zoneid != ALL_ZONES) { - if (!specific_found && - (!same_subnet_only || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet)) { - index = 0; - specific_found = B_TRUE; - ipif_other_found = B_FALSE; - } - } else { - if (specific_found) - continue; - } - if (ipif->ipif_flags & IPIF_DEPRECATED) { - if (ipif_dep == NULL || - (ipif->ipif_net_mask & dst) == - ipif->ipif_subnet) - ipif_dep = ipif; + src_rhtp = find_tpc(&ipif->ipif_lcl_addr, + IPV4_VERSION, B_FALSE); + if (src_rhtp == NULL) + continue; + incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || + src_rhtp->tpc_tp.tp_doi != + dst_rhtp->tpc_tp.tp_doi || + (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, + &src_rhtp->tpc_tp.tp_sl_range_cipso) && + !blinlset(&dst_rhtp->tpc_tp.tp_def_label, + src_rhtp->tpc_tp.tp_sl_set_cipso)); + TPC_RELE(src_rhtp); + if (incompat) continue; - } - if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { - /* found a source address in the same subnet */ - if (!same_subnet_only) { - same_subnet_only = B_TRUE; - index = 0; - } - ipif_same_found = B_TRUE; - } else { - if (same_subnet_only || ipif_other_found) - continue; - ipif_other_found = B_TRUE; - } - ipif_arr[index++] = ipif; - if (index == MAX_IPIF_SELECT_SOURCE) { - wrapped = B_TRUE; - index = 0; - } - if (ipif_same_found) - break; } - } - if (ipif_arr[0] == NULL) { - ipif = ipif_dep; - } else { - if (wrapped) - index = MAX_IPIF_SELECT_SOURCE; - ipif = ipif_arr[ipif_rand(ipst) % index]; - ASSERT(ipif != NULL); - } + samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif != NULL) { + if (ipif->ipif_flags & IPIF_DEPRECATED) { + type = samenet ? IPIF_SAMENET_DEPRECATED : + IPIF_DIFFNET_DEPRECATED; + } else if (ipif->ipif_zoneid == ALL_ZONES) { + type = samenet ? IPIF_SAMENET_ALLZONES : + IPIF_DIFFNET_ALLZONES; + } else { + type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; + } + + if (type > best_type) { + best_type = type; + best_ipif = ipif; + if (best_type == IPIF_SAMENET) + break; /* can't get better */ + } + } while ((ipif = next_ipif) != start_ipif); + + if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); if (!IPIF_CAN_LOOKUP(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } ipif_refhold_locked(ipif); + + /* + * For IPMP, update the source ipif rotor to the next ipif, + * provided we can look it up. (We must not use it if it's + * IPIF_CONDEMNED since we may have grabbed ill_g_lock after + * ipif_free() checked ill_src_ipif.) + */ + if (IS_IPMP(ill) && ipif != NULL) { + next_ipif = ipif->ipif_next; + if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + ill->ill_src_ipif = next_ipif; + else + ill->ill_src_ipif = NULL; + } mutex_exit(&ipif->ipif_ill->ill_lock); } rw_exit(&ipst->ips_ill_g_lock); if (usill != NULL) ill_refrele(usill); + if (ipmp_ill != NULL) + ill_refrele(ipmp_ill); if (dst_rhtp != NULL) TPC_RELE(dst_rhtp); @@ -20929,8 +17032,7 @@ retry: * ipif_update_other_ipifs calls us. * * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when illgrp_insert or ipif_up_done - * calls us. + * if needed. This happens when ipif_up_done calls us. */ static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) @@ -21064,49 +17166,31 @@ ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) /* * This old_ipif is going away. * - * Determine if any other ipif's is using our address as + * Determine if any other ipif's are using our address as * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or * IPIF_DEPRECATED). 
* Find the IRE_INTERFACE for such ipifs and recreate them * to use an different source address following the rules in * ipif_up_done. - * - * This function takes an illgrp as an argument so that illgrp_delete - * can call this to update source address even after deleting the - * old_ipif->ipif_ill from the ill group. */ static void -ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) +ipif_update_other_ipifs(ipif_t *old_ipif) { - ipif_t *ipif; - ill_t *ill; + ipif_t *ipif; + ill_t *ill; char buf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_IPIF(old_ipif)); - ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); ill = old_ipif->ipif_ill; - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, - buf, sizeof (buf)))); - /* - * If this part of a group, look at all ills as ipif_select_source - * borrows source address across all the ills in the group. - */ - if (illgrp != NULL) - ill = illgrp->illgrp_ill; - - for (; ill != NULL; ill = ill->ill_group_next) { - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - - if (ipif == old_ipif) - continue; + ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, + inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - ipif_recreate_interface_routes(old_ipif, ipif); - } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (ipif == old_ipif) + continue; + ipif_recreate_interface_routes(old_ipif, ipif); } } @@ -21117,8 +17201,7 @@ if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * operation in ipif_set_values, if we could not become * exclusive. If so restart it here. */ @@ -21171,6 +17254,48 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* + * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the + * minimum (but complete) set exist. This is necessary when adding or + * removing an interface to/from an IPMP group, since interfaces in an + * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever + * its test address subnets overlap with IPMP data addresses). It's also + * used to refresh the IRE_BROADCAST entries associated with the IPMP + * interface when the nominated broadcast interface changes. + */ +void +ill_refresh_bcast(ill_t *ill) +{ + ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ + ire_t **irep; + ipif_t *ipif; + + ASSERT(!ill->ill_isv6); + ASSERT(IAM_WRITER_ILL(ill)); + + /* + * Remove any old broadcast IREs. + */ + ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, + ill_broadcast_delete, ill, ill); + + /* + * Create new ones for any ipifs that are up and broadcast-capable. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != + (IPIF_UP|IPIF_BROADCAST)) + continue; + + irep = ipif_create_bcast_ires(ipif, ire_array); + while (irep-- > ire_array) { + (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); + if (*irep != NULL) + ire_refrele(*irep); + } + } +} + +/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in * `irep'. Returns a pointer to the next free `irep' entry (just like * ire_check_and_create_bcast()). 
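The ipif_select_source() rework above replaces the old ipif_arr[]/ill-group scan with a simple policy: rate every usable ipif with the ipif_type_t enumeration and remember a per-ill rotor (ill_src_ipif) so that the IPMP ill spreads successive selections across its addresses. The user-level C sketch below models only that policy; the cand_t structure, rate(), and pick_source() are illustrative stand-ins for the ipif fields and kernel locking, not code from this changeset.

#include <stdbool.h>
#include <stdint.h>

/* Mirrors the ipif_type_t ranking added above: larger is better. */
typedef enum {
	CAND_NONE,
	CAND_DIFFNET_DEPRECATED,
	CAND_SAMENET_DEPRECATED,
	CAND_DIFFNET_ALLZONES,
	CAND_SAMENET_ALLZONES,
	CAND_DIFFNET,
	CAND_SAMENET
} cand_type_t;

typedef struct {
	uint32_t c_addr;	/* local address, host byte order */
	uint32_t c_mask;	/* netmask */
	bool	 c_usable;	/* up, address ready, not NOLOCAL/ANYCAST */
	bool	 c_deprecated;
	bool	 c_allzones;
} cand_t;

/* Rate one candidate source address against destination `dst'. */
static cand_type_t
rate(const cand_t *c, uint32_t dst)
{
	bool samenet;

	if (!c->c_usable || c->c_addr == 0)
		return (CAND_NONE);

	samenet = ((dst & c->c_mask) == (c->c_addr & c->c_mask));
	if (c->c_deprecated)
		return (samenet ? CAND_SAMENET_DEPRECATED :
		    CAND_DIFFNET_DEPRECATED);
	if (c->c_allzones)
		return (samenet ? CAND_SAMENET_ALLZONES :
		    CAND_DIFFNET_ALLZONES);
	return (samenet ? CAND_SAMENET : CAND_DIFFNET);
}

/*
 * Walk the candidates as a ring starting at *rotor and return the index
 * of the highest-rated one (or -1).  Ties go to the earliest candidate
 * seen, and the walk stops early once a same-subnet, non-deprecated,
 * non-allzones address is found, much as the new do/while loop in
 * ipif_select_source() does.  Advancing the rotor past the winner is
 * what gives the IPMP case its round-robin source selection.
 */
static int
pick_source(const cand_t *cands, int ncands, uint32_t dst, int *rotor)
{
	int i, idx, best = -1;
	cand_type_t type, best_type = CAND_NONE;

	for (i = 0; i < ncands; i++) {
		idx = (*rotor + i) % ncands;
		type = rate(&cands[idx], dst);
		if (type > best_type) {
			best_type = type;
			best = idx;
			if (type == CAND_SAMENET)
				break;	/* can't get better */
		}
	}
	if (best != -1)
		*rotor = (best + 1) % ncands;
	return (best);
}
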
@@ -21433,10 +17558,33 @@ ipif_check_bcast_ires(ipif_t *test_ipif) /* * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. + * and recreate the IREs as necessary. Note that all interfaces in an + * IPMP illgrp share the same broadcast IREs, and thus the entire + * illgrp must be walked, starting with the IPMP meta-interface (so + * that broadcast IREs end up on it whenever possible). */ + if (IS_UNDER_IPMP(ill)) + ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { + ipmp_illgrp_t *illg = ill->ill_grp; + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + for (i = 0; i < BCAST_COUNT; i++) { + if (bireinfo[i].bi_willdie && + !bireinfo[i].bi_haverep) + break; + } + if (i == BCAST_COUNT) + break; + + irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + } + } + /* * Scan through the set of broadcast IREs and see if there are any * that we need to replace that have not yet been replaced. If so, @@ -21528,7 +17676,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * If there's another ill already with the requested name, ensure - * that it's of the same type. Otherwise, ill_phyint_reinit() will + * that it's of the same type. Otherwise, ill_phyint_reinit() will * fuse together two unrelated ills, which will cause chaos. */ ipst = ill->ill_ipst; @@ -21620,8 +17768,7 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { /* * ill_phyint_reinit merged the v4 and v6 into a single - * ipsq. Could also have become part of a ipmp group in the - * process, and we might not have been able to complete the + * ipsq. We might not have been able to complete the * slifname in ipif_set_values, if we could not become * exclusive. If so restart it here */ @@ -21665,85 +17812,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, return (ipif); } -typedef struct conn_change_s { - uint_t cc_old_ifindex; - uint_t cc_new_ifindex; -} conn_change_t; - -/* - * ipcl_walk function for changing interface index. - */ -static void -conn_change_ifindex(conn_t *connp, caddr_t arg) -{ - conn_change_t *connc; - uint_t old_ifindex; - uint_t new_ifindex; - int i; - ilg_t *ilg; - - connc = (conn_change_t *)arg; - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - if (connp->conn_orig_bound_ifindex == old_ifindex) - connp->conn_orig_bound_ifindex = new_ifindex; - - if (connp->conn_orig_multicast_ifindex == old_ifindex) - connp->conn_orig_multicast_ifindex = new_ifindex; - - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_orig_ifindex == old_ifindex) - ilg->ilg_orig_ifindex = new_ifindex; - } -} - -/* - * Walk all the ipifs and ilms on this ill and change the orig_ifindex - * to new_index if it matches the old_index. - * - * Failovers typically happen within a group of ills. But somebody - * can remove an ill from the group after a failover happened. If - * we are setting the ifindex after this, we potentially need to - * look at all the ills rather than just the ones in the group. - * We cut down the work by looking at matching ill_net_types - * and ill_types as we could not possibly grouped them together. 
- */ -static void -ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) -{ - ill_t *ill; - ipif_t *ipif; - uint_t old_ifindex; - uint_t new_ifindex; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill_orig->ill_ipst; - - old_ifindex = connc->cc_old_ifindex; - new_ifindex = connc->cc_new_ifindex; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if ((ill_orig->ill_net_type != ill->ill_net_type) || - (ill_orig->ill_type != ill->ill_type)) { - continue; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_orig_ifindex == old_ifindex) - ipif->ipif_orig_ifindex = new_ifindex; - } - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_orig_ifindex == old_ifindex) - ilm->ilm_orig_ifindex = new_ifindex; - } - } - rw_exit(&ipst->ips_ill_g_lock); -} - /* * We first need to ensure that the new index is unique, and * then carry the change across both v4 and v6 ill representation @@ -21755,13 +17823,10 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { ill_t *ill; - ill_t *ill_other; phyint_t *phyi; - int old_index; - conn_change_t connc; struct ifreq *ifr = (struct ifreq *)ifreq; struct lifreq *lifr = (struct lifreq *)ifreq; - uint_t index; + uint_t old_index, index; ill_t *ill_v4; ill_t *ill_v6; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -21773,31 +17838,15 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Only allow on physical interface. Also, index zero is illegal. - * - * Need to check for PHYI_FAILED and PHYI_INACTIVE - * - * 1) If PHYI_FAILED is set, a failover could have happened which - * implies a possible failback might have to happen. As failback - * depends on the old index, we should fail setting the index. - * - * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that - * any addresses or multicast memberships are failed over to - * a non-STANDBY interface. As failback depends on the old - * index, we should fail setting the index for this case also. - * - * 3) If PHYI_OFFLINE is set, a possible failover has happened. - * Be consistent with PHYI_FAILED and fail the ioctl. */ ill = ipif->ipif_ill; phyi = ill->ill_phyint; - if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || - ipif->ipif_id != 0 || index == 0) { + if (ipif->ipif_id != 0 || index == 0) { return (EINVAL); } - old_index = phyi->phyint_ifindex; /* If the index is not changing, no work to do */ - if (old_index == index) + if (phyi->phyint_ifindex == index) return (0); /* @@ -21816,31 +17865,17 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EBUSY); } - /* - * The new index is unused. Set it in the phyint. - * Locate the other ill so that we can send a routing - * sockets message. - */ - if (ill->ill_isv6) { - ill_other = phyi->phyint_illv4; - } else { - ill_other = phyi->phyint_illv6; - } - + /* The new index is unused. Set it in the phyint. 
*/ + old_index = phyi->phyint_ifindex; phyi->phyint_ifindex = index; /* Update SCTP's ILL list */ sctp_ill_reindex(ill, old_index); - connc.cc_old_ifindex = old_index; - connc.cc_new_ifindex = index; - ip_change_ifindex(ill, &connc); - ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); - /* Send the routing sockets message */ - ip_rts_ifmsg(ipif); - if (ill_other != NULL) - ip_rts_ifmsg(ill_other->ill_ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + if (ILL_OTHER(ill)) + ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); return (0); } @@ -22038,6 +18073,45 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, B_TRUE)); } +/* + * Return the number of addresses on `ill' with one or more of the values + * in `set' set and all of the values in `clear' clear. + */ +static uint_t +ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) +{ + ipif_t *ipif; + uint_t cnt = 0; + + ASSERT(IAM_WRITER_ILL(ill)); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) + cnt++; + + return (cnt); +} + +/* + * Return the number of migratable addresses on `ill' that are under + * application control. + */ +uint_t +ill_appaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, + IPIF_NOFAILOVER)); +} + +/* + * Return the number of point-to-point addresses on `ill'. + */ +uint_t +ill_ptpaddr_cnt(const ill_t *ill) +{ + return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); +} + /* ARGSUSED */ int ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -22158,7 +18232,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; - phyint_t *us_phyint, *us_cli_phyint; ipsq_t *ipsq = NULL; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; @@ -22167,19 +18240,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(CONN_Q(q)); isv6 = (Q_TO_CONN(q))->conn_af_isv6; - us_cli_phyint = usesrc_cli_ill->ill_phyint; - - ASSERT(us_cli_phyint != NULL); - - /* - * If the client ILL is being used for IPMP, abort. - * Note, this can be done before ipsq_try_enter since we are already - * exclusive on this ILL - */ - if ((us_cli_phyint->phyint_groupname != NULL) || - (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { - return (EINVAL); - } ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -22198,15 +18258,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (err); } - /* - * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP - * group nor can either of the interfaces be used for standy. So - * to guarantee mutual exclusion with ip_sioctl_flags (which sets - * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) - * we need to be exclusive on the ipsq belonging to the usesrc_ill. 
- * We are already exlusive on this ipsq i.e ipsq corresponding to - * the usesrc_cli_ill - */ ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, NEW_OP, B_TRUE); if (ipsq == NULL) { @@ -22215,11 +18266,19 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, goto done; } - /* Check if the usesrc_ill is used for IPMP */ - us_phyint = usesrc_ill->ill_phyint; - if ((us_phyint->phyint_groupname != NULL) || - (us_phyint->phyint_flags & PHYI_STANDBY)) { - err = EINVAL; + /* USESRC isn't currently supported with IPMP */ + if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { + err = ENOTSUP; + goto done; + } + + /* + * USESRC isn't compatible with the STANDBY flag. (STANDBY is only + * used by IPMP underlying interfaces, but someone might think it's + * more general and try to use it independently with VNI.) + */ + if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { + err = ENOTSUP; goto done; } @@ -22372,79 +18431,45 @@ ill_phyint_compare_name(const void *name_ptr, const void *phyip) return (-1); return (0); } + /* - * This function is called from ill_delete when the ill is being - * unplumbed. We remove the reference from the phyint and we also - * free the phyint when there are no more references to it. + * This function is called on the unplumb path via ill_glist_delete() when + * there are no ills left on the phyint and thus the phyint can be freed. */ static void -ill_phyint_free(ill_t *ill) +phyint_free(phyint_t *phyi) { - phyint_t *phyi; - phyint_t *next_phyint; - ipsq_t *cur_ipsq; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); - ASSERT(ill->ill_phyint != NULL); + ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); - ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); - phyi = ill->ill_phyint; - ill->ill_phyint = NULL; /* - * ill_init allocates a phyint always to store the copy - * of flags relevant to phyint. At that point in time, we could - * not assign the name and hence phyint_illv4/v6 could not be - * initialized. Later in ipif_set_values, we assign the name to - * the ill, at which point in time we assign phyint_illv4/v6. - * Thus we don't rely on phyint_illv6 to be initialized always. + * If this phyint was an IPMP meta-interface, blow away the group. + * This is safe to do because all of the illgrps have already been + * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. + * If we're cleaning up as a result of failed initialization, + * phyint_grp may be NULL. */ - if (ill->ill_flags & ILLF_IPV6) { - phyi->phyint_illv6 = NULL; - } else { - phyi->phyint_illv4 = NULL; + if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipmp_grp_destroy(phyi->phyint_grp); + phyi->phyint_grp = NULL; + rw_exit(&ipst->ips_ipmp_lock); } - /* - * ipif_down removes it from the group when the last ipif goes - * down. - */ - ASSERT(ill->ill_group == NULL); - - if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) - return; /* - * Make sure this phyint was put in the list. + * If this interface was under IPMP, take it out of the group. */ - if (phyi->phyint_ifindex > 0) { - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, - phyi); - avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, - phyi); - } + if (phyi->phyint_grp != NULL) + ipmp_phyint_leave_grp(phyi); + /* - * remove phyint from the ipsq list. + * Delete the phyint and disassociate its ipsq. The ipsq itself + * will be freed in ipsq_exit(). 
*/ - cur_ipsq = phyi->phyint_ipsq; - if (phyi == cur_ipsq->ipsq_phyint_list) { - cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; - } else { - next_phyint = cur_ipsq->ipsq_phyint_list; - while (next_phyint != NULL) { - if (next_phyint->phyint_ipsq_next == phyi) { - next_phyint->phyint_ipsq_next = - phyi->phyint_ipsq_next; - break; - } - next_phyint = next_phyint->phyint_ipsq_next; - } - ASSERT(next_phyint != NULL); - } - IPSQ_DEC_REF(cur_ipsq, ipst); + phyi->phyint_ipsq->ipsq_phyint = NULL; + phyi->phyint_name[0] = '\0'; - if (phyi->phyint_groupname_len != 0) { - ASSERT(phyi->phyint_groupname != NULL); - mi_free(phyi->phyint_groupname); - } mi_free(phyi); } @@ -22464,7 +18489,6 @@ ill_phyint_reinit(ill_t *ill) phyint_t *phyi; avl_index_t where = 0; ill_t *ill_other = NULL; - ipsq_t *ipsq; ip_stack_t *ipst = ill->ill_ipst; ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); @@ -22476,6 +18500,11 @@ ill_phyint_reinit(ill_t *ill) phyi_old->phyint_illv4 == NULL)); ASSERT(phyi_old->phyint_ifindex == 0); + /* + * Now that our ill has a name, set it in the phyint. + */ + (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, ill->ill_name, &where); @@ -22497,8 +18526,7 @@ ill_phyint_reinit(ill_t *ill) * we are initializing IPv4. */ if (phyi != NULL) { - ill_other = (isv6) ? phyi->phyint_illv4 : - phyi->phyint_illv6; + ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; ASSERT(ill_other->ill_phyint != NULL); ASSERT((isv6 && !ill_other->ill_isv6) || (!isv6 && ill_other->ill_isv6)); @@ -22517,26 +18545,15 @@ ill_phyint_reinit(ill_t *ill) ASSERT(phyi->phyint_illv4 == NULL); phyi->phyint_illv4 = ill; } - /* - * This is a new ill, currently undergoing SLIFNAME - * So we could not have joined an IPMP group until now. - */ - ASSERT(phyi_old->phyint_ipsq_next == NULL && - phyi_old->phyint_groupname == NULL); /* - * This phyi_old is going away. Decref ipsq_refs and - * assert it is zero. The ipsq itself will be freed in - * ipsq_exit + * Delete the old phyint and make its ipsq eligible + * to be freed in ipsq_exit(). */ - ipsq = phyi_old->phyint_ipsq; - IPSQ_DEC_REF(ipsq, ipst); - ASSERT(ipsq->ipsq_refs == 0); - /* Get the singleton phyint out of the ipsq list */ - ASSERT(phyi_old->phyint_ipsq_next == NULL); - ipsq->ipsq_phyint_list = NULL; phyi_old->phyint_illv4 = NULL; phyi_old->phyint_illv6 = NULL; + phyi_old->phyint_ipsq->ipsq_phyint = NULL; + phyi_old->phyint_name[0] = '\0'; mi_free(phyi_old); } else { mutex_enter(&ill->ill_lock); @@ -22551,9 +18568,6 @@ ill_phyint_reinit(ill_t *ill) if (!phyint_assign_ifindex(phyi, ipst)) cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); - /* No IPMP group yet, thus the hook uses the ifindex */ - phyi->phyint_hook_ifindex = phyi->phyint_ifindex; - avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, (void *)phyi, where); @@ -22571,13 +18585,6 @@ ill_phyint_reinit(ill_t *ill) ill->ill_phyint = phyi; /* - * Keep the index on ipif_orig_index to be used by FAILOVER. - * We do this here as when the first ipif was allocated, - * ipif_allocate does not know the right interface index. - */ - - ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; - /* * Now that the phyint's ifindex has been assigned, complete the * remaining */ @@ -22606,45 +18613,14 @@ ill_phyint_reinit(ill_t *ill) */ if (ill->ill_name_length <= 2 || ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { - /* - * Generate nic plumb event for ill_name even if - * ipmp_hook_emulation is set. 
That avoids generating events - * for the ill_names should ipmp_hook_emulation be turned on - * later. - */ - ill_nic_event_plumb(ill, B_FALSE); + ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, + ill->ill_name_length); } RELEASE_ILL_LOCKS(ill, ill_other); mutex_exit(&phyi->phyint_lock); } /* - * Allocate a NE_PLUMB nic info event and store in the ill. - * If 'group' is set we do it for the group name, otherwise the ill name. - * It will be sent when we leave the ipsq. - */ -void -ill_nic_event_plumb(ill_t *ill, boolean_t group) -{ - phyint_t *phyi = ill->ill_phyint; - char *name; - int namelen; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - if (group) { - ASSERT(phyi->phyint_groupname_len != 0); - namelen = phyi->phyint_groupname_len; - name = phyi->phyint_groupname; - } else { - namelen = ill->ill_name_length; - name = ill->ill_name; - } - - ill_nic_event_dispatch(ill, 0, NE_PLUMB, name, namelen); -} - -/* * Notify any downstream modules of the name of this interface. * An M_IOCTL is used even though we don't expect a successful reply. * Any reply message from the driver (presumably an M_IOCNAK) will @@ -22686,8 +18662,9 @@ ip_ifname_notify(ill_t *ill, queue_t *q) static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - int err; + int err; ip_stack_t *ipst = ill->ill_ipst; + phyint_t *phyi = ill->ill_phyint; /* Set the obsolete NDD per-interface forwarding name. */ err = ill_set_ndd_name(ill); @@ -22696,6 +18673,34 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) err); } + /* + * Now that ill_name is set, the configuration for the IPMP + * meta-interface can be performed. + */ + if (IS_IPMP(ill)) { + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + /* + * If phyi->phyint_grp is NULL, then this is the first IPMP + * meta-interface and we need to create the IPMP group. + */ + if (phyi->phyint_grp == NULL) { + /* + * If someone has renamed another IPMP group to have + * the same name as our interface, bail. + */ + if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (EEXIST); + } + phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); + if (phyi->phyint_grp == NULL) { + rw_exit(&ipst->ips_ipmp_lock); + return (ENOMEM); + } + } + rw_exit(&ipst->ips_ipmp_lock); + } + /* Tell downstream modules where they are. */ ip_ifname_notify(ill, q); @@ -22966,10 +18971,10 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) /* * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. */ - if (ipsq->ipsq_current_ipif == NULL) + if (ipsq->ipsq_xop->ipx_current_ipif == NULL) ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); else - ASSERT(ipsq->ipsq_current_ipif == ipif); + ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); error = ipif_set_values_tail(ill, ipif, mp, q); ipsq_exit(ipsq); @@ -22986,18 +18991,8 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) void ipif_init(ip_stack_t *ipst) { - hrtime_t hrt; int i; - /* - * Can't call drv_getparm here as it is too early in the boot. - * As we use ipif_src_random just for picking a different - * source address everytime, this need not be really random. 
- */ - hrt = gethrtime(); - ipst->ips_ipif_src_random = - ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); - for (i = 0; i < MAX_G_HEADS; i++) { ipst->ips_ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ipst->ips_ill_g_heads[i]; @@ -23023,7 +19018,11 @@ ipif_init(ip_stack_t *ipst) * match is found to take care of such rare network configurations like - * le0: 129.146.1.1/16 * le1: 129.146.2.2/24 - * It is used only by SO_DONTROUTE at the moment. + * + * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are + * supported on underlying interfaces in an IPMP group, underlying interfaces + * are ignored when looking up a match. (If we didn't ignore them, we'd + * risk using a test address as a source for outgoing traffic.) */ ipif_t * ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -23038,6 +19037,8 @@ ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (IS_UNDER_IPMP(ill)) + continue; mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -23660,30 +19661,76 @@ ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, * Knows about IEEE 802 and IEEE EUI-64 mappings. */ static boolean_t -ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != ETHERADDRL) + if (ill->ill_phys_addr_length != ETHERADDRL) return (B_FALSE); /* Form EUI-64 like address */ addr = (char *)&v6addr->s6_addr32[2]; - bcopy((char *)phys_addr, addr, 3); + bcopy(ill->ill_phys_addr, addr, 3); addr[0] ^= 0x2; /* Toggle Universal/Local bit */ addr[3] = (char)0xff; addr[4] = (char)0xfe; - bcopy((char *)phys_addr + 3, addr + 5, 3); + bcopy(ill->ill_phys_addr + 3, addr + 5, 3); return (B_TRUE); } /* ARGSUSED */ static boolean_t -ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) { return (B_FALSE); } +typedef struct ipmp_ifcookie { + uint32_t ic_hostid; + char ic_ifname[LIFNAMSIZ]; + char ic_zonename[ZONENAME_MAX]; +} ipmp_ifcookie_t; + +/* + * Construct a pseudo-random interface ID for the IPMP interface that's both + * predictable and (almost) guaranteed to be unique. + */ +static boolean_t +ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) +{ + zone_t *zp; + uint8_t *addr; + uchar_t hash[16]; + ulong_t hostid; + MD5_CTX ctx; + ipmp_ifcookie_t ic = { 0 }; + + ASSERT(IS_IPMP(ill)); + + (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); + ic.ic_hostid = htonl((uint32_t)hostid); + + (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); + + if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { + (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); + zone_rele(zp); + } + + MD5Init(&ctx); + MD5Update(&ctx, &ic, sizeof (ic)); + MD5Final(hash, &ctx); + + /* + * Map the hash to an interface ID per the basic approach in RFC3041. + */ + addr = &v6addr->s6_addr8[8]; + bcopy(hash + 8, addr, sizeof (uint64_t)); + addr[0] &= ~0x2; /* set local bit */ + + return (B_TRUE); +} + /* ARGSUSED */ static boolean_t ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, @@ -23739,14 +19786,14 @@ ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, * Derive IPoIB interface id from the link layer address. 
*/ static boolean_t -ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) +ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) { char *addr; - if (phys_length != 20) + if (ill->ill_phys_addr_length != 20) return (B_FALSE); addr = (char *)&v6addr->s6_addr32[2]; - bcopy(phys_addr + 12, addr, 8); + bcopy(ill->ill_phys_addr + 12, addr, 8); /* * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit * in the globally assigned EUI-64 GUID to 1, in violation of IEEE @@ -23863,6 +19910,7 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) *ipifp = NULL; return (B_FALSE); } + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) continue; @@ -23897,71 +19945,9 @@ ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) } /* - * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. - */ -boolean_t -ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) -{ - ill_t *illg; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We look at the passed-in ill first without grabbing ill_g_lock. - */ - if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { - return (B_TRUE); - } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group == NULL) { - /* ill not in a group */ - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); - } - - /* - * There's no ipif in the zone on ill, however ill is part of an IPMP - * group. We need to look for an ipif in the zone on all the ills in the - * group. - */ - illg = ill->ill_group->illgrp_ill; - do { - /* - * We don't call ipif_lookup_zoneid() on ill as we already know - * that it's not there. - */ - if (illg != ill && - ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { - break; - } - } while ((illg = illg->ill_group_next) != NULL); - rw_exit(&ipst->ips_ill_g_lock); - return (illg != NULL); -} - -/* - * Check if this ill is only being used to send ICMP probes for IPMP - */ -boolean_t -ill_is_probeonly(ill_t *ill) -{ - /* - * Check if the interface is FAILED, or INACTIVE - */ - if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) - return (B_TRUE); - - return (B_FALSE); -} - -/* * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) * If a pointer to an ipif_t is returned then the caller will need to do * an ill_refrele(). - * - * If there is no real interface which matches the ifindex, then it looks - * for a group that has a matching index. In the case of a group match the - * lifidx must be zero. We don't need emulate the logical interfaces - * since IP Filter's use of netinfo doesn't use that. */ ipif_t * ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, @@ -23972,18 +19958,8 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, ipst); - - if (ill == NULL) { - /* Fallback to group names only if hook_emulation set */ - if (!ipst->ips_ipmp_hook_emulation) - return (NULL); - - if (lifidx != 0) - return (NULL); - ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst); - if (ill == NULL) - return (NULL); - } + if (ill == NULL) + return (NULL); mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { @@ -24059,7 +20035,7 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * If we can quiesce the ill, then set the address. If not, then * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 
*/ - ill_down_ipifs(ill, NULL, 0, B_FALSE); + ill_down_ipifs(ill); mutex_enter(&ill->ill_lock); if (!ill_is_quiescent(ill)) { /* call cannot fail since `conn_t *' argument is NULL */ @@ -24283,10 +20259,7 @@ ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) goto fail; - if (event == NE_UNPLUMB) - info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; - else - info->hnei_event.hne_nic = ill->ill_phyint->phyint_hook_ifindex; + info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; info->hnei_event.hne_lif = lif; info->hnei_event.hne_event = event; info->hnei_event.hne_protocol = ill->ill_isv6 ? @@ -24323,8 +20296,8 @@ fail: void ipif_up_notify(ipif_t *ipif) { - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + ip_rts_ifmsg(ipif, RTSQ_DEFAULT); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); sctp_update_ipif(ipif, SCTP_IPIF_UP); ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), NE_LIF_UP, NULL, 0); diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 405cb653d5..52a7e74806 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> +#include <sys/strsun.h> #include <sys/ddi.h> #include <sys/cmn_err.h> #include <sys/policy.h> @@ -61,7 +62,6 @@ #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/sadb.h> -#include <sys/kmem.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -220,11 +220,6 @@ struct kmem_cache *rt_entry_cache; * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is * to be ignored when walking the ires using ire_next. * - * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the - * benefit of in.mpathd which needs to probe interfaces for failures. Normal - * applications should not be seeing this ire and hence this ire is ignored - * in most cases in the search using ire_next. - * * Zones note: * Walking IREs within a given zone also walks certain ires in other * zones. This is done intentionally. IRE walks with a specified @@ -1235,10 +1230,9 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) { irb_t *irb; boolean_t drop = B_FALSE; - /* LINTED : set but not used in function */ boolean_t mctl_present; mblk_t *first_mp = NULL; - mblk_t *save_mp = NULL; + mblk_t *data_mp = NULL; ire_t *dst_ire; ipha_t *ipha; ip6_t *ip6h; @@ -1258,27 +1252,16 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * we resolve an IPv6 address with an IPv4 ire * or vice versa. */ + EXTRACT_PKT_MP(mp, first_mp, mctl_present); + data_mp = mp; + mp = first_mp; if (ire->ire_ipversion == IPV4_VERSION) { - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - ipha = (ipha_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; - + ipha = (ipha_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup(ipha->ipha_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } else { ASSERT(ire->ire_ipversion == IPV6_VERSION); - /* - * Get a pointer to the beginning of the IPv6 header. - * Ignore leading IPsec control mblks. 
- */ - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ip6h = (ip6_t *)mp->b_rptr; - save_mp = mp; - mp = first_mp; + ip6h = (ip6_t *)data_mp->b_rptr; dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, ire->ire_zoneid, MBLK_GETLABEL(mp), ipst); } @@ -1330,10 +1313,8 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) * is over: we just drop the packet. */ if (ire->ire_flags & RTF_MULTIRT) { - if (save_mp) { - save_mp->b_prev = NULL; - save_mp->b_next = NULL; - } + data_mp->b_prev = NULL; + data_mp->b_next = NULL; MULTIRT_DEBUG_UNTAG(mp); freemsg(mp); } else { @@ -1355,9 +1336,31 @@ ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) (CONN_Q(q) ? Q_TO_CONN(q) : NULL), ire->ire_zoneid, ipst); } else { + int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; + ASSERT(ire->ire_ipversion == IPV6_VERSION); - ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL, - NULL, ire->ire_zoneid, ipst); + + /* + * If necessary, skip over the ip6i_t to find + * the header with the actual source address. + */ + if (ip6h->ip6_nxt == IPPROTO_RAW) { + if (MBLKL(data_mp) < minlen && + pullupmsg(data_mp, -1) == 0) { + ip1dbg(("ire_add_then_send: " + "cannot pullupmsg ip6i\n")); + if (mctl_present) + freeb(first_mp); + ire_refrele(ire); + return; + } + ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); + ip6h = (ip6_t *)(data_mp->b_rptr + + sizeof (ip6i_t)); + } + ip_newroute_v6(q, mp, &ip6h->ip6_dst, + &ip6h->ip6_src, NULL, ire->ire_zoneid, + ipst); } } @@ -1680,7 +1683,9 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, { ire_t *ire; uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + boolean_t prefer; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; /* * No broadcast IREs for the LOOPBACK interface @@ -1690,21 +1695,26 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, (ipif->ipif_flags & IPIF_NOXMIT)) return (irep); - /* If this would be a duplicate, don't bother. */ + /* + * If this new IRE would be a duplicate, only prefer it if one of + * the following is true: + * + * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST + * set and the new one has all of those clear. + * + * 2. The existing one corresponds to an underlying ILL in an IPMP + * group and the new one corresponds to an IPMP group interface. + */ if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { - /* - * We look for non-deprecated (and non-anycast, non-nolocal) - * ipifs as the best choice. ipifs with check_flags matching - * (deprecated, etc) are used only if non-deprecated ipifs - * are not available. if the existing ire's ipif is deprecated - * and the new ipif is non-deprecated, switch to the new ipif - */ - if ((!(ire->ire_ipif->ipif_flags & check_flags)) || - (ipif->ipif_flags & check_flags)) { + prefer = ((ire->ire_ipif->ipif_flags & check_flags) && + !(ipif->ipif_flags & check_flags)) || + (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); + if (!prefer) { ire_refrele(ire); return (irep); } + /* * Bcast ires exist in pairs. Both have to be deleted, * Since we are exclusive we can make the above assertion. 
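The duplicate check rewritten just above compresses the old deprecated-ipif juggling into a single `prefer' expression: a new broadcast IRE displaces an existing one only if the existing owner carries one of the check_flags (IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST) while the new one carries none, or if the existing owner is an IPMP underlying ill and the new one is the IPMP meta-interface. A stand-alone restatement of that predicate, with illustrative flag and structure names rather than the real ipif/ill types, might look like this:

#include <stdbool.h>
#include <stdint.h>

/* Stand-ins for IPIF_DEPRECATED, IPIF_NOLOCAL and IPIF_ANYCAST. */
#define	F_DEPRECATED	0x1
#define	F_NOLOCAL	0x2
#define	F_ANYCAST	0x4
#define	F_CHECKFLAGS	(F_DEPRECATED | F_NOLOCAL | F_ANYCAST)

typedef struct {
	uint32_t bo_flags;	/* check flags on the owning ipif */
	bool	 bo_under_ipmp;	/* owner is an IPMP underlying ill */
	bool	 bo_is_ipmp;	/* owner is the IPMP meta-interface */
} bcast_owner_t;

/*
 * Return true if the would-be duplicate `cand' should replace the
 * broadcast IRE currently owned by `cur':
 *
 *   1. `cur' has one of the check flags set and `cand' has them all
 *      clear, or
 *   2. `cur' is an underlying ill in an IPMP group and `cand' is the
 *      IPMP meta-interface.
 */
static bool
prefer_new_bcast(const bcast_owner_t *cur, const bcast_owner_t *cand)
{
	if ((cur->bo_flags & F_CHECKFLAGS) != 0 &&
	    (cand->bo_flags & F_CHECKFLAGS) == 0)
		return (true);

	return (cur->bo_under_ipmp && cand->bo_is_ipmp);
}

When prefer_new_bcast() would return false, the new IRE is simply not created; when it would return true, the existing broadcast IRE pair is deleted and a fresh one is created for the preferred ipif, which is what the ire_delete()/ire_create_bcast() calls around this hunk do.
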
@@ -1716,10 +1726,7 @@ ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, ire_delete(ire); ire_refrele(ire); } - - irep = ire_create_bcast(ipif, addr, irep); - - return (irep); + return (ire_create_bcast(ipif, addr, irep)); } uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; @@ -1733,6 +1740,22 @@ ire_t ** ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) { ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ill_t *ill = ipif->ipif_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (IS_IPMP(ill)) { + /* + * Broadcast IREs for the IPMP meta-interface use the + * nominated broadcast interface to send and receive packets. + * If there's no nominated interface, send the packets down to + * the IPMP stub driver, which will discard them. If the + * nominated broadcast interface changes, ill_refresh_bcast() + * will refresh the broadcast IREs. + */ + if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + ill = ipif->ipif_ill; + } *irep++ = ire_create( (uchar_t *)&addr, /* dest addr */ @@ -1741,8 +1764,8 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ipif->ipif_mtu, /* max frag */ NULL, /* no src nce */ - ipif->ipif_rq, /* recv-from queue */ - ipif->ipif_wq, /* send-to queue */ + ill->ill_rq, /* recv-from queue */ + ill->ill_wq, /* send-to queue */ IRE_BROADCAST, ipif, 0, @@ -1761,7 +1784,7 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) NULL, /* no gateway */ &ip_loopback_mtu, /* max frag size */ NULL, /* no src_nce */ - ipif->ipif_rq, /* recv-from queue */ + ill->ill_rq, /* recv-from queue */ NULL, /* no send-to queue */ IRE_BROADCAST, /* Needed for fanout in wput */ ipif, @@ -2049,32 +2072,23 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, { ill_t *ire_stq_ill = NULL; ill_t *ire_ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; ASSERT(match_flags != 0 || zoneid != ALL_ZONES); /* - * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill - * pointed by ire_stq and ire_ipif. Only in the case of - * IRE_CACHEs can ire_stq and ire_ipif be pointing to - * different ills. But we want to keep this function generic - * enough for future use. So, we always try to match on both. - * The only caller of this function ire_walk_ill_tables, will - * call "func" after we return from this function. We expect - * "func" to do the right filtering of ires in this case. - * - * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups - * pointed by ire_stq and ire_ipif should always be the same. - * So, we just match on only one of them. + * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and + * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and + * ire_ipif be pointing to different ills. But we want to keep + * this function generic enough for future use. So, we always + * try to match on both. The only caller of this function + * ire_walk_ill_tables, will call "func" after we return from + * this function. We expect "func" to do the right filtering + * of ires in this case. 
*/ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { if (ire->ire_stq != NULL) - ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; + ire_stq_ill = ire->ire_stq->q_ptr; if (ire->ire_ipif != NULL) ire_ipif_ill = ire->ire_ipif->ipif_ill; - if (ire_stq_ill != NULL) - ire_ill_group = ire_stq_ill->ill_group; - if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL)) - ire_ill_group = ire_ipif_ill->ill_group; } if (zoneid != ALL_ZONES) { @@ -2115,7 +2129,7 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ipif_t *src_ipif; src_ipif = ipif_select_source_v6(ire_stq_ill, - &ire->ire_addr_v6, RESTRICT_TO_NONE, + &ire->ire_addr_v6, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); if (src_ipif != NULL) { @@ -2143,9 +2157,9 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ire_t *rire; ire_match_flags |= MATCH_IRE_TYPE; - if (ire->ire_ipif != NULL) { - ire_match_flags |= MATCH_IRE_ILL_GROUP; - } + if (ire->ire_ipif != NULL) + ire_match_flags |= MATCH_IRE_ILL; + if (ire->ire_ipversion == IPV4_VERSION) { rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, @@ -2169,11 +2183,8 @@ ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, if (((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & ire_type)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_stq_ill == ill || ire_ipif_ill == ill)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_stq_ill == ill) || (ire_ipif_ill == ill) || - (ire_ill_group != NULL && - ire_ill_group == ill->ill_group))) { + (ire_stq_ill == ill || ire_ipif_ill == ill || + ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { return (B_TRUE); } return (B_FALSE); @@ -2221,8 +2232,7 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, boolean_t ret; struct rtfuncarg rtfarg; - ASSERT((!(match_flags & (MATCH_IRE_ILL | - MATCH_IRE_ILL_GROUP))) || (ill != NULL)); + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); /* * Optimize by not looking at the forwarding table if there @@ -2399,32 +2409,26 @@ ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, } /* - * IPMP flag settings happen without taking the exclusive route - * in ip_sioctl_flags. So we need to make an atomic check here - * for FAILED/OFFLINE/INACTIVE flags or if it has hit the - * FAILBACK=no case. + * Don't allow IRE's to be created on changing ill's. Also, since + * IPMP flags can be set on an ill without quiescing it, if we're not + * a writer on stq_ill, check that the flags still allow IRE creation. */ if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { if (stq_ill->ill_state_flags & ILL_CHANGING) { ill = stq_ill; error = EAGAIN; - } else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - error = EINVAL; + } else if (IS_UNDER_IPMP(stq_ill)) { + mutex_enter(&stq_ill->ill_phyint->phyint_lock); + if (!ipmp_ill_is_active(stq_ill) && + !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { + error = EINVAL; + } + mutex_exit(&stq_ill->ill_phyint->phyint_lock); } - goto done; + if (error != 0) + goto done; } - /* - * We don't check for OFFLINE/FAILED in this case because - * the source address selection logic (ipif_select_source) - * may still select a source address from such an ill. The - * assumption is that these addresses will be moved by in.mpathd - * soon. (i.e. this is a race). 
However link local addresses - * will not move and hence ipif_select_source_v6 tries to avoid - * FAILED ills. Please see ipif_select_source_v6 for more info - */ if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && (ipif_ill->ill_state_flags & ILL_CHANGING)) { ill = ipif_ill; @@ -2444,8 +2448,10 @@ done: if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); ire_atomic_end(irb_ptr, ire); ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); mutex_exit(&ipsq->ipsq_lock); error = EINPROGRESS; } else if (error != 0) { @@ -2502,39 +2508,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, ire = ire1; } if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - - if (ire->ire_type == IRE_CACHE) { - /* - * If this interface is FAILED, or INACTIVE or has hit - * the FAILBACK=no case, we create IRE_CACHES marked - * HIDDEN for some special cases e.g. bind to - * IPIF_NOFAILOVER address etc. So, if this interface - * is FAILED/INACTIVE/hit FAILBACK=no case, and we are - * not creating hidden ires, we should not allow that. - * This happens because the state of the interface - * changed while we were waiting in ARP. If this is the - * daemon sending probes, the next probe will create - * HIDDEN ires and we will create an ire then. This - * cannot happen with NDP currently because IRE is - * never queued in NDP. But it can happen in the - * future when we have external resolvers with IPv6. - * If the interface gets marked with OFFLINE while we - * are waiting in ARP, don't add the ire. - */ - if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) || - (ill_is_probeonly(stq_ill) && - !(ire->ire_marks & IRE_MARK_HIDDEN))) { - /* - * We don't know whether it is a valid ipif or not. - * unless we do the check below. So, set it to NULL. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } - } + stq_ill = ire->ire_stq->q_ptr; if (stq_ill != NULL && ire->ire_type == IRE_CACHE && stq_ill->ill_net_type == IRE_IF_RESOLVER) { @@ -2573,12 +2547,12 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, rw_exit(&ipst->ips_ill_g_lock); if (ipif == NULL || (ipif->ipif_isv6 && + !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, &ipif->ipif_v6src_addr)) || (!ipif->ipif_isv6 && ire->ire_src_addr != ipif->ipif_src_addr) || ire->ire_zoneid != ipif->ipif_zoneid) { - if (ipif != NULL) ipif_refrele(ipif); ire->ire_ipif = NULL; @@ -2587,20 +2561,7 @@ ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, return (EINVAL); } - ASSERT(ill != NULL); - /* - * If this group was dismantled while this packets was - * queued in ARP, don't add it here. 
- */ - if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) { - /* We don't want ire_inactive bump stats for this */ - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } /* * Since we didn't attach label security attributes to the @@ -2677,6 +2638,16 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, boolean_t need_refrele = B_FALSE; nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; + uint_t marks = 0; + + /* + * IREs with source addresses hosted on interfaces that are under IPMP + * should be hidden so that applications don't accidentally end up + * sending packets with test addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. + */ + if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) + marks |= IRE_MARK_TESTHIDDEN; if (ire->ire_ipif != NULL) ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); @@ -2691,10 +2662,15 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, case IRE_HOST: ire->ire_mask = IP_HOST_MASK; ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_CACHE: + ire->ire_mask = IP_HOST_MASK; + ire->ire_masklen = IP_ABITS; + ire->ire_marks |= marks; + break; case IRE_BROADCAST: case IRE_LOCAL: case IRE_LOOPBACK: @@ -2702,15 +2678,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, ire->ire_masklen = IP_ABITS; break; case IRE_PREFIX: - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; case IRE_DEFAULT: + ire->ire_marks |= marks; if ((ire->ire_flags & RTF_SETSRC) == 0) ire->ire_src_addr = 0; break; case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: + ire->ire_marks |= marks; break; default: ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", @@ -2796,19 +2771,13 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, */ flags |= MATCH_IRE_IPIF; /* - * If we are creating hidden ires, make sure we search on - * this ill (MATCH_IRE_ILL) and a hidden ire, - * while we are searching for duplicates below. Otherwise we - * could potentially find an IRE on some other interface - * and it may not be a IRE marked with IRE_MARK_HIDDEN. We - * shouldn't do this as this will lead to an infinite loop - * (if we get to ip_wput again) eventually we need an hidden - * ire for this packet to go out. MATCH_IRE_ILL is explicitly - * done below. + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. */ - if (ire->ire_type == IRE_CACHE && - (ire->ire_marks & IRE_MARK_HIDDEN)) - flags |= (MATCH_IRE_MARK_HIDDEN); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) + flags |= MATCH_IRE_MARK_TESTHIDDEN; } if ((ire->ire_type & IRE_CACHETABLE) == 0) { irb_ptr = ire_get_bucket(ire); @@ -2927,7 +2896,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. 
*/ - ip1dbg(("found dup ire existing %p new %p", + ip1dbg(("found dup ire existing %p new %p\n", (void *)ire1, (void *)ire)); IRE_REFHOLD(ire1); ire_atomic_end(irb_ptr, ire); @@ -2948,6 +2917,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, return (0); } } + if (ire->ire_type & IRE_CACHE) { ASSERT(ire->ire_stq != NULL); nce = ndp_lookup_v4(ire_to_ill(ire), @@ -2999,17 +2969,9 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, } /* * Make it easy for ip_wput_ire() to hit multiple broadcast ires by - * grouping identical addresses together on the hash chain. We also - * don't want to send multiple copies out if there are two ills part - * of the same group. Thus we group the ires with same addr and same - * ill group together so that ip_wput_ire can easily skip all the - * ires with same addr and same group after sending the first copy. - * We do this only for IRE_BROADCASTs as ip_wput_ire is currently - * interested in such groupings only for broadcasts. - * - * NOTE : If the interfaces are brought up first and then grouped, - * illgrp_insert will handle it. We come here when the interfaces - * are already in group and we are bringing them UP. + * grouping identical addresses together on the hash chain. We do + * this only for IRE_BROADCASTs as ip_wput_ire is currently interested + * in such groupings only for broadcasts. * * Find the first entry that matches ire_addr. *irep will be null * if no match. @@ -3023,29 +2985,7 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { /* * We found some ire (i.e *irep) with a matching addr. We - * want to group ires with same addr and same ill group - * together. - * - * First get to the entry that matches our address and - * ill group i.e stop as soon as we find the first ire - * matching the ill group and address. If there is only - * an address match, we should walk and look for some - * group match. These are some of the possible scenarios : - * - * 1) There are no groups at all i.e all ire's ill_group - * are NULL. In that case we will essentially group - * all the ires with the same addr together. Same as - * the "else" block of this "if". - * - * 2) There are some groups and this ire's ill_group is - * NULL. In this case, we will first find the group - * that matches the address and a NULL group. Then - * we will insert the ire at the end of that group. - * - * 3) There are some groups and this ires's ill_group is - * non-NULL. In this case we will first find the group - * that matches the address and the ill_group. Then - * we will insert the ire at the end of that group. + * want to group ires with same addr. */ for (;;) { ire1 = *irep; @@ -3053,8 +2993,8 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, (ire1->ire_next->ire_addr != ire->ire_addr) || (ire1->ire_type != IRE_BROADCAST) || (ire1->ire_flags & RTF_MULTIRT) || - (ire1->ire_ipif->ipif_ill->ill_group == - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_ipif->ipif_ill->ill_grp == + ire->ire_ipif->ipif_ill->ill_grp)) break; irep = &ire1->ire_next; } @@ -3071,18 +3011,14 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, /* * Either we have hit the end of the list or the address - * did not match or the group *matched*. If we found - * a match on the group, skip to the end of the group. + * did not match. 
*/ while (*irep != NULL) { ire1 = *irep; if ((ire1->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST) || - (ire1->ire_ipif->ipif_ill->ill_group != - ire->ire_ipif->ipif_ill->ill_group)) + (ire1->ire_type != IRE_BROADCAST)) break; - if (ire1->ire_ipif->ipif_ill->ill_group == NULL && - ire1->ire_ipif == ire->ire_ipif) { + if (ire1->ire_ipif == ire->ire_ipif) { irep = &ire1->ire_next; break; } @@ -3611,15 +3547,14 @@ ire_inactive(ire_t *ire) * The ipif that is associated with an ire is ire->ire_ipif and * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call * ipif_ill_refrele_tail. Usually stq_ill is null or the same as - * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only - * in the case of IRE_CACHES when IPMP is used, stq_ill can be - * different. If this is different from ire->ire_ipif->ipif_ill and - * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call + * ire->ire_ipif->ipif_ill. So nothing more needs to be done. + * However, for VNI or IPMP IRE entries, stq_ill can be different. + * If this is different from ire->ire_ipif->ipif_ill and if the + * ill_ire_cnt on the stq_ill also has dropped to zero, we call * ipif_ill_refrele_tail on the stq_ill. */ - if (ire->ire_stq != NULL) - stq_ill = (ill_t *)ire->ire_stq->q_ptr; + stq_ill = ire->ire_stq->q_ptr; if (stq_ill == NULL || stq_ill == ill) { /* Optimize the most common case */ @@ -3881,26 +3816,27 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, { ill_t *ire_ill = NULL, *dst_ill; ill_t *ipif_ill = NULL; - ill_group_t *ire_ill_group = NULL; - ill_group_t *ipif_ill_group = NULL; ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); - ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) || + ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ipif != NULL && !ipif->ipif_isv6)); ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); /* - * HIDDEN cache entries have to be looked up specifically with - * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set - * when the interface is FAILED or INACTIVE. In that case, - * any IRE_CACHES that exists should be marked with - * IRE_MARK_HIDDEN. So, we don't really need to match below - * for IRE_MARK_HIDDEN. But we do so for consistency. + * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. One + * exception: if the caller passed MATCH_IRE_IHANDLE, then they + * already know the identity of the given IRE_INTERFACE entry and + * there's no point trying to hide it from them. */ - if (!(match_flags & MATCH_IRE_MARK_HIDDEN) && - (ire->ire_marks & IRE_MARK_HIDDEN)) - return (B_FALSE); + if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { + if (match_flags & MATCH_IRE_IHANDLE) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + + if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + return (B_FALSE); + } /* * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option @@ -3994,19 +3930,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, } /* - * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that - * somebody wants to send out on a particular interface which - * is given by ire_stq and hence use ire_stq to derive the ill - * value. ire_ipif for IRE_CACHES is just the means of getting - * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr. - * ire_to_ill does the right thing for this. 
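The broadcast grouping described above can be shown with a small compilable sketch (toy types, no locking): when inserting an entry, find the run of entries that already share its address and append to the end of that run, so a sender can transmit one copy and then skip the remainder of the run in a single pass.

#include <stddef.h>

typedef struct toy_bcast {
        unsigned int            bc_addr;
        struct toy_bcast        *bc_next;
} toy_bcast_t;

/*
 * Keep entries with the same address adjacent on the chain: if a run of
 * matching entries already exists, append the new entry to the end of
 * that run; otherwise insert it at the head.
 */
static void
toy_bcast_insert(toy_bcast_t **head, toy_bcast_t *new)
{
        toy_bcast_t **irep = head;

        while (*irep != NULL && (*irep)->bc_addr != new->bc_addr)
                irep = &(*irep)->bc_next;

        if (*irep == NULL) {
                irep = head;                            /* no existing run */
        } else {
                while (*irep != NULL && (*irep)->bc_addr == new->bc_addr)
                        irep = &(*irep)->bc_next;       /* end of the run */
        }
        new->bc_next = *irep;
        *irep = new;
}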
+ * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to + * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means + * of getting a source address -- i.e., ire_src_addr == + * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. + * + * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. + * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for + * IPMP test traffic), then the ill must match exactly. */ - if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) { + if (match_flags & MATCH_IRE_ILL) { ire_ill = ire_to_ill(ire); - if (ire_ill != NULL) - ire_ill_group = ire_ill->ill_group; ipif_ill = ipif->ipif_ill; - ipif_ill_group = ipif_ill->ill_group; } if ((ire->ire_addr == (addr & mask)) && @@ -4018,24 +3953,21 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, (ire->ire_src_addr == ipif->ipif_src_addr)) && ((!(match_flags & MATCH_IRE_IPIF)) || (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_HIDDEN)) && + ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || + (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || (ire->ire_type != IRE_CACHE || ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill)) && ((!(match_flags & MATCH_IRE_WQ)) || (ire->ire_stq == wq)) && + ((!(match_flags & MATCH_IRE_ILL)) || + (ire_ill == ipif_ill || + (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && + ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && ((!(match_flags & MATCH_IRE_IHANDLE)) || (ire->ire_ihandle == ihandle)) && ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && - ((!(match_flags & MATCH_IRE_ILL_GROUP)) || - (ire_ill == ipif_ill) || - (ire_ill_group != NULL && - ire_ill_group == ipif_ill_group)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -4060,8 +3992,7 @@ ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * ire_match_args() will dereference ipif MATCH_IRE_SRC or * MATCH_IRE_ILL is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && - (ipif == NULL)) + if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) return (NULL); /* @@ -4142,14 +4073,15 @@ ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, /* * Check whether the IRE_LOCAL and the IRE potentially used to transmit - * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of - * the same ill group. + * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical + * or part of the same illgrp. (In the IPMP case, usually the two IREs + * will both belong to the IPMP ill, but exceptions are possible -- e.g. + * if IPMP test addresses are on their own subnet.) 
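A rough sketch of the ill-matching rule in the NOTE above, with toy types and only the two relevant flags (the real check is folded into a much larger condition): an exact interface match always satisfies MATCH_IRE_ILL, while a same-IPMP-group match is accepted only when the caller is not asking for test-hidden IREs.

typedef struct toy_ill {
        int     ill_grp_id;     /* 0 means not in any IPMP group */
} toy_ill_t;

#define TOY_MATCH_ILL           0x1
#define TOY_MATCH_TESTHIDDEN    0x2

static int
toy_same_illgrp(const toy_ill_t *a, const toy_ill_t *b)
{
        return (a->ill_grp_id != 0 && a->ill_grp_id == b->ill_grp_id);
}

/*
 * Does `ire_ill' satisfy a MATCH_IRE_ILL-style request against
 * `ipif_ill'?  Test-hidden (IPMP test traffic) lookups must match the
 * exact ill; anything else may match any ill in the same group.
 */
static int
toy_ill_match(const toy_ill_t *ire_ill, const toy_ill_t *ipif_ill, int flags)
{
        if (!(flags & TOY_MATCH_ILL))
                return (1);
        if (ire_ill == ipif_ill)
                return (1);
        return (!(flags & TOY_MATCH_TESTHIDDEN) &&
            toy_same_illgrp(ire_ill, ipif_ill));
}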
*/ boolean_t -ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) +ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) { - ill_t *recv_ill, *xmit_ill; - ill_group_t *recv_group, *xmit_group; + ill_t *recv_ill, *xmit_ill; ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); @@ -4160,20 +4092,11 @@ ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire) ASSERT(recv_ill != NULL); ASSERT(xmit_ill != NULL); - if (recv_ill == xmit_ill) - return (B_TRUE); - - recv_group = recv_ill->ill_group; - xmit_group = xmit_ill->ill_group; - - if (recv_group != NULL && recv_group == xmit_group) - return (B_TRUE); - - return (B_FALSE); + return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); } /* - * Check if the IRE_LOCAL uses the same ill (group) as another route would use. + * Check if the IRE_LOCAL uses the same ill as another route would use. * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, * then we don't allow this IRE_LOCAL to be used. */ @@ -4183,17 +4106,16 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, { ire_t *alt_ire; boolean_t rval; + int flags; + + flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; if (ire_local->ire_ipversion == IPV4_VERSION) { alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, - NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + NULL, zoneid, 0, tsl, flags, ipst); } else { - alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); + alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, + NULL, zoneid, 0, tsl, flags, ipst); } if (alt_ire == NULL) @@ -4203,16 +4125,14 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, ire_refrele(alt_ire); return (B_FALSE); } - rval = ire_local_same_ill_group(ire_local, alt_ire); + rval = ire_local_same_lan(ire_local, alt_ire); ire_refrele(alt_ire); return (rval); } /* - * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers - * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get - * to the hidden ones. + * Lookup cache * * In general the zoneid has to match (where ALL_ZONES match all of them). * But for IRE_LOCAL we also need to handle the case where L2 should @@ -4220,8 +4140,7 @@ ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, * Ethernet drivers nor Ethernet hardware loops back packets sent to their * own MAC address. This loopback is needed when the normal * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill (or ill group) as the ill with which this IRE_LOCAL is - * associated. + * the same ill as the ill with which this IRE_LOCAL is associated. * * Earlier versions of this code always matched an IRE_LOCAL independently of * the zoneid. 
We preserve that earlier behavior when @@ -4239,7 +4158,7 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == addr) { @@ -4284,7 +4203,7 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) ire_t *ire; /* - * Lets look for an ire in the cachetable whose + * Look for an ire in the cachetable whose * ire_addr matches the destination. * Since we are being called by forwarding fastpath * no need to check for Trusted Solaris label. @@ -4293,8 +4212,8 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) dst, ipst->ips_ip_cache_table_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | + IRE_MARK_PRIVATE_ADDR)) { continue; } if (ire->ire_addr == dst) { @@ -4307,7 +4226,6 @@ ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) return (NULL); } - /* * Locate the interface ire that is tied to the cache ire 'cire' via * cire->ire_ihandle. @@ -4333,13 +4251,8 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) * because the ihandle refers to an ipif which can be in only one zone. */ match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only - * for on-link hosts. We should never be here for onlink. - * Thus, use MATCH_IRE_ILL_GROUP. - */ if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; /* * We know that the mask of the interface ire equals cire->ire_cmask. * (When ip_newroute() created 'cire' for the gateway it set its @@ -4376,7 +4289,7 @@ ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) */ match_flags = MATCH_IRE_TYPE; if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL_GROUP; + match_flags |= MATCH_IRE_ILL; ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); if (ire == NULL) @@ -4411,7 +4324,16 @@ ire_t * ipif_to_ire(const ipif_t *ipif) { ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; + + /* + * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN + * so that they aren't accidentally returned. However, if the + * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + match_flags |= MATCH_IRE_MARK_TESTHIDDEN; ASSERT(!ipif->ipif_isv6); if (ipif->ipif_ire_type == IRE_LOOPBACK) { @@ -4421,13 +4343,12 @@ ipif_to_ire(const ipif_t *ipif) } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { /* In this case we need to lookup destination address. 
*/ ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst); + IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, + ipst); } else { ire = ire_ftable_lookup(ipif->ipif_subnet, ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF | - MATCH_IRE_MASK), ipst); + ALL_ZONES, 0, NULL, match_flags, ipst); } return (ire); } @@ -4811,7 +4732,7 @@ ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) continue; if (cire->ire_addr != dst) continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) + if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) continue; unres_cnt--; } @@ -4983,7 +4904,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5186,7 +5107,7 @@ ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, continue; if (cire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_HIDDEN)) + IRE_MARK_TESTHIDDEN)) continue; if (cire->ire_gw_secattr != NULL && @@ -5401,7 +5322,7 @@ ire_trace_cleanup(const ire_t *ire) * invoked when the mblk containing fake_ire is freed. */ void -ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) +ire_arpresolve(ire_t *in_ire) { areq_t *areq; ipaddr_t *addrp; @@ -5409,8 +5330,13 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire_t *ire, *buf; size_t bufsize; frtn_t *frtnp; - ill_t *ill; - ip_stack_t *ipst = dst_ill->ill_ipst; + ill_t *dst_ill; + ip_stack_t *ipst; + + ASSERT(in_ire->ire_nce != NULL); + + dst_ill = ire_to_ill(in_ire); + ipst = dst_ill->ill_ipst; /* * Construct message chain for the resolver @@ -5431,16 +5357,16 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) */ /* - * We use esballoc to allocate the second part(the ire_t size mblk) - * of the message chain depicted above. THis mblk will be freed - * by arp when there is a timeout, and otherwise passed to IP - * and IP will * free it after processing the ARP response. + * We use esballoc to allocate the second part (IRE_MBLK) + * of the message chain depicted above. This mblk will be freed + * by arp when there is a timeout, and otherwise passed to IP + * and IP will free it after processing the ARP response. 
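The comment above describes the standard esballoc(9F) pattern: a kmem-allocated buffer is wrapped in an mblk whose free routine (an frtn_t) returns the buffer when the last reference goes away. The following is a kernel-context sketch of just that pattern; the 64-byte payload and the demo_* names are invented for illustration, and this is a module-style fragment rather than a runnable program.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/kmem.h>

#define DEMO_PAYLOAD    64

static void
demo_esb_free(caddr_t arg)
{
        /* Called by STREAMS when the last reference to the mblk is freed. */
        kmem_free(arg, DEMO_PAYLOAD + sizeof (frtn_t));
}

static mblk_t *
demo_esb_alloc(void)
{
        size_t  bufsize = DEMO_PAYLOAD + sizeof (frtn_t);
        char    *buf;
        frtn_t  *frtnp;
        mblk_t  *mp;

        if ((buf = kmem_alloc(bufsize, KM_NOSLEEP)) == NULL)
                return (NULL);

        frtnp = (frtn_t *)(buf + DEMO_PAYLOAD);
        frtnp->free_arg = buf;
        frtnp->free_func = demo_esb_free;

        mp = esballoc((unsigned char *)buf, DEMO_PAYLOAD, BPRI_MED, frtnp);
        if (mp == NULL)
                kmem_free(buf, bufsize);        /* free routine never armed */
        return (mp);
}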
*/ bufsize = sizeof (ire_t) + sizeof (frtn_t); buf = kmem_alloc(bufsize, KM_NOSLEEP); if (buf == NULL) { - ip1dbg(("ire_arpresolver:alloc buffer failed\n ")); + ip1dbg(("ire_arpresolve: alloc buffer failed\n")); return; } frtnp = (frtn_t *)(buf + 1); @@ -5448,16 +5374,15 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) frtnp->free_func = ire_freemblk; ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (ire_mp == NULL) { ip1dbg(("ire_arpresolve: esballoc failed\n")); kmem_free(buf, bufsize); return; } - ASSERT(in_ire->ire_nce != NULL); + areq_mp = copyb(dst_ill->ill_resolver_mp); if (areq_mp == NULL) { - kmem_free(buf, bufsize); + freemsg(ire_mp); return; } @@ -5473,9 +5398,8 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; ire->ire_ipif = in_ire->ire_ipif; - ire->ire_stq = in_ire->ire_stq; - ill = ire_to_ill(ire); - ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; + ire->ire_stq = dst_ill->ill_wq; + ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; ire->ire_zoneid = in_ire->ire_zoneid; ire->ire_stackid = ipst->ips_netstack->netstack_stackid; ire->ire_ipst = ipst; @@ -5528,7 +5452,6 @@ ire_arpresolve(ire_t *in_ire, ill_t *dst_ill) * Note that the ARP/IP merge should replace the functioanlity by providing * direct function calls to clean up unresolved entries in ire/nce lists. */ - void ire_freemblk(ire_t *ire_mp) { @@ -5738,9 +5661,8 @@ retry_nce: * is marked as ND_REACHABLE at this point. * This nce does not undergo any further state changes, * and exists as long as the interface is plumbed. - * Note: we do the ire_nce assignment here for IRE_BROADCAST - * because some functions like ill_mark_bcast() inline the - * ire_add functionality. + * Note: the assignment of ire_nce here is a historical + * artifact of old code that used to inline ire_add(). */ ire->ire_nce = nce; /* @@ -5772,8 +5694,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) ire_t *ire; ip_stack_t *ipst = margs->ict_ipst; - if ((margs->ict_flags & - (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) && + if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (margs->ict_ipif == NULL)) { return (NULL); } @@ -5802,10 +5723,7 @@ ip4_ctable_lookup_impl(ire_ctable_args_t *margs) /* * This function locates IRE_CACHE entries which were added by the * ire_forward() path. We can fully specify the IRE we are looking for by - * providing the ipif_t AND the ire_stq. This is different to MATCH_IRE_ILL - * which uses the ipif_ill. This is inadequate with IPMP groups where - * illgrp_scheduler() may have been used to select an ill from the group for - * the outgoing interface. + * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). */ ire_t * ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index ac14adf00d..1a3df02418 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -2037,6 +2037,7 @@ static int ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, struct mfc *rt) { + ill_t *vill; vifi_t vifi; struct vif *vifp; ipaddr_t dst = ipha->ipha_dst; @@ -2102,25 +2103,21 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, } /* * Don't forward if it didn't arrive from the parent vif for its - * origin. But do match on the groups as we nominate only one - * ill in the group for receiving allmulti packets. + * origin. */ - if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill && - (ill->ill_group == NULL || - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group != - ill->ill_group)) || + vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill; + if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) || (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { /* Came in the wrong interface */ ip1dbg(("ip_mdq: arrived wrong if, vifi %d " "numvifs %d ill %s viftable ill %s\n", (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); + vill->ill_name)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "ip_mdq: arrived wrong if, vifi %d ill " "%s viftable ill %s\n", - (int)vifi, ill->ill_name, - ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); + (int)vifi, ill->ill_name, vill->ill_name); } ipst->ips_mrtstat->mrts_wrong_if++; rt->mfc_wrong_if++; @@ -3047,7 +3044,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) dst = ipha->ipha_dst; ipif = vifp->v_ipif; - mutex_enter(&ipif->ipif_ill->ill_lock); if (ilm_lookup_ipif(ipif, dst) != NULL) { /* * The packet is not yet reassembled, thus we need to @@ -3057,7 +3053,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) mblk_t *mp_loop; ire_t *ire; - mutex_exit(&ipif->ipif_ill->ill_lock); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -3082,8 +3077,6 @@ tbf_send_packet(struct vif *vifp, mblk_t *mp) } if (ire != NULL) ire_refrele(ire); - } else { - mutex_exit(&ipif->ipif_ill->ill_lock); } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index f3c95ae362..cbea9be165 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -68,12 +68,10 @@ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, - int orig_ifindex, zoneid_t zoneid); + zoneid_t zoneid); static void ilm_delete(ilm_t *ilm); static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group); static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static ilg_t *ilg_lookup_ill_index_v6(conn_t *connp, - const in6_addr_t *v6group, int index); static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif); static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, @@ -91,25 +89,21 @@ static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, static int ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src); +static void ill_ilm_walker_hold(ill_t *ill); +static void ill_ilm_walker_rele(ill_t *ill); /* * MT notes: * * Multicast joins operate on both the ilg and ilm structures. Multiple * threads operating on an conn (socket) trying to do multicast joins - * need to synchronize when operating on the ilg. Multiple threads + * need to synchronize when operating on the ilg. Multiple threads * potentially operating on different conn (socket endpoints) trying to * do multicast joins could eventually end up trying to manipulate the - * ilm simulatenously and need to synchronize on the access to the ilm. - * Both are amenable to standard Solaris MT techniques, but it would be - * complex to handle a failover or failback which needs to manipulate - * ilg/ilms if an applications can also simultaenously join/leave - * multicast groups. Hence multicast join/leave also go through the ipsq_t + * ilm simultaneously and need to synchronize access to the ilm. Currently, + * this is done by synchronizing join/leave via per-phyint ipsq_t * serialization. * - * Multicast joins and leaves are single-threaded per phyint/IPMP group - * using the ipsq serialization mechanism. - * * An ilm is an IP data structure used to track multicast join/leave. * An ilm is associated with a <multicast group, ipif> tuple in IPv4 and * with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's @@ -211,12 +205,13 @@ conn_ilg_reap(conn_t *connp) * Returns a pointer to the next available ilg in conn_ilg. Allocs more * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the - * returned ilg). Returns NULL on failure (ENOMEM). + * returned ilg). Returns NULL on failure, in which case `*errp' will be + * filled in with the reason. * * Assumes connp->conn_lock is held. */ static ilg_t * -conn_ilg_alloc(conn_t *connp) +conn_ilg_alloc(conn_t *connp, int *errp) { ilg_t *new, *ret; int curcnt; @@ -224,10 +219,21 @@ conn_ilg_alloc(conn_t *connp) ASSERT(MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated); + /* + * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not + * create any ilgs. 
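conn_ilg_alloc() now reports why it failed through an out parameter instead of a bare NULL. A minimal stand-alone sketch of that contract, with toy names and a plain realloc()-grown array standing in for the ilg chunk logic:

#include <errno.h>
#include <stdlib.h>

typedef struct toy_pool {
        int     pool_closing;           /* analogue of CONN_CLOSING */
        int     *pool_slots;
        int     pool_inuse;
        int     pool_allocated;
} toy_pool_t;

/*
 * Return the next free slot, or NULL with `*errp' set to the reason:
 * EINVAL if the pool is shutting down, ENOMEM on allocation failure.
 */
static int *
toy_slot_alloc(toy_pool_t *p, int *errp)
{
        if (p->pool_closing) {
                *errp = EINVAL;
                return (NULL);
        }
        if (p->pool_inuse == p->pool_allocated) {
                int newcnt = p->pool_allocated + 4;
                int *new = realloc(p->pool_slots, newcnt * sizeof (int));

                if (new == NULL) {
                        *errp = ENOMEM;
                        return (NULL);
                }
                p->pool_slots = new;
                p->pool_allocated = newcnt;
        }
        return (&p->pool_slots[p->pool_inuse++]);
}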
+ */ + if (connp->conn_state_flags & CONN_CLOSING) { + *errp = EINVAL; + return (NULL); + } + if (connp->conn_ilg == NULL) { connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK); - if (connp->conn_ilg == NULL) + if (connp->conn_ilg == NULL) { + *errp = ENOMEM; return (NULL); + } connp->conn_ilg_allocated = ILG_ALLOC_CHUNK; connp->conn_ilg_inuse = 0; } @@ -241,12 +247,15 @@ conn_ilg_alloc(conn_t *connp) * ilg_delete_all() will have to be changed when * this logic is changed. */ + *errp = EBUSY; return (NULL); } curcnt = connp->conn_ilg_allocated; new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK); - if (new == NULL) + if (new == NULL) { + *errp = ENOMEM; return (NULL); + } bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt); mi_free((char *)connp->conn_ilg); connp->conn_ilg = new; @@ -378,42 +387,6 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) } } -/* - * If the given interface has failed, choose a new one to join on so - * that we continue to receive packets. ilg_orig_ifindex remembers - * what the application used to join on so that we know the ilg to - * delete even though we change the ill here. Callers will store the - * ilg returned from this function in ilg_ill. Thus when we receive - * a packet on ilg_ill, conn_wantpacket_v6 will deliver the packets. - * - * This function must be called as writer so we can walk the group - * list and examine flags without holding a lock. - */ -ill_t * -ip_choose_multi_ill(ill_t *ill, const in6_addr_t *grp) -{ - ill_t *till; - ill_group_t *illgrp = ill->ill_group; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (IN6_IS_ADDR_UNSPECIFIED(grp) || illgrp == NULL) - return (ill); - - if ((ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) == 0) - return (ill); - - till = illgrp->illgrp_ill; - while (till != NULL && - (till->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))) { - till = till->ill_group_next; - } - if (till != NULL) - return (till); - - return (ill); -} - static int ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, boolean_t isv6) @@ -560,8 +533,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. + * INADDR_ANY means all multicast addresses. * INADDR_ANY is stored as IPv6 unspecified addr. */ int @@ -578,40 +550,31 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, if (!CLASSD(group) && group != INADDR_ANY) return (EINVAL); + if (IS_UNDER_IPMP(ill)) + return (EINVAL); + /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); /* * Since we are writer, we know the ilm_flags itself cannot * change at this point, and ilm_lookup_ipif would not have * returned a DELETED ilm. However, the data path can free - * ilm->next via ilm_walker_cleanup() so we can safely + * ilm->ilm_next via ilm_walker_cleanup() so we can safely * access anything in ilm except ilm_next (for safe access to - * ilm_next we'd have to take the ill_lock). + * ilm_next we'd have to take the ill_lock). */ if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE)); - /* - * ilms are associated with ipifs in IPv4. It moves with the - * ipif if the ipif moves to a new ill when the interface - * fails. 
Thus we really don't check whether the ipif_ill - * has failed like in IPv6. If it has FAILED the ipif - * will move (daemon will move it) and hence the ilm, if the - * ipif is not IPIF_NOFAILOVER. For the IPIF_NOFAILOVER ipifs, - * we continue to receive in the same place even if the - * interface fails. - */ ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist, - ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid); + ipif->ipif_zoneid); if (ilm == NULL) return (ENOMEM); @@ -623,10 +586,7 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, */ if (ilm_numentries_v6(ill, &v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -646,12 +606,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. * - * ill identifies the interface to join on; it may not match the - * interface requested by the application of a failover has taken - * place. orig_ifindex always identifies the interface requested - * by the app. + * ill identifies the interface to join on. * * ilgstat tells us if there's an ilg associated with this join, * and if so, if it's a new ilg or a change to an existing one. @@ -659,9 +615,8 @@ ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, * the ilg (and will be EXCLUDE {NULL} in the case of no ilg). */ int -ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, - slist_t *ilg_flist) +ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist) { ilm_t *ilm; int ret; @@ -673,37 +628,20 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (EINVAL); } + if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group)) + return (EINVAL); + /* - * An ilm is uniquely identified by the tuple of (group, ill, - * orig_ill). group is the multicast group address, ill is - * the interface on which it is currently joined, and orig_ill - * is the interface on which the application requested the - * join. orig_ill and ill are the same unless orig_ill has - * failed over. - * - * Both orig_ill and ill are required, which means we may have - * 2 ilms on an ill for the same group, but with different - * orig_ills. These must be kept separate, so that when failback - * occurs, the appropriate ilms are moved back to their orig_ill - * without disrupting memberships on the ill to which they had - * been moved. - * - * In order to track orig_ill, we store orig_ifindex in the - * ilm and ilg. + * An ilm is uniquely identified by the tuple of (group, ill) where + * `group' is the multicast group address, and `ill' is the interface + * on which it is currently joined. */ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm != NULL) return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE)); - /* - * We need to remember where the application really wanted - * to join. This will be used later if we want to failback - * to the original interface. 
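For context, ip_addmulti_v6() sits roughly underneath an application's IPV6_JOIN_GROUP request, which names only a multicast group and an interface index. A minimal user-space example of issuing such a join; the group ff02::1:3 and the interface name "net0" are placeholders, not values taken from this code.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
        struct ipv6_mreq mreq;
        int s;

        if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) < 0) {
                perror("socket");
                return (1);
        }

        memset(&mreq, 0, sizeof (mreq));
        if (inet_pton(AF_INET6, "ff02::1:3", &mreq.ipv6mr_multiaddr) != 1 ||
            (mreq.ipv6mr_interface = if_nametoindex("net0")) == 0) {
                fprintf(stderr, "bad group or interface\n");
                return (1);
        }

        if (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
            &mreq, sizeof (mreq)) < 0) {
                perror("IPV6_JOIN_GROUP");
                return (1);
        }

        (void) close(s);
        return (0);
}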
- */ ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode, - ilg_flist, orig_ifindex, zoneid); + ilg_flist, zoneid); if (ilm == NULL) return (ENOMEM); @@ -715,11 +653,7 @@ ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, */ if (ilm_numentries_v6(ill, v6group) > 1) return (0); - if (ill->ill_group == NULL) - ret = ill_join_allmulti(ill); - else - ret = ill_nominate_mcast_rcv(ill->ill_group); - + ret = ill_join_allmulti(ill); if (ret != 0) ilm_delete(ilm); return (ret); @@ -756,6 +690,14 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) ASSERT(IAM_WRITER_ILL(ill)); /* + * If we're on the IPMP ill, use the nominated multicast interface to + * send and receive DLPI messages, if one exists. (If none exists, + * there are no usable interfaces and thus nothing to do.) + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + /* * Create a AR_ENTRY_SQUERY message with a dl_enabmulti_req tacked * on. */ @@ -842,9 +784,8 @@ ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp) } /* - * INADDR_ANY means all multicast addresses. This is only used - * by the multicast router. - * INADDR_ANY is stored as the IPv6 unspecifed addr. + * INADDR_ANY means all multicast addresses. + * INADDR_ANY is stored as the IPv6 unspecified addr. */ int ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) @@ -859,7 +800,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (EINVAL); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; @@ -870,9 +811,7 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) * Look for a match on the ipif. * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address). */ - mutex_enter(&ill->ill_lock); ilm = ilm_lookup_ipif(ipif, group); - mutex_exit(&ill->ill_lock); if (ilm == NULL) return (ENOENT); @@ -897,11 +836,9 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -921,11 +858,10 @@ ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) /* * The unspecified address means all multicast addresses. - * This is only used by the multicast router. */ int -ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, - zoneid_t zoneid, boolean_t no_ilg, boolean_t leaving) +ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + boolean_t no_ilg, boolean_t leaving) { ipif_t *ipif; ilm_t *ilm; @@ -938,25 +874,8 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, /* * Look for a match on the ill. - * (IPV6_LEAVE_GROUP specifies an ill using an ifindex). - * - * Similar to ip_addmulti_v6, we should always look using - * the orig_ifindex. - * - * 1) If orig_ifindex is different from ill's ifindex - * we should have an ilm with orig_ifindex created in - * ip_addmulti_v6. We should delete that here. - * - * 2) If orig_ifindex is same as ill's ifindex, we should - * not delete the ilm that is temporarily here because of - * a FAILOVER. Those ilms will have a ilm_orig_ifindex - * different from ill's ifindex. - * - * Thus, always lookup using orig_ifindex. 
*/ - mutex_enter(&ill->ill_lock); - ilm = ilm_lookup_ill_index_v6(ill, v6group, orig_ifindex, zoneid); - mutex_exit(&ill->ill_lock); + ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); if (ilm == NULL) return (ENOENT); @@ -985,11 +904,9 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, int orig_ifindex, return (0); /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) { + if (ill->ill_join_allmulti) ill_leave_allmulti(ill); - if (ill->ill_group != NULL) - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + return (0); } @@ -1020,6 +937,13 @@ ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) uint32_t addrlen, addroff; ASSERT(IAM_WRITER_ILL(ill)); + + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + /* * Create a AR_ENTRY_SQUERY message with a dl_disabmulti_req tacked * on. @@ -1099,16 +1023,16 @@ ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group) } /* - * Make the driver pass up all multicast packets - * - * With ill groups, the caller makes sure that there is only - * one ill joining the allmulti group. + * Make the driver pass up all multicast packets. NOTE: to keep callers + * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is + * set on it (rather than the cast ill). */ int ill_join_allmulti(ill_t *ill) { mblk_t *promiscon_mp, *promiscoff_mp; uint32_t addrlen, addroff; + ill_t *join_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1120,7 +1044,13 @@ ill_join_allmulti(ill_t *ill) return (0); } - ASSERT(!ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return (0); + + ASSERT(!join_ill->ill_join_allmulti); /* * Create a DL_PROMISCON_REQ message and send it directly to the DLPI @@ -1144,20 +1074,18 @@ ill_join_allmulti(ill_t *ill) ill_dlpi_send(ill, promiscon_mp); } - ill->ill_join_allmulti = B_TRUE; + join_ill->ill_join_allmulti = B_TRUE; return (0); } /* * Make the driver stop passing up all multicast packets - * - * With ill groups, we need to nominate some other ill as - * this ipif->ipif_ill is leaving the group. */ void ill_leave_allmulti(ill_t *ill) { - mblk_t *promiscoff_mp = ill->ill_promiscoff_mp; + mblk_t *promiscoff_mp; + ill_t *leave_ill = ill; ASSERT(IAM_WRITER_ILL(ill)); @@ -1169,7 +1097,13 @@ ill_leave_allmulti(ill_t *ill) return; } - ASSERT(ill->ill_join_allmulti); + /* + * See comment in ip_ll_send_enabmulti_req(). + */ + if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) + return; + + ASSERT(leave_ill->ill_join_allmulti); /* * Create a DL_PROMISCOFF_REQ message and send it directly to @@ -1179,12 +1113,13 @@ ill_leave_allmulti(ill_t *ill) */ if ((ill->ill_net_type == IRE_IF_RESOLVER) && !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { + promiscoff_mp = ill->ill_promiscoff_mp; ASSERT(promiscoff_mp != NULL); ill->ill_promiscoff_mp = NULL; ill_dlpi_send(ill, promiscoff_mp); } - ill->ill_join_allmulti = B_FALSE; + leave_ill->ill_join_allmulti = B_FALSE; } static ill_t * @@ -1213,22 +1148,35 @@ int ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - int ret; + int ret = 0; if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); + + /* + * The ip_addmulti*() functions won't allow IPMP underlying interfaces + * to join allmulti since only the nominated underlying interface in + * the group should receive multicast. 
We silently succeed to avoid + * having to teach IPobs (currently the only caller of this routine) + * to ignore failures in this case. + */ + if (IS_UNDER_IPMP(ill)) + goto out; + if (isv6) { - ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); + ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid, + ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } else { ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); } ill->ill_ipallmulti_cnt++; +out: ipsq_exit(ill->ill_phyint->phyint_ipsq); return (ret); } + int ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { @@ -1236,14 +1184,17 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) return (ENODEV); - ASSERT(ill->ill_ipallmulti_cnt != 0); - if (isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, ifindex, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); + + if (ill->ill_ipallmulti_cnt > 0) { + if (isv6) { + (void) ip_delmulti_v6(&ipv6_all_zeros, ill, + ill->ill_zoneid, B_TRUE, B_TRUE); + } else { + (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, + B_TRUE); + } + ill->ill_ipallmulti_cnt--; } - ill->ill_ipallmulti_cnt--; ipsq_exit(ill->ill_phyint->phyint_ipsq); return (0); } @@ -1260,8 +1211,7 @@ ip_purge_allmulti(ill_t *ill) for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) { if (ill->ill_isv6) { (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_phyint->phyint_ifindex, ill->ill_zoneid, - B_TRUE, B_TRUE); + ill->ill_zoneid, B_TRUE, B_TRUE); } else { (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, B_TRUE); @@ -1539,13 +1489,14 @@ void ill_recover_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 0; - ILM_WALKER_HOLD(ill); + ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1553,47 +1504,45 @@ ill_recover_multicast(ill_t *ill) * in the list. */ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_recover_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_recover_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { - if (ill->ill_group == NULL) { - (void) ill_join_allmulti(ill); - } else { - /* - * We don't want to join on this ill, - * if somebody else in the group has - * already been nominated. - */ - (void) ill_nominate_mcast_rcv(ill->ill_group); - } + (void) ill_join_allmulti(ill); } else { - (void) ip_ll_addmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_joingroup(ilm); + else + igmp_joingroup(ilm); + + (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); + } /* * The opposite of ill_recover_multicast() -- leaves all multicast groups - * that were explicitly joined. Note that both these functions could be - * disposed of if we enhanced ARP to allow us to handle DL_DISABMULTI_REQ - * and DL_ENABMULTI_REQ messages when an interface is down. + * that were explicitly joined. 
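A toy sketch of the split used by ill_join_allmulti() and ill_leave_allmulti() above: the bookkeeping flag stays on the interface the caller named, while the driver request is issued on the IPMP group's nominated multicast interface, if one exists. The types and names below are illustrative only.

#include <stdio.h>

typedef struct toy_ill {
        int             ill_is_ipmp;            /* IPMP meta-interface? */
        int             ill_join_allmulti;      /* bookkeeping flag */
        struct toy_ill  *ill_cast_ill;          /* nominated xmit/recv ill */
        const char      *ill_name;
} toy_ill_t;

static void
toy_promisc_multi(toy_ill_t *ill, int on)
{
        /* stand-in for sending DL_PROMISCON/OFF_REQ to the driver */
        printf("%s: promisc multi %s\n", ill->ill_name, on ? "on" : "off");
}

static int
toy_join_allmulti(toy_ill_t *ill)
{
        toy_ill_t *join_ill = ill;      /* the flag lives here */

        if (ill->ill_is_ipmp && (ill = ill->ill_cast_ill) == NULL)
                return (0);             /* no usable interface; nothing to do */

        toy_promisc_multi(ill, 1);
        join_ill->ill_join_allmulti = 1;
        return (0);
}

int
main(void)
{
        toy_ill_t under = { 0, 0, NULL, "net0" };
        toy_ill_t ipmp0 = { 1, 0, &under, "ipmp0" };

        (void) toy_join_allmulti(&ipmp0);
        printf("ipmp0 joined: %d, net0 joined: %d\n",
            ipmp0.ill_join_allmulti, under.ill_join_allmulti);
        return (0);
}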
*/ void ill_leave_multicast(ill_t *ill) { ilm_t *ilm; + ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; ASSERT(IAM_WRITER_ILL(ill)); ill->ill_need_recover_multicast = 1; - ILM_WALKER_HOLD(ill); + ill_ilm_walker_hold(ill); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * Check how many ipif's that have members in this group - @@ -1601,25 +1550,26 @@ ill_leave_multicast(ill_t *ill) * in the list. */ if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) + ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, + ALL_ZONES) != ilm) { continue; - ip1dbg(("ill_leave_multicast: %s\n", - inet_ntop(AF_INET6, &ilm->ilm_v6addr, addrbuf, - sizeof (addrbuf)))); + } + + ip1dbg(("ill_leave_multicast: %s\n", inet_ntop(AF_INET6, + &ilm->ilm_v6addr, addrbuf, sizeof (addrbuf)))); + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { ill_leave_allmulti(ill); - /* - * If we were part of an IPMP group, then - * ill_handoff_responsibility() has already - * nominated a new member (so we don't). - */ - ASSERT(ill->ill_group == NULL); } else { - (void) ip_ll_delmulti_v6(ill->ill_ipif, - &ilm->ilm_v6addr); + if (ill->ill_isv6) + mld_leavegroup(ilm); + else + igmp_leavegroup(ilm); + + (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr); } } - ILM_WALKER_RELE(ill); + ill_ilm_walker_rele(ill); } /* Find an ilm for matching the ill */ @@ -1628,91 +1578,79 @@ ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid) { in6_addr_t v6group; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. + * INADDR_ANY is represented as the IPv6 unspecified addr. */ if (group == INADDR_ANY) v6group = ipv6_all_zeros; else IN6_IPADDR_TO_V4MAPPED(group, &v6group); - return (ilm_lookup_ill_v6(ill, &v6group, zoneid)); + return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid)); } /* - * Find an ilm for matching the ill. All the ilm lookup functions - * ignore ILM_DELETED ilms. These have been logically deleted, and - * igmp and linklayer disable multicast have been done. Only mi_free - * yet to be done. Still there in the list due to ilm_walkers. The - * last walker will release it. + * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be + * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match + * against any ill in the group. However, if `restrict_solicited' is set, + * then specifically for IPv6 solicited-node multicast, the match will be + * restricted to the specified `ill'. */ ilm_t * -ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid) +ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, + boolean_t restrict_solicited, zoneid_t zoneid) { ilm_t *ilm; + ilm_walker_t ilw; + boolean_t restrict_ill = B_FALSE; - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); + /* + * In general, underlying interfaces cannot have multicast memberships + * and thus lookups always match across the illgrp. However, we must + * allow IPv6 solicited-node multicast memberships on underlying + * interfaces, and thus an IPMP meta-interface and one of its + * underlying ills may have the same solicited-node multicast address. + * In that case, we need to restrict the lookup to the requested ill. 
+ * However, we may receive packets on an underlying interface that + * are for the corresponding IPMP interface's solicited-node multicast + * address, and thus in that case we need to match across the group -- + * hence the unfortunate `restrict_solicited' argument. + */ + if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited) + restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill)); - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + ilm = ilm_walker_start(&ilw, ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid)) - return (ilm); - } - return (NULL); -} - -ilm_t * -ilm_lookup_ill_index_v6(ill_t *ill, const in6_addr_t *v6group, int index, - zoneid_t zoneid) -{ - ilm_t *ilm; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - - for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) + if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) continue; - if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && - (zoneid == ALL_ZONES || zoneid == ilm->ilm_zoneid) && - ilm->ilm_orig_ifindex == index) { - return (ilm); + if (!restrict_ill || ill == (ill->ill_isv6 ? + ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) { + break; } } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } - /* - * Found an ilm for the ipif. Only needed for IPv4 which does + * Find an ilm for the ipif. Only needed for IPv4 which does * ipif specific socket options. */ ilm_t * ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - - ASSERT(ill->ill_ilm_walker_cnt != 0 || MUTEX_HELD(&ill->ill_lock)); - /* - * INADDR_ANY is represented as the IPv6 unspecifed addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + ilm_t *ilm; + ilm_walker_t ilw; - for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; - if (ilm->ilm_ipif == ipif && - IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group)) - return (ilm); + ilm = ilm_walker_start(&ilw, ipif->ipif_ill); + for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group) + break; } - return (NULL); + ilm_walker_finish(&ilw); + return (ilm); } /* @@ -1739,8 +1677,7 @@ ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group) /* Caller guarantees that the group is not already on the list */ static ilm_t * ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, - mcast_record_t ilg_fmode, slist_t *ilg_flist, int orig_ifindex, - zoneid_t zoneid) + mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid) { ill_t *ill = ipif->ipif_ill; ilm_t *ilm; @@ -1783,19 +1720,10 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, (char *), "ilm", (void *), ilm); ipif->ipif_ilm_cnt++; } + ASSERT(ill->ill_ipst); ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */ - /* - * After this if ilm moves to a new ill, we don't change - * the ilm_orig_ifindex. Thus, if ill_index != ilm_orig_ifindex, - * it has been moved. Indexes don't match even when the application - * wants to join on a FAILED/INACTIVE interface because we choose - * a new interface to join in. This is considered as an implicit - * move. 
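A compilable sketch of the lookup rule spelled out in the ilm_lookup_ill_v6() comment above, using toy types: memberships normally match any interface in the same IPMP group, but solicited-node groups are matched against the exact interface when the caller requests that restriction.

#include <stddef.h>

typedef struct toy_ill {
        int     ill_grp_id;     /* 0: not in a group */
        int     ill_is_ipmp;
        int     ill_under_ipmp;
} toy_ill_t;

typedef struct toy_ilm {
        toy_ill_t       *ilm_ill;
        unsigned int    ilm_group;
        struct toy_ilm  *ilm_next;
} toy_ilm_t;

static int
toy_same_grp(const toy_ill_t *a, const toy_ill_t *b)
{
        return (a == b ||
            (a->ill_grp_id != 0 && a->ill_grp_id == b->ill_grp_id));
}

/*
 * Find a membership for `group' as seen from `ill'.  Normally any ill
 * in the same IPMP group matches; for a solicited-node group with
 * `restrict_solicited' set, only the exact ill does.
 */
static toy_ilm_t *
toy_ilm_lookup(toy_ilm_t *head, toy_ill_t *ill, unsigned int group,
    int group_is_solicited_node, int restrict_solicited)
{
        int restrict_ill = group_is_solicited_node && restrict_solicited &&
            (ill->ill_is_ipmp || ill->ill_under_ipmp);
        toy_ilm_t *ilm;

        for (ilm = head; ilm != NULL; ilm = ilm->ilm_next) {
                if (ilm->ilm_group != group)
                        continue;
                if (restrict_ill ? (ilm->ilm_ill == ill) :
                    toy_same_grp(ilm->ilm_ill, ill))
                        return (ilm);
        }
        return (NULL);
}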
- */ - ilm->ilm_orig_ifindex = orig_ifindex; - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); @@ -1969,6 +1897,108 @@ ilm_delete(ilm_t *ilm) } } +/* Increment the ILM walker count for `ill' */ +static void +ill_ilm_walker_hold(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt++; + mutex_exit(&ill->ill_lock); +} + +/* Decrement the ILM walker count for `ill' */ +static void +ill_ilm_walker_rele(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + ill->ill_ilm_walker_cnt--; + if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd) + ilm_walker_cleanup(ill); /* drops ill_lock */ + else + mutex_exit(&ill->ill_lock); +} + +/* + * Start walking the ILMs associated with `ill'; the first ILM in the walk + * (if any) is returned. State associated with the walk is stored in `ilw'. + * Note that walks associated with interfaces under IPMP also walk the ILMs + * on the associated IPMP interface; this is handled transparently to callers + * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP + * interface; the only exception is to support IPv6 test addresses, which + * require ILMs for their associated solicited-node multicast addresses.) + */ +ilm_t * +ilm_walker_start(ilm_walker_t *ilw, ill_t *ill) +{ + ilw->ilw_ill = ill; + if (IS_UNDER_IPMP(ill)) + ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); + else + ilw->ilw_ipmp_ill = NULL; + + ill_ilm_walker_hold(ill); + if (ilw->ilw_ipmp_ill != NULL) + ill_ilm_walker_hold(ilw->ilw_ipmp_ill); + + if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL) + ilw->ilw_walk_ill = ilw->ilw_ipmp_ill; + else + ilw->ilw_walk_ill = ilw->ilw_ill; + + return (ilm_walker_step(ilw, NULL)); +} + +/* + * Helper function for ilm_walker_step() that returns the next ILM + * associated with `ilw', regardless of whether it's deleted. + */ +static ilm_t * +ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm) +{ + if (ilm == NULL) + return (ilw->ilw_walk_ill->ill_ilm); + + if (ilm->ilm_next != NULL) + return (ilm->ilm_next); + + if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) { + ilw->ilw_walk_ill = ilw->ilw_ill; + /* + * It's possible that ilw_ill left the group during our walk, + * so we can't ASSERT() that it's under IPMP. Callers that + * care will be writer on the IPSQ anyway. + */ + return (ilw->ilw_walk_ill->ill_ilm); + } + return (NULL); +} + +/* + * Step to the next ILM associated with `ilw'. + */ +ilm_t * +ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm) +{ + while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) { + if (!(ilm->ilm_flags & ILM_DELETED)) + break; + } + return (ilm); +} + +/* + * Finish the ILM walk associated with `ilw'. + */ +void +ilm_walker_finish(ilm_walker_t *ilw) +{ + ill_ilm_walker_rele(ilw->ilw_ill); + if (ilw->ilw_ipmp_ill != NULL) { + ill_ilm_walker_rele(ilw->ilw_ipmp_ill); + ill_refrele(ilw->ilw_ipmp_ill); + } + bzero(&ilw, sizeof (ilw)); +} /* * Looks up the appropriate ipif given a v4 multicast group and interface @@ -2256,16 +2286,15 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 
0 : err); } ilgstat = ILGSTAT_NEW; IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group); ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; } else if (leave_grp) { ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); @@ -2389,7 +2418,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, const struct in6_addr *grp, ill_t *ill) { ilg_t *ilg; - int i, orig_ifindex, orig_fmode, new_fmode, err; + int i, orig_fmode, new_fmode, err; slist_t *orig_filter = NULL; slist_t *new_filter = NULL; struct sockaddr_storage *sl; @@ -2409,65 +2438,31 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill - * if things have moved. - */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); if (ilg == NULL) { /* * if the request was actually to leave, and we * didn't find an ilg, there's nothing to do. */ if (!leave_grp) - ilg = conn_ilg_alloc(connp); + ilg = conn_ilg_alloc(connp, &err); if (leave_grp || ilg == NULL) { mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : ENOMEM); + return (leave_grp ? 0 : err); } ilgstat = ILGSTAT_NEW; ilg->ilg_v6group = *grp; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be - * different from the ill we've been given if it's - * currently down and part of a group. - * - * new ill is not refheld; we are writer. - */ - ill = ip_choose_multi_ill(ill, grp); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the index that we joined on, so that we can - * successfully delete them later on and also search for - * duplicates if the application wants to join again. - */ - ilg->ilg_orig_ifindex = orig_ifindex; } else if (leave_grp) { - /* - * Use the ilg's current ill for the deletion, - * we might have failed over. - */ - ill = ilg->ilg_ill; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(grp, ill, orig_ifindex, - connp->conn_zoneid, B_FALSE, B_TRUE); + (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE, + B_TRUE); return (0); } else { ilgstat = ILGSTAT_CHANGE; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. - */ - ill = ilg->ilg_ill; /* preserve existing state in case ip_addmulti() fails */ orig_fmode = ilg->ilg_fmode; if (ilg->ilg_filter == NULL) { @@ -2531,8 +2526,8 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, mutex_exit(&connp->conn_lock); - err = ip_addmulti_v6(grp, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode, + new_filter); if (err != 0) { /* * Restore the original filter state, or delete the @@ -2541,7 +2536,7 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, * conn_lock. 
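Referring back to the ilm_walker_start()/ilm_walker_step()/ilm_walker_finish() routines added earlier in this file's changes: a walk covers up to two ILM lists (the IPMP meta-interface's and the underlying interface's own) and skips logically deleted entries. A stand-alone sketch of that shape follows, with toy types and no locking or walker hold counts.

#include <stddef.h>

typedef struct toy_ilm {
        int             ilm_deleted;
        struct toy_ilm  *ilm_next;
} toy_ilm_t;

typedef struct toy_walker {
        toy_ilm_t       *w_lists[2];    /* lists to cover, in order */
        int             w_cur;
} toy_walker_t;

/* Next entry across both lists, deleted or not. */
static toy_ilm_t *
toy_next_all(toy_walker_t *w, toy_ilm_t *ilm)
{
        if (ilm != NULL && ilm->ilm_next != NULL)
                return (ilm->ilm_next);
        if (ilm != NULL)
                w->w_cur++;                     /* current list exhausted */
        while (w->w_cur < 2) {
                if (w->w_lists[w->w_cur] != NULL)
                        return (w->w_lists[w->w_cur]);
                w->w_cur++;
        }
        return (NULL);
}

/* Next entry that is not logically deleted. */
static toy_ilm_t *
toy_walk_step(toy_walker_t *w, toy_ilm_t *ilm)
{
        while ((ilm = toy_next_all(w, ilm)) != NULL) {
                if (!ilm->ilm_deleted)
                        break;
        }
        return (ilm);
}

/* Walk the IPMP meta-interface's list (if any) first, then our own. */
static toy_ilm_t *
toy_walk_start(toy_walker_t *w, toy_ilm_t *own_list, toy_ilm_t *ipmp_list)
{
        w->w_lists[0] = (ipmp_list != NULL) ? ipmp_list : own_list;
        w->w_lists[1] = (ipmp_list != NULL) ? own_list : NULL;
        w->w_cur = 0;
        return (toy_walk_step(w, NULL));
}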
*/ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, grp, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, grp, ill); ASSERT(ilg != NULL); if (ilgstat == ILGSTAT_NEW) { ilg_delete(connp, ilg, NULL); @@ -3043,20 +3038,12 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { ilg_t *ilg; - ill_t *ilg_ill; - uint_t ilg_orig_ifindex; boolean_t leaving = B_TRUE; ASSERT(IAM_WRITER_ILL(ill)); - /* - * Use the index that we originally used to join. We can't - * use the ill directly because ilg_ill could point to - * a new ill if things have moved. - */ mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, - ill->ill_phyint->phyint_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { mutex_exit(&connp->conn_lock); return (EADDRNOTAVAIL); @@ -3087,12 +3074,10 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, leaving = B_FALSE; } - ilg_ill = ilg->ilg_ill; - ilg_orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, v6src); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(v6group, ilg_ill, ilg_orig_ifindex, - connp->conn_zoneid, B_FALSE, leaving); + (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE, + leaving); return (0); } @@ -3345,10 +3330,10 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, if (ilg == NULL) { ilgstat = ILGSTAT_NEW; - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (src != INADDR_ANY) { ilg->ilg_filter = l_alloc(); @@ -3369,7 +3354,6 @@ ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, } ilg->ilg_ipif = ipif; ilg->ilg_ill = NULL; - ilg->ilg_orig_ifindex = 0; ilg->ilg_fmode = fmode; } else { int index; @@ -3437,7 +3421,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { int error = 0; - int orig_ifindex; ilg_t *ilg; ilg_stat_t ilgstat; slist_t *new_filter = NULL; @@ -3456,13 +3439,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, */ mutex_enter(&connp->conn_lock); - /* - * Use the ifindex to do the lookup. We can't use the ill - * directly because ilg_ill could point to a different ill if - * things have moved. - */ - orig_ifindex = ill->ill_phyint->phyint_ifindex; - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); /* * Depending on the option we're handling, may or may not be okay @@ -3501,10 +3478,10 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, } if (ilg == NULL) { - if ((ilg = conn_ilg_alloc(connp)) == NULL) { + if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { mutex_exit(&connp->conn_lock); l_free(new_filter); - return (ENOMEM); + return (error); } if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { ilg->ilg_filter = l_alloc(); @@ -3521,22 +3498,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilg->ilg_v6group = *v6group; ilg->ilg_fmode = fmode; ilg->ilg_ipif = NULL; - /* - * Choose our target ill to join on. This might be different - * from the ill we've been given if it's currently down and - * part of a group. - * - * new ill is not refheld; we are writer. 
- */ - ill = ip_choose_multi_ill(ill, v6group); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); ilg->ilg_ill = ill; - /* - * Remember the orig_ifindex that we joined on, so that we - * can successfully delete them later on and also search - * for duplicates if the application wants to join again. - */ - ilg->ilg_orig_ifindex = orig_ifindex; } else { int index; if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) { @@ -3560,13 +3522,6 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilgstat = ILGSTAT_CHANGE; index = ilg->ilg_filter->sl_numsrc++; ilg->ilg_filter->sl_addr[index] = *v6src; - /* - * The current ill might be different from the one we were - * asked to join on (if failover has occurred); we should - * join on the ill stored in the ilg. The original ill - * is noted in ilg_orig_ifindex, which matched our request. - */ - ill = ilg->ilg_ill; } /* @@ -3584,8 +3539,8 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * info for the ill, which involves looking at the status of * all the ilgs associated with this group/interface pair. */ - error = ip_addmulti_v6(v6group, ill, orig_ifindex, connp->conn_zoneid, - ilgstat, new_fmode, new_filter); + error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter); if (error != 0) { /* * But because we waited, we have to undo the ilg update @@ -3595,7 +3550,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, in6_addr_t delsrc = (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_index_v6(connp, v6group, orig_ifindex); + ilg = ilg_lookup_ill_v6(connp, v6group, ill); ASSERT(ilg != NULL); ilg_delete(connp, ilg, &delsrc); mutex_exit(&connp->conn_lock); @@ -3639,7 +3594,7 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) ASSERT(ilg->ilg_ill == NULL); ilg_ill = ipif->ipif_ill; ASSERT(!ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3692,7 +3647,7 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, continue; ASSERT(ilg->ilg_ipif == NULL); ASSERT(ilg_ill->ill_isv6); - if (ilg_ill == ill && + if (IS_ON_SAME_LAN(ilg_ill, ill) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ @@ -3724,35 +3679,6 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, } /* - * Get the ilg whose ilg_orig_ifindex is associated with ifindex. - * This is useful when the interface fails and we have moved - * to a new ill, but still would like to locate using the index - * that we originally used to join. Used only for IPv6 currently. 
- */ -static ilg_t * -ilg_lookup_ill_index_v6(conn_t *connp, const in6_addr_t *v6group, int ifindex) -{ - ilg_t *ilg; - int i; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_ill == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) - continue; - /* ilg_ipif is NULL for V6 */ - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(ilg->ilg_orig_ifindex != 0); - if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group) && - ilg->ilg_orig_ifindex == ifindex) { - return (ilg); - } - } - return (NULL); -} - -/* * Find an IPv6 ilg matching group and ill */ ilg_t * @@ -3863,32 +3789,28 @@ ilg_delete_all(conn_t *connp) in6_addr_t v6group; boolean_t success; ipsq_t *ipsq; - int orig_ifindex; mutex_enter(&connp->conn_lock); retry: ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; ) { + for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { ilg = &connp->conn_ilg[i]; /* * Since this walk is not atomic (we drop the * conn_lock and wait in ipsq_enter) we need * to check for the ILG_DELETED flag. */ - if (ilg->ilg_flags & ILG_DELETED) { - /* Go to the next ilg */ - i--; + if (ilg->ilg_flags & ILG_DELETED) continue; - } - v6group = ilg->ilg_v6group; - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) { ipif = ilg->ilg_ipif; ill = ipif->ipif_ill; } else { ipif = NULL; ill = ilg->ilg_ill; } + /* * We may not be able to refhold the ill if the ill/ipif * is changing. But we need to make sure that the ill will @@ -3897,11 +3819,9 @@ retry: * in which case the unplumb thread will handle the cleanup, * and we move on to the next ilg. */ - if (!ill_waiter_inc(ill)) { - /* Go to the next ilg */ - i--; + if (!ill_waiter_inc(ill)) continue; - } + mutex_exit(&connp->conn_lock); /* * To prevent deadlock between ill close which waits inside @@ -3916,51 +3836,31 @@ retry: ipsq = ill->ill_phyint->phyint_ipsq; ill_waiter_dcr(ill); mutex_enter(&connp->conn_lock); - if (!success) { - /* Go to the next ilg */ - i--; + if (!success) continue; - } /* - * Make sure that nothing has changed under. For eg. - * a failover/failback can change ilg_ill while we were - * waiting to become exclusive above + * Move on if the ilg was deleted while conn_lock was dropped. */ - if (IN6_IS_ADDR_V4MAPPED(&v6group)) { - ipif = ilg->ilg_ipif; - ill = ipif->ipif_ill; - } else { - ipif = NULL; - ill = ilg->ilg_ill; - } - if (!IAM_WRITER_ILL(ill) || (ilg->ilg_flags & ILG_DELETED)) { - /* - * The ilg has changed under us probably due - * to a failover or unplumb. Retry on the same ilg. 
- */ + if (ilg->ilg_flags & ILG_DELETED) { mutex_exit(&connp->conn_lock); ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); continue; } v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - if (ipif != NULL) + if (ipif != NULL) { (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif, B_FALSE, B_TRUE); - - else - (void) ip_delmulti_v6(&v6group, ill, orig_ifindex, + } else { + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); - + } ipsq_exit(ipsq); mutex_enter(&connp->conn_lock); - /* Go to the next ilg */ - i--; } ILG_WALKER_RELE(connp); @@ -4063,7 +3963,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) int i; char group_buf[INET6_ADDRSTRLEN]; in6_addr_t v6group; - int orig_ifindex; ilg_t *ilg; /* @@ -4097,11 +3996,10 @@ conn_delete_ill(conn_t *connp, caddr_t arg) ill->ill_name)); v6group = ilg->ilg_v6group; - orig_ifindex = ilg->ilg_orig_ifindex; ilg_delete(connp, ilg, NULL); mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(&v6group, ill, orig_ifindex, + (void) ip_delmulti_v6(&v6group, ill, connp->conn_zoneid, B_FALSE, B_TRUE); mutex_enter(&connp->conn_lock); } @@ -4115,7 +4013,6 @@ conn_delete_ill(conn_t *connp, caddr_t arg) if (connp->conn_multicast_ill == ill) { /* Revert to late binding */ connp->conn_multicast_ill = NULL; - connp->conn_orig_multicast_ifindex = 0; } mutex_exit(&connp->conn_lock); } diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index b53897cefe..895cc74bd2 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -83,8 +83,9 @@ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, static void nce_ire_delete(nce_t *nce); static void nce_ire_delete1(ire_t *ire, char *nce_arg); static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); -static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *); -static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr); +static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *, + nce_t *); +static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr); static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); @@ -93,11 +94,16 @@ static mblk_t *nce_udreq_alloc(ill_t *ill); static void nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr); static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); -static boolean_t nce_xmit(ill_t *ill, uint32_t operation, - ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, +static boolean_t nce_xmit(ill_t *ill, uint8_t type, + boolean_t use_lla_addr, const in6_addr_t *sender, const in6_addr_t *target, int flag); +static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, + const in6_addr_t *target, uint_t flags); +static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, + const in6_addr_t *src, uint_t flags); static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, nce_t **, nce_t *); +static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill); #ifdef DEBUG static void nce_trace_cleanup(const nce_t *); @@ -110,22 +116,6 @@ static void nce_trace_cleanup(const nce_t *); (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) -/* - * Compute default flags to use for an advertisement of this nce's address. - */ -static int -nce_advert_flags(const nce_t *nce) -{ - int flag = 0; - - if (nce->nce_flags & NCE_F_ISROUTER) - flag |= NDP_ISROUTER; - if (!(nce->nce_flags & NCE_F_ANYCAST)) - flag |= NDP_ORIDE; - - return (flag); -} - /* Non-tunable probe interval, based on link capabilities */ #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 
150 : 1500) @@ -262,8 +252,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -282,23 +271,20 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - addr, /* Source and target of the advertisement pkt */ - &ipv6_all_hosts_mcast, /* Destination of the packet */ - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast, + 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_unsolicit_count++; if (nce->nce_unsolicit_count != 0) { + ASSERT(nce->nce_timeout_id == 0); nce->nce_timeout_id = timeout(ndp_timer, nce, MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); } mutex_exit(&nce->nce_lock); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); } + /* * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then * we call nce_fastpath as soon as the nce is resolved in ndp_process. @@ -311,10 +297,10 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, } int -ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *extract_mask, - uint32_t hw_extract_start, uint16_t flags, uint16_t state, - nce_t **newnce) +ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, + const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, + uint16_t state, nce_t **newnce) { int err = 0; nce_t *nce; @@ -325,7 +311,7 @@ ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) { err = ndp_add_v6(ill, hw_addr, @@ -562,13 +548,11 @@ nce_ire_delete_list(nce_t *nce) if (nce->nce_ipversion == IPV4_VERSION) { ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } else { ASSERT(nce->nce_ipversion == IPV6_VERSION); ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, - (char *)nce, nce->nce_ill); + IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); } NCE_REFRELE_NOTR(nce); nce = nce_next; @@ -628,8 +612,7 @@ ndp_restart_dad(nce_t *nce) nce->nce_state = ND_PROBE; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, - B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); + dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; @@ -649,22 +632,19 @@ ndp_restart_dad(nce_t *nce) * If one is found, the refcnt on the nce will be incremented. 
*/ nce_t * -ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock) +ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + boolean_t caller_holds_lock) { nce_t *nce; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill != NULL && ill->ill_isv6); - if (!caller_holds_lock) { + ASSERT(ill->ill_isv6); + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - } /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + nce = nce_lookup_addr(ill, match_illgrp, addr, nce); if (nce == NULL) nce = nce_lookup_mapping(ill, addr); if (!caller_holds_lock) @@ -685,14 +665,17 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; - if (!caller_holds_lock) { + if (!caller_holds_lock) mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - } /* Get head of v4 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (!caller_holds_lock) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); @@ -706,7 +689,8 @@ ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) * lock (ndp_g_lock). */ static nce_t * -nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) +nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, + nce_t *nce) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; @@ -716,12 +700,12 @@ nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) else ndp = ipst->ips_ndp4; - ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); for (; nce != NULL; nce = nce->nce_next) { - if (nce->nce_ill == ill) { + if (nce->nce_ill == ill || + match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) { if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { @@ -771,8 +755,8 @@ nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) * Process passed in parameters either from an incoming packet or via * user ioctl. */ -void -ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +static void +nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { ill_t *ill = nce->nce_ill; uint32_t hw_addr_len = ill->ill_nd_lla_len; @@ -852,7 +836,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } else { /* * Send locally originated packets back - * into * ip_wput_v6. + * into ip_wput_v6. */ put(ill->ill_wq, mp); } @@ -918,6 +902,65 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } /* + * Walker state structure used by ndp_process() / ndp_process_entry(). + */ +typedef struct ndp_process_data { + ill_t *np_ill; /* ill/illgrp to match against */ + const in6_addr_t *np_addr; /* IPv6 address to match */ + uchar_t *np_hw_addr; /* passed to nce_process() */ + uint32_t np_flag; /* passed to nce_process() */ + boolean_t np_is_adv; /* passed to nce_process() */ +} ndp_process_data_t; + +/* + * Walker callback used by ndp_process() for IPMP groups: calls nce_process() + * for each NCE with a matching address that's in the same IPMP group. 
+ */ +static void +ndp_process_entry(nce_t *nce, void *arg) +{ + ndp_process_data_t *npp = arg; + + if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) && + IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) && + IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { + nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv); + } +} + +/* + * Wrapper around nce_process() that handles IPMP. In particular, for IPMP, + * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have + * more than one NCE for a given IPv6 address to tend to. In that case, we + * need to walk all NCEs and callback nce_process() for each one. Since this + * is expensive, in the non-IPMP case we just directly call nce_process(). + * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP + * interfaces in an IPMP group share the same NCEs -- at which point this + * function can be removed entirely. + */ +void +ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +{ + ill_t *ill = nce->nce_ill; + struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6; + ndp_process_data_t np; + + if (ill->ill_grp == NULL) { + nce_process(nce, hw_addr, flag, is_adv); + return; + } + + /* IPMP case: walk all NCEs */ + np.np_ill = ill; + np.np_addr = &nce->nce_addr; + np.np_flag = flag; + np.np_is_adv = is_adv; + np.np_hw_addr = hw_addr; + + ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES); +} + +/* * Pass arg1 to the pfi supplied, along with each nce in existence. * ndp_walk() places a REFHOLD on the nce and drops the lock when * walking the hash list. @@ -926,7 +969,6 @@ void ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { - nce_t *nce; nce_t *nce1; nce_t **ncep; @@ -1021,27 +1063,58 @@ ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) int ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) { - nce_t *nce; - int err = 0; + nce_t *nce, *hw_nce = NULL; + int err; + ill_t *ipmp_ill; + uint16_t nce_flags; uint32_t ms; mblk_t *mp_nce = NULL; ip_stack_t *ipst = ill->ill_ipst; + uchar_t *hwaddr = NULL; ASSERT(ill->ill_isv6); - if (IN6_IS_ADDR_MULTICAST(dst)) { - err = nce_set_multicast(ill, dst); - return (err); + + if (IN6_IS_ADDR_MULTICAST(dst)) + return (nce_set_multicast(ill, dst)); + + nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; + + /* + * If `ill' is under IPMP, then first check to see if there's an NCE + * for `dst' on the IPMP meta-interface (e.g., because an application + * explicitly did an SIOCLIFSETND to tie a hardware address to `dst'). + * If so, we use that hardware address when creating the NCE below. + * Note that we don't yet have a mechanism to remove these NCEs if the + * NCE for `dst' on the IPMP meta-interface is subsequently removed -- + * but rather than build such a beast, we should fix NCEs so that they + * can be properly shared across an IPMP group. + */ + if (IS_UNDER_IPMP(ill)) { + if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE); + if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) { + hwaddr = hw_nce->nce_res_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ipmp_ill); + nce_flags |= hw_nce->nce_flags; + } + ill_refrele(ipmp_ill); + } } + err = ndp_lookup_then_add_v6(ill, - NULL, /* No hardware address */ + B_FALSE, /* NCE fastpath is per ill; don't match across group */ + hwaddr, dst, &ipv6_all_ones, &ipv6_all_zeros, 0, - (ill->ill_flags & ILLF_NONUD) ? 
NCE_F_NONUD : 0, - ND_INCOMPLETE, + nce_flags, + hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE, &nce); + if (hw_nce != NULL) + NCE_REFRELE(hw_nce); + switch (err) { case 0: /* @@ -1057,11 +1130,10 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) NCE_REFRELE(nce); return (0); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&nce->nce_lock); if (nce->nce_state != ND_INCOMPLETE) { mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); return (0); } @@ -1069,14 +1141,11 @@ ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) if (mp_nce == NULL) { /* The caller will free mp */ mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); ndp_delete(nce); NCE_REFRELE(nce); return (ENOMEM); } - ms = nce_solicit(nce, mp_nce); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, mp_nce)) == 0) { /* The caller will free mp */ if (mp_nce != mp) freeb(mp_nce); @@ -1143,6 +1212,7 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) } err = ndp_lookup_then_add_v6(ill, + B_FALSE, /* NCE fastpath is per ill; don't match across group */ NULL, /* hardware address */ dst, &ipv6_all_ones, @@ -1191,7 +1261,7 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst) mutex_enter(&ipst->ips_ndp6->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); - nce = nce_lookup_addr(ill, dst, nce); + nce = nce_lookup_addr(ill, B_FALSE, dst, nce); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); NCE_REFRELE(nce); @@ -1259,7 +1329,13 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr) sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; - nce = ndp_lookup_v6(ill, addr, B_FALSE); + /* + * NOTE: if the ill is an IPMP interface, then match against the whole + * illgrp. This e.g. allows in.ndpd to retrieve the link layer + * addresses for the data addresses on an IPMP interface even though + * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill. + */ + nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE); if (nce == NULL) return (ESRCH); /* If in INCOMPLETE state, no link layer address is available yet */ @@ -1347,24 +1423,14 @@ ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, uint32_t nce_solicit(nce_t *nce, mblk_t *mp) { - ill_t *ill; - ill_t *src_ill; ip6_t *ip6h; - in6_addr_t src; - in6_addr_t dst; - ipif_t *ipif; - ip6i_t *ip6i; - boolean_t dropped = B_FALSE; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + in6_addr_t sender; + boolean_t dropped; - ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); ASSERT(MUTEX_HELD(&nce->nce_lock)); - ill = nce->nce_ill; - ASSERT(ill != NULL); - if (nce->nce_rcnt == 0) { + if (nce->nce_rcnt == 0) return (0); - } if (mp == NULL) { ASSERT(nce->nce_qd_mp != NULL); @@ -1385,60 +1451,22 @@ nce_solicit(nce_t *nce, mblk_t *mp) * could be from the nce_qd_mp which could have b_next/b_prev * non-NULL. */ - ip6i = (ip6i_t *)ip6h; - ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); } - src = ip6h->ip6_src; - /* - * If the src of outgoing packet is one of the assigned interface - * addresses use it, otherwise we will pick the source address below. 
- */ - src_ill = ill; - if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { - if (ill->ill_group != NULL) - src_ill = ill->ill_group->illgrp_ill; - for (; src_ill != NULL; src_ill = src_ill->ill_group_next) { - for (ipif = src_ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (IN6_ARE_ADDR_EQUAL(&src, - &ipif->ipif_v6lcl_addr)) { - break; - } - } - if (ipif != NULL) - break; - } - /* - * If no relevant ipif can be found, then it's not one of our - * addresses. Reset to :: and let nce_xmit. If an ipif can be - * found, but it's not yet done with DAD verification, then - * just postpone this transmission until later. - */ - if (src_ill == NULL) - src = ipv6_all_zeros; - else if (!ipif->ipif_addr_ready) - return (ill->ill_reachable_retrans_time); - } - dst = nce->nce_addr; + /* - * If source address is unspecified, nce_xmit will choose - * one for us and initialize the hardware address also - * appropriately. + * Need to copy the sender address into a local since `mp' can + * go away once we drop nce_lock. */ - if (IN6_IS_ADDR_UNSPECIFIED(&src)) - src_ill = NULL; + sender = ip6h->ip6_src; nce->nce_rcnt--; mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src, - &dst, 0); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); + dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_rcnt++; - return (ill->ill_reachable_retrans_time); + return (nce->nce_ill->ill_reachable_retrans_time); } /* @@ -1475,7 +1503,7 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) */ mutex_enter(&ill->ill_lock); if (!(ipif->ipif_flags & IPIF_DUPLICATE) || - (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { + (ipif->ipif_state_flags & IPIF_CONDEMNED)) { mutex_exit(&ill->ill_lock); continue; } @@ -1485,8 +1513,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; - if (ipif_ndp_up(ipif) != EINPROGRESS) - (void) ipif_up_done_v6(ipif); + VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); + (void) ipif_up_done_v6(ipif); } freeb(mp); } @@ -1515,7 +1543,7 @@ ipif6_dup_recovery(void *arg) /* * No lock, because this is just an optimization. */ - if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) + if (ipif->ipif_state_flags & IPIF_CONDEMNED) return; /* If the link is down, we'll retry this later */ @@ -1542,13 +1570,20 @@ ndp_do_recovery(ipif_t *ipif) if (mp == NULL) { mutex_enter(&ill->ill_lock); if (ipif->ipif_recovery_id == 0 && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED))) { + !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } else { + /* + * A recovery timer may still be running if we got here from + * ill_restart_dad(); cancel that timer. + */ + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, sizeof (ipif->ipif_v6lcl_addr)); ill_refhold(ill); @@ -1558,41 +1593,51 @@ ndp_do_recovery(ipif_t *ipif) } /* - * Find the solicitation in the given message, and extract printable details - * (MAC and IP addresses) from it. + * Find the MAC and IP addresses in an NA/NS message. 
*/ -static nd_neighbor_solicit_t * -ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, - size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) +static void +ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp, + uchar_t **haddr, uint_t *haddrlenp) { - nd_neighbor_solicit_t *ns; - ip6_t *ip6h; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); + nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; + nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; uchar_t *addr; - int alen; + int alen = 0; - alen = 0; - ip6h = (ip6_t *)mp->b_rptr; if (dl_mp == NULL) { nd_opt_hdr_t *opt; - int nslen; + int len; /* * If it's from the fast-path, then it can't be a probe - * message, and thus must include the source linkaddr option. + * message, and thus must include a linkaddr option. * Extract that here. */ - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - nslen = mp->b_wptr - (uchar_t *)ns; - if ((nslen -= sizeof (*ns)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, - ND_OPT_SOURCE_LINKADDR); - if (opt != NULL && - opt->nd_opt_len * 8 - sizeof (*opt) >= - ill->ill_nd_lla_len) { - addr = (uchar_t *)(opt + 1); - alen = ill->ill_nd_lla_len; + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_SOLICIT: + len = mp->b_wptr - (uchar_t *)ns; + if ((len -= sizeof (*ns)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), + len, ND_OPT_SOURCE_LINKADDR); } + break; + case ND_NEIGHBOR_ADVERT: + len = mp->b_wptr - (uchar_t *)na; + if ((len -= sizeof (*na)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(na + 1), + len, ND_OPT_TARGET_LINKADDR); + } + break; + } + + if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= + ill->ill_nd_lla_len) { + addr = (uchar_t *)(opt + 1); + alen = ill->ill_nd_lla_len; } + /* * We cheat a bit here for the sake of printing usable log * messages in the rare case where the reply we got was unicast @@ -1624,16 +1669,17 @@ ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, } } } + if (alen > 0) { *haddr = addr; - (void) mac_colon_addr(addr, alen, hbuf, hlen); + *haddrlenp = alen; } else { *haddr = NULL; - (void) strcpy(hbuf, "?"); + *haddrlenp = 0; } - ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); - (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); - return (ns); + + /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ + *targp = ns->nd_ns_target; } /* @@ -1646,68 +1692,80 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ - char hbuf[MAC_STR_LEN]; - char sbuf[INET6_ADDRSTRLEN]; - nd_neighbor_solicit_t *ns; - mblk_t *dl_mp = NULL; - uchar_t *haddr; + mblk_t *dl_mp = NULL; + uchar_t *haddr; + uint_t haddrlen; ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t targ; if (DB_TYPE(mp) != M_DATA) { dl_mp = mp; mp = mp->b_cont; } - ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, - sizeof (sbuf), &haddr); - if (haddr != NULL && - bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); + if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { /* - * Ignore conflicts generated by misbehaving switches that just - * reflect our own messages back to us. + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. 
For IPMP, we may + * see reflections across any ill in the illgrp. */ - goto ignore_conflict; + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL) + goto ignore_conflict; } - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL, + NULL, ipst); + if (ipif == NULL) + goto ignore_conflict; - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &ns->nd_ns_target)) { - continue; - } + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; - /* If it's already marked, then don't do anything. */ - if (ipif->ipif_flags & IPIF_DUPLICATE) - continue; + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } - /* - * If this is a failure during duplicate recovery, then don't - * complain. It may take a long time to recover. - */ - if (!ipif->ipif_was_dup) { - ipif_get_name(ipif, ibuf, sizeof (ibuf)); - cmn_err(CE_WARN, "%s has duplicate address %s (in " - "use by %s); disabled", ibuf, sbuf, hbuf); - } - mutex_enter(&ill->ill_lock); - ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); - ipif->ipif_flags |= IPIF_DUPLICATE; - ill->ill_ipif_dup_count++; - mutex_exit(&ill->ill_lock); - (void) ipif_down(ipif, NULL, NULL); - ipif_down_tail(ipif); - mutex_enter(&ill->ill_lock); - if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & (IPIF_MOVING | - IPIF_CONDEMNED)) && - ipst->ips_ip_dup_recovery > 0) { - ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - } - mutex_exit(&ill->ill_lock); + /* + * If this is a failure during duplicate recovery, then don't + * complain. It may take a long time to recover. + */ + if (!ipif->ipif_was_dup) { + char ibuf[LIFNAMSIZ]; + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); ignore_conflict: if (dl_mp != NULL) freeb(dl_mp); @@ -1721,7 +1779,7 @@ ignore_conflict: * we start a timer on the ipif. 
*/ static void -ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { if ((mp = copymsg(mp)) != NULL) { if (dl_mp == NULL) @@ -1736,7 +1794,6 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) B_FALSE); } } - ndp_delete(nce); } /* @@ -1757,6 +1814,7 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) NULL, NULL, ipst); if (ipif == NULL) return; + /* * First, figure out if this address is disposable. */ @@ -1786,19 +1844,21 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) * sending out an unsolicited Neighbor Advertisement. */ if (defs >= maxdefense) { - ip_ndp_failure(ill, mp, dl_mp, nce); + ip_ndp_failure(ill, mp, dl_mp); } else { char hbuf[MAC_STR_LEN]; char sbuf[INET6_ADDRSTRLEN]; uchar_t *haddr; + uint_t haddrlen; + in6_addr_t targ; - (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, - sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); + ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); cmn_err(CE_WARN, "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, - &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)), + inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), + ill->ill_name); + + (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0); } } @@ -1843,6 +1903,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) bad_solicit = B_TRUE; goto done; } + } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ @@ -1859,7 +1920,13 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } } - our_nce = ndp_lookup_v6(ill, &target, B_FALSE); + /* + * NOTE: with IPMP, it's possible the nominated multicast ill (which + * received this packet if it's multicast) is not the ill tied to + * e.g. the IPMP ill's data link-local. So we match across the illgrp + * to ensure we find the associated NCE. + */ + our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE); /* * If this is a valid Solicitation, a permanent * entry should exist in the cache @@ -1883,7 +1950,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { - ip1dbg(("ndp_input_advert: bad SLLA\n")); + ip1dbg(("ndp_input_solicit: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } @@ -1934,6 +2001,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) goto no_source; err = ndp_lookup_then_add_v6(ill, + B_FALSE, haddr, &src, /* Soliciting nodes address */ &ipv6_all_ones, @@ -1949,8 +2017,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) break; case EEXIST: /* - * B_FALSE indicates this is not an - * an advertisement. + * B_FALSE indicates this is not an advertisement. */ ndp_process(nnce, haddr, 0, B_FALSE); NCE_REFRELE(nnce); @@ -1985,7 +2052,7 @@ no_source: * If someone else is probing our address, then * we've crossed wires. Declare failure. 
*/ - ip_ndp_failure(ill, mp, dl_mp, our_nce); + ip_ndp_failure(ill, mp, dl_mp); } goto done; } @@ -1995,15 +2062,8 @@ no_source: */ src = ipv6_all_hosts_mcast; } - flag |= nce_advert_flags(our_nce); /* Response to a solicitation */ - (void) nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for extracting ill_nd_lla */ - B_TRUE, /* use ill_nd_lla */ - &target, /* Source and target of the advertisement pkt */ - &src, /* IP Destination (source of original pkt) */ - flag); + (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); @@ -2023,8 +2083,8 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; - mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip_stack_t *ipst = ill->ill_ipst; + mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); @@ -2067,66 +2127,62 @@ } /* - * If this interface is part of the group look at all the + * NOTE: we match across the illgrp since we need to do DAD for all of + * our local addresses, and those are spread across all the active * ills in the group. */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (ill->ill_group != NULL) - ill = ill->ill_group->illgrp_ill; + if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL) return; - for (; ill != NULL; ill = ill->ill_group_next) { - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - continue; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - dst_nce = ndp_lookup_v6(ill, &target, B_FALSE); - /* We have to drop the lock since ndp_process calls put* */ - rw_exit(&ipst->ips_ill_g_lock); - if (dst_nce != NULL) { - if ((dst_nce->nce_flags & NCE_F_PERMANENT) && - dst_nce->nce_state == ND_PROBE) { - /* - * Someone else sent an advertisement for an - * address that we're trying to configure. - * Tear it down. Note that dl_mp might be NULL - * if we're getting a unicast reply. This - * isn't typically done (multicast is the norm - * in response to a probe), but ip_ndp_failure - * will handle the dl_mp == NULL case as well. - */ - ip_ndp_failure(ill, mp, dl_mp, dst_nce); - } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { - /* - * Someone just announced one of our local - * addresses. If it wasn't us, then this is a - * conflict. Defend the address or shut it - * down. - */ - if (dl_mp != NULL && - (haddr == NULL || - nce_cmp_ll_addr(dst_nce, haddr, - ill->ill_nd_lla_len))) { - ip_ndp_conflict(ill, mp, dl_mp, - dst_nce); - } - } else { - if (na->nd_na_flags_reserved & - ND_NA_FLAG_ROUTER) { - dst_nce->nce_flags |= NCE_F_ISROUTER; + if (dst_nce->nce_flags & NCE_F_PERMANENT) { + /* + * Someone just advertised one of our local addresses. First, + * check if it was us -- if so, we can safely ignore it. + */ + if (haddr != NULL) { + if (!nce_cmp_ll_addr(dst_nce, haddr, hlen)) + goto out; /* from us -- no conflict */ + + /* + * If we're in an IPMP group, check if this is an echo + * from another ill in the group. Use the double- + * checked locking pattern to avoid grabbing + * ill_g_lock in the non-IPMP case. 
+ */ + if (IS_UNDER_IPMP(ill)) { + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( + ill->ill_grp, haddr, hlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto out; } - /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, - na->nd_na_flags_reserved, B_TRUE); + rw_exit(&ipst->ips_ill_g_lock); } - NCE_REFRELE(dst_nce); } - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_refrele(ill); + + /* + * This appears to be a real conflict. If we're trying to + * configure this NCE (ND_PROBE), then shut it down. + * Otherwise, handle the discovered conflict. + * + * Note that dl_mp might be NULL if we're getting a unicast + * reply. This isn't typically done (multicast is the norm in + * response to a probe), but we can handle the dl_mp == NULL + * case as well. + */ + if (dst_nce->nce_state == ND_PROBE) + ip_ndp_failure(ill, mp, dl_mp); + else + ip_ndp_conflict(ill, mp, dl_mp, dst_nce); + } else { + if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) + dst_nce->nce_flags |= NCE_F_ISROUTER; + + /* B_TRUE indicates this an advertisement */ + ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); } - rw_exit(&ipst->ips_ill_g_lock); +out: + NCE_REFRELE(dst_nce); } /* @@ -2194,6 +2250,40 @@ done: } /* + * Utility routine to send an advertisement. Assumes that the NCE cannot + * go away (e.g., because it's refheld). + */ +static boolean_t +nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target, + uint_t flags) +{ + ASSERT((flags & NDP_PROBE) == 0); + + if (nce->nce_flags & NCE_F_ISROUTER) + flags |= NDP_ISROUTER; + if (!(nce->nce_flags & NCE_F_ANYCAST)) + flags |= NDP_ORIDE; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla, + &nce->nce_addr, target, flags)); +} + +/* + * Utility routine to send a solicitation. Assumes that the NCE cannot + * go away (e.g., because it's refheld). + */ +static boolean_t +nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, + uint_t flags) +{ + if (flags & NDP_PROBE) + sender = &ipv6_all_zeros; + + return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla, + sender, &nce->nce_addr, flags)); +} + +/* * nce_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * @@ -2207,88 +2297,79 @@ done: * corresponding ill's ill_wq otherwise returns B_TRUE. */ static boolean_t -nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, - boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target, - int flag) +nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, + const in6_addr_t *sender, const in6_addr_t *target, int flag) { + ill_t *hwaddr_ill; uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; - uint_t plen; + uint_t plen, maxplen; ip6i_t *ip6i; ipif_t *src_ipif = NULL; uint8_t *hw_addr; zoneid_t zoneid = GLOBAL_ZONEID; + char buf[INET6_ADDRSTRLEN]; + + ASSERT(!IS_IPMP(ill)); /* - * If we have a unspecified source(sender) address, select a - * proper source address for the solicitation here itself so - * that we can initialize the h/w address correctly. This is - * needed for interface groups as source address can come from - * the whole group and the h/w address initialized from ill will - * be wrong if the source address comes from a different ill. - * - * If the sender is specified then we use this address in order - * to lookup the zoneid before calling ip_output_v6(). 
This is to - * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly - * by IP (we cannot guarantee that the global zone has an interface - * route to the destination). - * - * Note that the NA never comes here with the unspecified source - * address. The following asserts that whenever the source - * address is specified, the haddr also should be specified. + * Check that the sender is actually a usable address on `ill', and if + * so, track that as the src_ipif. If not, for solicitations, set the + * sender to :: so that a new one will be picked below; for adverts, + * drop the packet since we expect nce_xmit_advert() to always provide + * a valid sender. */ - ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); + if (!IN6_IS_ADDR_UNSPECIFIED(sender)) { + if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL || + !src_ipif->ipif_addr_ready) { + if (src_ipif != NULL) { + ipif_refrele(src_ipif); + src_ipif = NULL; + } + if (type == ND_NEIGHBOR_ADVERT) { + ip1dbg(("nce_xmit: No source ipif for src %s\n", + inet_ntop(AF_INET6, sender, buf, + sizeof (buf)))); + return (B_TRUE); + } + sender = &ipv6_all_zeros; + } + } + /* + * If we still have an unspecified source (sender) address and this + * isn't a probe, select a source address from `ill'. + */ if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { - ASSERT(operation != ND_NEIGHBOR_ADVERT); + ASSERT(type != ND_NEIGHBOR_ADVERT); /* - * Pick a source address for this solicitation, but - * restrict the selection to addresses assigned to the - * output interface (or interface group). We do this - * because the destination will create a neighbor cache - * entry for the source address of this packet, so the - * source address had better be a valid neighbor. + * Pick a source address for this solicitation, but restrict + * the selection to addresses assigned to the output + * interface. We do this because the destination will create + * a neighbor cache entry for the source address of this + * packet, so the source address needs to be a valid neighbor. */ - src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL, + src_ipif = ipif_select_source_v6(ill, target, B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (src_ipif == NULL) { - char buf[INET6_ADDRSTRLEN]; - ip1dbg(("nce_xmit: No source ipif for dst %s\n", - inet_ntop(AF_INET6, (char *)target, buf, - sizeof (buf)))); + inet_ntop(AF_INET6, target, buf, sizeof (buf)))); return (B_TRUE); } sender = &src_ipif->ipif_v6src_addr; - hwaddr_ill = src_ipif->ipif_ill; - } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { - zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst); - /* - * It's possible for ipif_lookup_addr_zoneid_v6() to return - * ALL_ZONES if it cannot find a matching ipif for the address - * we are trying to use. In this case we err on the side of - * trying to send the packet by defaulting to the GLOBAL_ZONEID. - */ - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; } /* - * Always make sure that the NS/NA packets don't get load - * spread. This is needed so that the probe packets sent - * by the in.mpathd daemon can really go out on the desired - * interface. Probe packets are made to go out on a desired - * interface by including a ip6i with ATTACH_IF flag. As these - * packets indirectly end up sending/receiving NS/NA packets - * (neighbor doing NUD), we have to make sure that NA - * also go out on the same interface. + * We're either sending a probe or we have a source address. 
*/ - plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8; + ASSERT((flag & NDP_PROBE) || src_ipif != NULL); + + maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8); len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + - plen * 8; + maxplen; mp = allocb(len, BPRI_LO); if (mp == NULL) { if (src_ipif != NULL) @@ -2301,28 +2382,27 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; + ip6i->ip6i_flags = IP6I_HOPLIMIT; if (flag & NDP_PROBE) ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; + ip6h->ip6_src = *sender; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); - if (operation == ND_NEIGHBOR_SOLICIT) { + if (type == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; if (!(flag & NDP_PROBE)) opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; - ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ @@ -2335,7 +2415,6 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ASSERT(!(flag & NDP_PROBE)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; - ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; @@ -2347,22 +2426,48 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, hw_addr = NULL; if (!(flag & NDP_PROBE)) { + /* + * Use our source address to find the hardware address to put + * in the packet, so that the hardware address and IP address + * will match up -- even if that hardware address doesn't + * match the ill we actually transmit the packet through. + */ + if (IS_IPMP(src_ipif->ipif_ill)) { + hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif); + if (hwaddr_ill == NULL) { + ip1dbg(("nce_xmit: no bound ill!\n")); + ipif_refrele(src_ipif); + freemsg(mp); + return (B_TRUE); + } + } else { + hwaddr_ill = src_ipif->ipif_ill; + ill_refhold(hwaddr_ill); /* for symmetry */ + } + + plen = roundup(sizeof (nd_opt_hdr_t) + + hwaddr_ill->ill_nd_lla_len, 8); + hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr; if (hw_addr != NULL) { /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)plen; + opt->nd_opt_len = (uint8_t)(plen / 8); bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); } + + ill_refrele(hwaddr_ill); } - if (hw_addr == NULL) { - /* If there's no link layer address option, then strip it. 
*/ - len -= plen * 8; - mp->b_wptr = mp->b_rptr + len; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); - } - icmp6->icmp6_type = (uint8_t)operation; + if (hw_addr == NULL) + plen = 0; + + /* Fix up the length of the packet now that plen is known */ + len -= (maxplen - plen); + mp->b_wptr = mp->b_rptr + len; + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); + + icmp6->icmp6_type = type; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp @@ -2370,8 +2475,17 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, */ icmp6->icmp6_cksum = ip6h->ip6_plen; - if (src_ipif != NULL) + /* + * Before we toss the src_ipif, look up the zoneid to pass to + * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT + * packets to be routed correctly by IP (we cannot guarantee that the + * global zone has an interface route to the destination). + */ + if (src_ipif != NULL) { + if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES) + zoneid = GLOBAL_ZONEID; ipif_refrele(src_ipif); + } ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); return (B_FALSE); @@ -2448,7 +2562,6 @@ ndp_timer(void *arg) ill_t *ill = nce->nce_ill; uint32_t ms; char addrbuf[INET6_ADDRSTRLEN]; - mblk_t *mp; boolean_t dropped = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; @@ -2460,11 +2573,6 @@ ndp_timer(void *arg) */ ASSERT(nce != NULL); - /* - * Grab the ill_g_lock now itself to avoid lock order problems. - * nce_solicit needs ill_g_lock to be able to traverse ills - */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); NCE_REFHOLD_LOCKED(nce); nce->nce_timeout_id = 0; @@ -2474,11 +2582,10 @@ ndp_timer(void *arg) */ switch (nce->nce_state) { case ND_DELAY: - rw_exit(&ipst->ips_ill_g_lock); nce->nce_state = ND_PROBE; mutex_exit(&nce->nce_lock); - (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, - &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST); + (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros, + NDP_UNICAST); if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ndp_timer: state for %s changed " @@ -2489,7 +2596,6 @@ ndp_timer(void *arg) return; case ND_PROBE: /* must be retransmit timer */ - rw_exit(&ipst->ips_ill_g_lock); nce->nce_pcnt--; ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && nce->nce_pcnt >= -1); @@ -2504,8 +2610,8 @@ ndp_timer(void *arg) nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, - B_FALSE, &ipv6_all_zeros, &nce->nce_addr, + dropped = nce_xmit_solicit(nce, B_FALSE, + &ipv6_all_zeros, (nce->nce_flags & NCE_F_PERMANENT) ? 
NDP_PROBE : NDP_UNICAST); if (dropped) { @@ -2542,8 +2648,8 @@ ndp_timer(void *arg) */ nce->nce_state = ND_REACHABLE; mutex_exit(&nce->nce_lock); - ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, - ALL_ZONES, NULL, NULL, NULL, NULL, ipst); + ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr, + nce->nce_ill); if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; @@ -2566,9 +2672,8 @@ ndp_timer(void *arg) } /* Begin defending our new address */ nce->nce_unsolicit_count = 0; - dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, - B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { nce->nce_unsolicit_count = 1; NDP_RESTART_TIMER(nce, @@ -2589,51 +2694,40 @@ ndp_timer(void *arg) } NCE_REFRELE(nce); return; - case ND_INCOMPLETE: + case ND_INCOMPLETE: { + ip6_t *ip6h; + ip6i_t *ip6i; + mblk_t *mp, *datamp, *nextmp, **prevmpp; + /* - * Must be resolvers retransmit timer. + * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp + * for any IPMP probe packets, and toss 'em. IPMP probe + * packets will always be at the head of nce_qd_mp and always + * have an ip6i_t header, so we can stop at the first queued + * ND packet without an ip6i_t. */ - for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) { - ip6i_t *ip6i; - ip6_t *ip6h; - mblk_t *data_mp; - - /* - * Walk the list of packets queued, and see if there - * are any multipathing probe packets. Such packets - * are always queued at the head. Since this is a - * retransmit timer firing, mark such packets as - * delayed in ND resolution. This info will be used - * in ip_wput_v6(). Multipathing probe packets will - * always have an ip6i_t. Once we hit a packet without - * it, we can break out of this loop. - */ - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - - ip6h = (ip6_t *)data_mp->b_rptr; + prevmpp = &nce->nce_qd_mp; + for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp; + ip6h = (ip6_t *)datamp->b_rptr; if (ip6h->ip6_nxt != IPPROTO_RAW) break; - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because the - * b_next/b_prev is non-NULL. - */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); - - /* Mark this packet as delayed due to ND resolution */ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) - ip6i->ip6i_flags |= IP6I_ND_DELAYED; + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) { + inet_freemsg(mp); + *prevmpp = nextmp; + } else { + prevmpp = &mp->b_next; + } } + + /* + * Must be resolver's retransmit timer. 
+ */ if (nce->nce_qd_mp != NULL) { - ms = nce_solicit(nce, NULL); - rw_exit(&ipst->ips_ill_g_lock); - if (ms == 0) { + if ((ms = nce_solicit(nce, NULL)) == 0) { if (nce->nce_state != ND_REACHABLE) { mutex_exit(&nce->nce_lock); nce_resolv_failed(nce); @@ -2649,11 +2743,10 @@ ndp_timer(void *arg) return; } mutex_exit(&nce->nce_lock); - rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); break; - case ND_REACHABLE : - rw_exit(&ipst->ips_ill_g_lock); + } + case ND_REACHABLE: if (((nce->nce_flags & NCE_F_UNSOL_ADV) && nce->nce_unsolicit_count != 0) || ((nce->nce_flags & NCE_F_PERMANENT) && @@ -2661,13 +2754,8 @@ ndp_timer(void *arg) if (nce->nce_unsolicit_count > 0) nce->nce_unsolicit_count--; mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, - ND_NEIGHBOR_ADVERT, - ill, /* ill to be used for hw addr */ - B_FALSE, /* use ill_phys_addr */ - &nce->nce_addr, - &ipv6_all_hosts_mcast, - nce_advert_flags(nce)); + dropped = nce_xmit_advert(nce, B_FALSE, + &ipv6_all_hosts_mcast, 0); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_unsolicit_count++; @@ -2686,7 +2774,6 @@ ndp_timer(void *arg) NCE_REFRELE(nce); break; default: - rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&nce->nce_lock); NCE_REFRELE(nce); break; @@ -2819,23 +2906,20 @@ void nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; - mblk_t **mpp; + mblk_t **mpp, *tmp; ASSERT(MUTEX_HELD(&nce->nce_lock)); - for (mpp = &nce->nce_qd_mp; *mpp != NULL; - mpp = &(*mpp)->b_next) { - if (++count > - nce->nce_ill->ill_max_buf) { - mblk_t *tmp = nce->nce_qd_mp->b_next; - + for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { + if (++count > nce->nce_ill->ill_max_buf) { + tmp = nce->nce_qd_mp->b_next; nce->nce_qd_mp->b_next = NULL; nce->nce_qd_mp->b_prev = NULL; freemsg(nce->nce_qd_mp); nce->nce_qd_mp = tmp; } } - /* put this on the list */ + if (head_insert) { mp->b_next = nce->nce_qd_mp; nce->nce_qd_mp = mp; @@ -2849,8 +2933,8 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) { boolean_t head_insert = B_FALSE; ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *data_mp; + ip6i_t *ip6i; + mblk_t *data_mp; ASSERT(MUTEX_HELD(&nce->nce_lock)); @@ -2867,43 +2951,28 @@ nce_queue_mp(nce_t *nce, mblk_t *mp) * non-NULL. */ ip6i = (ip6i_t *)ip6h; - ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); + ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); + /* - * Multipathing probe packets have IP6I_DROP_IFDELAYED set. - * This has 2 aspects mentioned below. - * 1. Perform head insertion in the nce_qd_mp for these packets. - * This ensures that next retransmit of ND solicitation - * will use the interface specified by the probe packet, - * for both NS and NA. This corresponds to the src address - * in the IPv6 packet. If we insert at tail, we will be - * depending on the packet at the head for successful - * ND resolution. This is not reliable, because the interface - * on which the NA arrives could be different from the interface - * on which the NS was sent, and if the receiving interface is - * failed, it will appear that the sending interface is also - * failed, causing in.mpathd to misdiagnose this as link - * failure. - * 2. Drop the original packet, if the ND resolution did not - * succeed in the first attempt. However we will create the - * nce and the ire, as soon as the ND resolution succeeds. - * We don't gain anything by queueing multiple probe packets - * and sending them back-to-back once resolution succeeds. - * It is sufficient to send just 1 packet after ND resolution - * succeeds. 
Since mpathd is sending down probe packets at a - * constant rate, we don't need to send the queued packet. We - * need to queue it only for NDP resolution. The benefit of - * dropping the probe packets that were delayed in ND - * resolution, is that in.mpathd will not see inflated - * RTT. If the ND resolution does not succeed within - * in.mpathd's failure detection time, mpathd may detect - * a failure, and it does not matter whether the packet - * was queued or dropped. + * If this packet is marked IP6I_IPMP_PROBE, then we need to: + * + * 1. Insert it at the head of the nce_qd_mp list. Consider + * the normal (non-probe) load-speading case where the + * source address of the ND packet is not tied to nce_ill. + * If the ill bound to the source address cannot receive, + * the response to the ND packet will not be received. + * However, if ND packets for nce_ill's probes are queued + * behind that ND packet, those probes will also fail to + * be sent, and thus in.mpathd will erroneously conclude + * that nce_ill has also failed. + * + * 2. Drop the probe packet in ndp_timer() if the ND did + * not succeed on the first attempt. This ensures that + * ND problems do not manifest as probe RTT spikes. */ - if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) + if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) head_insert = B_TRUE; } - nce_queue_mp_common(nce, mp, head_insert); } @@ -2988,13 +3057,17 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) (lnr->lnr_state_create != ND_STALE)) return (EINVAL); + if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN) + return (EINVAL); + sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* We know it can not be mapping so just look in the hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, addr, nce); + /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ + nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce); if (nce != NULL) new_flags = nce->nce_flags; @@ -3065,7 +3138,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) * the link layer address passed in to determine the state * much like incoming packets. */ - ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); + nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); NCE_REFRELE(nce); return (0); } @@ -3463,7 +3536,11 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, mutex_enter(&ipst->ips_ndp4->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - nce = nce_lookup_addr(ill, &addr6, nce); + /* + * NOTE: IPv4 never matches across the illgrp since the NCE's we're + * looking up have fastpath headers that are inherently per-ill. + */ + nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); if (nce == NULL) { err = ndp_add_v4(ill, addr, flags, newnce, src_nce); } else { @@ -3718,3 +3795,26 @@ ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce != NULL); } + +/* + * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly + * with IPMP. Specifically, since neighbor discovery is always done on + * underlying interfaces (even for addresses owned by an IPMP interface), we + * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface + * associated with `ill' (if it exists). 
+ */ +static ipif_t * +ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill) +{ + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + if (ipif == NULL && IS_UNDER_IPMP(ill)) { + if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); + ill_refrele(ill); + } + } + return (ipif); +} diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index 53665593be..e81c7a0e1f 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -568,33 +568,17 @@ ip_getifname_impl(phy_if_t phy_ifdata, char *buffer, const size_t buflen, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - char *name; ASSERT(buffer != NULL); ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst); - if (ill != NULL) { - name = ill->ill_name; - } else { - /* Fallback to group names only if hook_emulation is set */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)phy_ifdata, - isv6, ipst); - } - if (ill == NULL) - return (1); - name = ill->ill_phyint->phyint_groupname; - } - if (name != NULL) { - (void) strlcpy(buffer, name, buflen); - ill_refrele(ill); - return (0); - } else { - ill_refrele(ill); + if (ill == NULL) return (1); - } + (void) strlcpy(buffer, ill->ill_name, buflen); + ill_refrele(ill); + return (0); } /* @@ -625,9 +609,6 @@ ipv6_getmtu(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to determine the MTU of a network interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -653,16 +634,7 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, NULL, NULL, ipst)) == NULL) { - /* - * Fallback to group names only if hook_emulation - * is set - */ - if (ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex( - (uint_t)phy_ifdata, isv6, ipst); - } - if (ill == NULL) - return (0); + return (0); } mtu = ill->ill_max_frag; ill_refrele(ill); @@ -686,9 +658,6 @@ ip_getpmtuenabled(net_handle_t neti) /* * Get next interface from the current list of IPv4 physical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. - * But IP Filter does not use this function. */ static phy_if_t ip_phygetnext(net_handle_t neti, phy_if_t phy_ifdata) @@ -752,15 +721,10 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst) ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_name((char *)name, isv6, ipst); - } if (ill == NULL) return (0); - phy = ill->ill_phyint->phyint_hook_ifindex; + phy = ill->ill_phyint->phyint_ifindex; ill_refrele(ill); @@ -798,9 +762,6 @@ ipv6_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) /* * Shared implementation to get next interface from the current list of * logical network interfaces - * - * Note: this does not handle the case when ipmp_hook_emulation is set. 
- * But IP Filter does not use this function. */ static lif_if_t ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, @@ -834,7 +795,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, /* * It's safe to iterate the ill_ipif list when holding an ill_lock. * And it's also safe to access ipif_id without ipif refhold. - * See ipif_get_id(). + * See the field access rules in ip.h. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!IPIF_CAN_LOOKUP(ipif)) @@ -1013,8 +974,8 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, if (ire->ire_nce == NULL || ire->ire_nce->nce_fp_mp == NULL && ire->ire_nce->nce_res_mp == NULL) { - ip_newroute_v6(ire->ire_stq, mp, - &sin6->sin6_addr, NULL, NULL, ALL_ZONES, ipst); + ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr, + &ip6h->ip6_src, NULL, ALL_ZONES, ipst); ire_refrele(ire); return (0); @@ -1170,7 +1131,7 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop, } ASSERT(ill != NULL); - phy_if = (phy_if_t)ill->ill_phyint->phyint_hook_ifindex; + phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex; if (sire != NULL) ire_refrele(sire); ire_refrele(ire); @@ -1305,9 +1266,6 @@ ipv6_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, /* * Shared implementation to determine the network addresses for an interface - * - * Note: this does not handle a non-zero ifdata when ipmp_hook_emulation is set. - * But IP Filter only uses a zero ifdata. */ /* ARGSUSED */ static int @@ -1531,12 +1489,6 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out) ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, B_FALSE, NULL, NULL, NULL, NULL, ipst); - - /* Fallback to group names only if hook_emulation is set */ - if (ill == NULL && ipst->ips_ipmp_hook_emulation) { - ill = ill_group_lookup_on_ifindex((uint_t)packet->ni_physical, - B_FALSE, ipst); - } if (ill == NULL) { kmem_free(inject, sizeof (*inject)); return; @@ -1613,65 +1565,3 @@ done: kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen); kmem_free(arg, sizeof (hook_nic_event_int_t)); } - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the ifindex assigned to the group. - * Skips unusable ones i.e. where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group_ifindex(index, ipst); - if (phyi != NULL) { - ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} - -/* - * Temporary function to support IPMP emulation for IP Filter. - * Lookup an ill based on the group name. - * Skips unusable ones i.e. where any of these flags are set: - * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE) - */ -ill_t * -ill_group_lookup_on_name(char *name, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - phyint_t *phyi; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - phyi = phyint_lookup_group(name, B_TRUE, ipst); - if (phyi != NULL) { - ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ill); - } - mutex_exit(&ill->ill_lock); - } - } - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c index bb6e98a99e..1c91ea667f 100644 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ b/usr/src/uts/common/inet/ip/ip_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -119,9 +119,6 @@ opdes_t ip_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, @@ -199,12 +196,6 @@ opdes_t ip_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 3324d1d833..77ab2cc220 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,34 +93,52 @@ static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); /* - * Send the ack to all the routing queues. In case of the originating queue, - * send it only if the loopback is set. - * - * Messages are sent upstream only on routing sockets that did not specify an - * address family when they were created or when the address family matches the - * one specified by the caller. + * Send `mp' to all eligible routing queues. A queue is ineligible if: * + * 1. SO_USELOOPBACK is off and it is not the originating queue. + * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. + * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. */ void -rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) +rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, + ip_stack_t *ipst) { mblk_t *mp1; conn_t *connp, *next_connp; + /* + * Since we don't have an ill_t here, RTSQ_DEFAULT must already be + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + */ + ASSERT(!(flags & RTSQ_DEFAULT)); + mutex_enter(&ipst->ips_rts_clients->connf_lock); connp = ipst->ips_rts_clients->connf_head; - while (connp != NULL) { + for (; connp != NULL; connp = next_connp) { + next_connp = connp->conn_next; + /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to * copy, then continue. 
*/ if ((connp->conn_proto != AF_UNSPEC) && - (connp->conn_proto != af)) { - connp = connp->conn_next; + (connp->conn_proto != af)) continue; + + /* + * Queue the message only if the conn_t and flags match. + */ + if (connp->conn_rtaware & RTAW_UNDER_IPMP) { + if (!(flags & RTSQ_UNDER_IPMP)) + continue; + } else { + if (!(flags & RTSQ_NORMAL)) + continue; } + /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing @@ -128,8 +146,8 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) * message. */ if ((o_connp == connp) && connp->conn_loopback == 0) { - connp = connp->conn_next; - continue; + connp = connp->conn_next; + continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); @@ -145,10 +163,9 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) } mutex_enter(&ipst->ips_rts_clients->connf_lock); - /* Follow the next pointer before releasing the conn. */ + /* reload next_connp since conn_next may have changed */ next_connp = connp->conn_next; CONN_DEC_REF(connp); - connp = next_connp; } mutex_exit(&ipst->ips_rts_clients->connf_lock); freemsg(mp); @@ -209,7 +226,7 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } /* ARGSUSED */ @@ -430,7 +447,7 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) if (index != 0) { ill_t *ill; - +lookup: /* * IPC must be refheld somewhere in ip_wput_nondata or * ip_wput_ioctl etc... and cleaned up if ioctl is killed. @@ -445,16 +462,33 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) goto done; } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); /* - * If this is replacement ipif, prevent a route from - * being added. + * Since all interfaces in an IPMP group must be equivalent, + * we prevent changes to a specific underlying interface's + * routing configuration. However, for backward compatibility, + * we intepret a request to add a route on an underlying + * interface as a request to add a route on its IPMP interface. */ - if (ipif != NULL && ipif->ipif_replace_zero) { - error = ENETDOWN; - goto done; + if (IS_UNDER_IPMP(ill)) { + switch (rtm->rtm_type) { + case RTM_CHANGE: + case RTM_DELETE: + ill_refrele(ill); + error = EINVAL; + goto done; + case RTM_ADD: + index = ipmp_ill_get_ipmp_ifindex(ill); + ill_refrele(ill); + if (index == 0) { + error = EINVAL; + goto done; + } + goto lookup; + } } + + ipif = ipif_get_next_ipif(NULL, ill); + ill_refrele(ill); match_flags |= MATCH_IRE_ILL; } @@ -1037,7 +1071,7 @@ done: /* OK ACK already set up by caller except this */ ip2dbg(("ip_rts_request: OK ACK\n")); } - rts_queue_input(mp, connp, af, ipst); + rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } iocp->ioc_error = error; @@ -1724,7 +1758,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, rtm->rtm_errno = error; rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, AF_INET, ipst); + rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst); } /* @@ -1733,7 +1767,13 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, * Message type generated RTM_IFINFO. 
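The upshot of the conn_rtaware filtering in rts_queue_input() is that routing sockets which have not declared themselves IPMP-aware never see messages flagged only RTSQ_UNDER_IPMP. A userland consumer that does want per-underlying-interface routing messages therefore has to set RTAW_UNDER_IPMP on its routing socket. A minimal sketch follows; only conn_rtaware and RTAW_UNDER_IPMP appear in the code above, so the RT_AWARE option name and SOL_ROUTE level used here are assumptions.

#include <sys/socket.h>
#include <net/route.h>
#include <stdio.h>

int
main(void)
{
	int s;
	int aware = RTAW_UNDER_IPMP;	/* matches the conn_rtaware check above */

	if ((s = socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC)) == -1)
		return (1);

	/* Assumed option name: mark this routing socket as IPMP-aware. */
	if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1)
		perror("setsockopt(RT_AWARE)");

	/* ... read if_msghdr_t / ifa_msghdr_t messages from s as usual ... */
	return (0);
}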
*/ void -ip_rts_ifmsg(const ipif_t *ipif) +ip_rts_ifmsg(const ipif_t *ipif, uint_t flags) +{ + ip_rts_xifmsg(ipif, 0, 0, flags); +} + +void +ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) { if_msghdr_t *ifm; mblk_t *mp; @@ -1741,12 +1781,12 @@ ip_rts_ifmsg(const ipif_t *ipif) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only - * when the physical device is changing - * state. + * This message should be generated only when the physical interface + * is changing state. */ if (ipif->ipif_id != 0) return; + if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1765,11 +1805,22 @@ ip_rts_ifmsg(const ipif_t *ipif) } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; - ifm->ifm_flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | - ipif->ipif_ill->ill_phyint->phyint_flags; + ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags | + ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear; rts_getifdata(&ifm->ifm_data, ipif); ifm->ifm_addrs = RTA_IFP; - rts_queue_input(mp, NULL, af, ipst); + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + + rts_queue_input(mp, NULL, af, flags, ipst); } /* @@ -1778,7 +1829,7 @@ ip_rts_ifmsg(const ipif_t *ipif) * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>. */ void -ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) +ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) { int pass; int ncmd; @@ -1793,6 +1844,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) af = AF_INET6; else af = AF_INET; + + if (flags & RTSQ_DEFAULT) { + flags = RTSQ_ALL; + /* + * If this message is for an underlying interface, prevent + * "normal" (IPMP-unaware) routing sockets from seeing it. + */ + if (IS_UNDER_IPMP(ipif->ipif_ill)) + flags &= ~RTSQ_NORMAL; + } + /* * If the request is DELETE, send RTM_DELETE and RTM_DELADDR. * if the request is ADD, send RTM_NEWADDR and RTM_ADD. @@ -1827,7 +1889,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) ifam->ifam_metric = ipif->ipif_metric; ifam->ifam_flags = ((cmd == RTM_ADD) ? RTF_UP : 0); ifam->ifam_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } if ((cmd == RTM_ADD && pass == 2) || (cmd == RTM_DELETE && pass == 1)) { @@ -1857,7 +1919,7 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif) if (error == 0) rtm->rtm_flags |= RTF_DONE; rtm->rtm_addrs = rtm_addrs; - rts_queue_input(mp, NULL, af, ipst); + rts_queue_input(mp, NULL, af, flags, ipst); } } } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 59ddb7461f..5afa70160d 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -2322,11 +2322,8 @@ ipcl_conn_cleanup(conn_t *connp) * We should replace these pointers with ifindex/ipaddr_t to * make the code less complex. 
*/ - ASSERT(connp->conn_xmit_if_ill == NULL); - ASSERT(connp->conn_nofailover_ill == NULL); ASSERT(connp->conn_outgoing_ill == NULL); ASSERT(connp->conn_incoming_ill == NULL); - ASSERT(connp->conn_outgoing_pill == NULL); ASSERT(connp->conn_multicast_ipif == NULL); ASSERT(connp->conn_multicast_ill == NULL); #endif diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c new file mode 100644 index 0000000000..b8f3768834 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ipmp.c @@ -0,0 +1,2201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/arp.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_multi.h> +#include <inet/ip_rts.h> +#include <inet/mi.h> +#include <net/if_types.h> +#include <sys/dlpi.h> +#include <sys/kmem.h> +#include <sys/modhash.h> +#include <sys/sdt.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/types.h> + +/* + * Convenience macros for getting the ip_stack_t associated with an + * ipmp_illgrp_t or ipmp_grp_t. + */ +#define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) +#define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) + +/* + * Assorted constants that aren't important enough to be tunable. + */ +#define IPMP_GRP_HASH_SIZE 64 +#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ + +/* + * Templates for IPMP ARP messages. + */ +static const arie_t ipmp_aract_template = { + AR_IPMP_ACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +static const arie_t ipmp_ardeact_template = { + AR_IPMP_DEACTIVATE, + sizeof (arie_t), /* Name offset */ + sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ +}; + +/* + * IPMP meta-interface kstats (based on those in PSARC/1997/198). 
+ */ +static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { + { "obytes", KSTAT_DATA_UINT32 }, + { "obytes64", KSTAT_DATA_UINT64 }, + { "rbytes", KSTAT_DATA_UINT32 }, + { "rbytes64", KSTAT_DATA_UINT64 }, + { "opackets", KSTAT_DATA_UINT32 }, + { "opackets64", KSTAT_DATA_UINT64 }, + { "oerrors", KSTAT_DATA_UINT32 }, + { "ipackets", KSTAT_DATA_UINT32 }, + { "ipackets64", KSTAT_DATA_UINT64 }, + { "ierrors", KSTAT_DATA_UINT32 }, + { "multircv", KSTAT_DATA_UINT32 }, + { "multixmt", KSTAT_DATA_UINT32 }, + { "brdcstrcv", KSTAT_DATA_UINT32 }, + { "brdcstxmt", KSTAT_DATA_UINT32 }, + { "link_up", KSTAT_DATA_UINT32 } +}; + +static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); +static int ipmp_grp_create_kstats(ipmp_grp_t *); +static int ipmp_grp_update_kstats(kstat_t *, int); +static void ipmp_grp_destroy_kstats(ipmp_grp_t *); +static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); +static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); +static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); +static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); +static boolean_t ipmp_ill_activate(ill_t *); +static void ipmp_ill_deactivate(ill_t *); +static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); +static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); +static void ipmp_ill_refresh_active_timer_start(ill_t *); +static void ipmp_ill_rtsaddrmsg(ill_t *, int); +static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); +static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); +static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); +static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); + +/* + * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). + */ +void +ipmp_init(ip_stack_t *ipst) +{ + ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", + IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); +} + +/* + * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). + */ +void +ipmp_destroy(ip_stack_t *ipst) +{ + mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); + rw_destroy(&ipst->ips_ipmp_lock); +} + +/* + * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', + * and add it to the hash. On success, return a pointer to the created group. + * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP + * meta-interface associated with the group also has the same name (but they + * may differ later via ipmp_grp_rename()). + */ +ipmp_grp_t * +ipmp_grp_create(const char *grname, phyint_t *phyi) +{ + ipmp_grp_t *grp; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + mod_hash_hndl_t mh; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); + + /* + * Cache the group's phyint. This is safe since a phyint_t will + * outlive its ipmp_grp_t. + */ + grp->gr_phyint = phyi; + + /* + * Create IPMP group kstats. + */ + if (ipmp_grp_create_kstats(grp) != 0) { + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + + /* + * Insert the group into the hash. 
+ */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { + ipmp_grp_destroy_kstats(grp); + kmem_free(grp, sizeof (ipmp_grp_t)); + return (NULL); + } + ipmp_grp_insert(grp, mh); + + return (grp); +} + +/* + * Create IPMP kstat structures for `grp'. Return an errno upon failure. + */ +static int +ipmp_grp_create_kstats(ipmp_grp_t *grp) +{ + kstat_t *ksp; + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", + KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); + if (ksp == NULL) + return (ENOMEM); + + ksp->ks_update = ipmp_grp_update_kstats; + ksp->ks_private = grp; + bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); + + kstat_install(ksp); + grp->gr_ksp = ksp; + return (0); +} + +/* + * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. + */ +static int +ipmp_grp_update_kstats(kstat_t *ksp, int rw) +{ + uint_t i; + kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); + ipmp_grp_t *grp = ksp->ks_private; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; + phyint_t *phyi; + uint64_t phyi_kstats[IPMP_KSTAT_MAX]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Start with the group's baseline values. + */ + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + if (kn[i].data_type == KSTAT_DATA_UINT32) { + kn[i].value.ui32 = grp->gr_kstats0[i]; + } else { + ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); + kn[i].value.ui64 = grp->gr_kstats0[i]; + } + } + + /* + * Add in the stats of each phyint currently in the group. Since we + * don't directly track the phyints in a group, we cheat by walking + * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while + * ill_g_lock is held.) + */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ipsq = grp_ipsq->ipsq_next; + for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { + phyi = ipsq->ipsq_phyint; + + /* + * If a phyint in a group is being unplumbed, it's possible + * that ill_glist_delete() -> phyint_free() already freed the + * phyint (and set ipsq_phyint to NULL), but the unplumb + * operation has yet to complete (and thus ipsq_dq() has yet + * to remove the phyint's IPSQ from the group IPSQ's phyint + * list). We skip those phyints here (note that their kstats + * have already been added to gr_kstats0[]). + */ + if (phyi == NULL) + continue; + + ipmp_phyint_get_kstats(phyi, phyi_kstats); + + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + phyi_kstats[i] -= phyi->phyint_kstats0[i]; + if (kn[i].data_type == KSTAT_DATA_UINT32) + kn[i].value.ui32 += phyi_kstats[i]; + else + kn[i].value.ui64 += phyi_kstats[i]; + } + } + + kn[IPMP_KSTAT_LINK_UP].value.ui32 = + (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; + + rw_exit(&ipst->ips_ill_g_lock); + return (0); +} + +/* + * Destroy IPMP kstat structures for `grp'. + */ +static void +ipmp_grp_destroy_kstats(ipmp_grp_t *grp) +{ + netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; + + kstat_delete_netstack(grp->gr_ksp, id); + bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); + grp->gr_ksp = NULL; +} + +/* + * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it + * does not exist. 
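Since the group kstats above are created with module "ipmp", instance 0, and the group interface name, they can be read from userland with libkstat (link with -lkstat). A minimal sketch, assuming a group interface named "ipmp0" in the global zone:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t	*kc;
	kstat_t		*ksp;
	kstat_named_t	*kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* "ipmp0" is a placeholder IPMP group interface name. */
	if ((ksp = kstat_lookup(kc, "ipmp", 0, "ipmp0")) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1) {
		if ((kn = kstat_data_lookup(ksp, "ipackets64")) != NULL) {
			(void) printf("ipackets64: %llu\n",
			    (u_longlong_t)kn->value.ui64);
		}
	}

	(void) kstat_close(kc);
	return (0);
}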
+ */ +ipmp_grp_t * +ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) +{ + ipmp_grp_t *grp; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) == 0) + return (grp); + + return (NULL); +} + +/* + * Place information about group `grp' into `lifgr'. + */ +void +ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + lifgr->gi_v4 = (grp->gr_v4 != NULL); + lifgr->gi_v6 = (grp->gr_v6 != NULL); + lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; + lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; + lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; + (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); + lifgr->gi_m4ifname[0] = '\0'; + lifgr->gi_m6ifname[0] = '\0'; + lifgr->gi_bcifname[0] = '\0'; + + if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { + (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); + (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); + } + + if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) + (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); +} + +/* + * Insert `grp' into the hash using the reserved hash entry `mh'. + * Caller must ensure `grp' is not yet in the hash. + */ +static void +ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) +{ + int err; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * Since grp->gr_name will exist at least as long as `grp' is in the + * hash, we use it directly as the key. + */ + err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, + (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); + if (err != 0) { + /* + * This should never happen since `mh' was preallocated. + */ + panic("cannot insert IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Remove `grp' from the hash. Caller must ensure `grp' is in it. + */ +static void +ipmp_grp_remove(ipmp_grp_t *grp) +{ + int err; + mod_hash_val_t val; + mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); + if (err != 0 || val != grp) { + panic("cannot remove IPMP group \"%s\" (err %d)", + grp->gr_name, err); + } +} + +/* + * Attempt to rename `grp' to new name `grname'. Return an errno if the new + * group name already exists or is invalid, or if there isn't enough memory. + */ +int +ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) +{ + mod_hash_hndl_t mh; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (grname[0] == '\0') + return (EINVAL); + + if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, + (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) + return (EEXIST); + + /* + * Before we remove the group from the hash, ensure we'll be able to + * re-insert it by reserving space. + */ + if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) + return (ENOMEM); + + ipmp_grp_remove(grp); + (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); + ipmp_grp_insert(grp, mh); + + return (0); +} + +/* + * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in + * the hash, and that there are no interfaces on it. 
+ */ +void +ipmp_grp_destroy(ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + /* + * If there are still interfaces using this group, panic before things + * go really off the rails. + */ + if (grp->gr_nif != 0) + panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); + + ipmp_grp_remove(grp); + ipmp_grp_destroy_kstats(grp); + + ASSERT(grp->gr_v4 == NULL); + ASSERT(grp->gr_v6 == NULL); + ASSERT(grp->gr_nv4 == 0); + ASSERT(grp->gr_nv6 == 0); + ASSERT(grp->gr_nactif == 0); + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_phyint = NULL; + + kmem_free(grp, sizeof (ipmp_grp_t)); +} + +/* + * Check whether `ill' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). NOTE: many of these errno values + * are interpreted by ifconfig, which will take corrective action and retry + * the SIOCSLIFGROUPNAME, so please exercise care when changing them. + */ +static int +ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) +{ + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * To sidestep complicated address migration logic in the kernel and + * to force the kernel's all-hosts multicast memberships to be blown + * away, all addresses that had been brought up must be brought back + * down prior to adding an interface to a group. (This includes + * addresses currently down due to DAD.) Once the interface has been + * added to the group, its addresses can then be brought back up, at + * which point they will be moved to the IPMP meta-interface. + * NOTE: we do this before ill_appaddr_cnt() since bringing down the + * link-local causes in.ndpd to remove its ADDRCONF'd addresses. + */ + if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) + return (EADDRINUSE); + + /* + * To avoid confusing applications by changing addresses that are + * under their control, all such control must be removed prior to + * adding an interface into a group. + */ + if (ill_appaddr_cnt(ill) != 0) + return (EADDRNOTAVAIL); + + /* + * Since PTP addresses do not share the same broadcast domain, they + * are not allowed to be in an IPMP group. + */ + if (ill_ptpaddr_cnt(ill) != 0) + return (EINVAL); + + /* + * An ill must support multicast to be allowed into a group. + */ + if (!(ill->ill_flags & ILLF_MULTICAST)) + return (ENOTSUP); + + /* + * An ill must strictly be using ARP and/or ND for address + * resolution for it to be allowed into a group. + */ + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + return (ENOTSUP); + + /* + * An ill cannot also be using usesrc groups. (Although usesrc uses + * ill_g_usesrc_lock, we don't need to grab it since usesrc also does + * all its modifications as writer.) + */ + if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) + return (ENOTSUP); + + /* + * All ills in a group must be the same mactype. + */ + if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) + return (EINVAL); + + return (0); +} + +/* + * Check whether `phyi' is suitable for inclusion into `grp', and return an + * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() + * regarding errno values. 
+ */ +int +ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) +{ + int err = 0; + ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); + + ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); + ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); + + /* + * An interface cannot have address families plumbed that are not + * configured in the group. + */ + if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || + phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) + return (EAFNOSUPPORT); + + if (phyi->phyint_illv4 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); + if (err == 0 && phyi->phyint_illv6 != NULL) + err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); + + return (err); +} + +/* + * Create a new illgrp on IPMP meta-interface `ill'. + */ +ipmp_illgrp_t * +ipmp_illgrp_create(ill_t *ill) +{ + uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; + ipmp_illgrp_t *illg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_IPMP(ill)); + ASSERT(ill->ill_grp == NULL); + + if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) + return (NULL); + + list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); + list_create(&illg->ig_actif, sizeof (ill_t), + offsetof(ill_t, ill_actnode)); + list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), + offsetof(ipmp_arpent_t, ia_node)); + + illg->ig_ipmp_ill = ill; + ill->ill_grp = illg; + ipmp_illgrp_set_mtu(illg, mtu); + + return (illg); +} + +/* + * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. + */ +void +ipmp_illgrp_destroy(ipmp_illgrp_t *illg) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(IS_IPMP(illg->ig_ipmp_ill)); + + /* + * Verify `illg' is empty. + */ + ASSERT(illg->ig_next_ill == NULL); + ASSERT(illg->ig_cast_ill == NULL); + ASSERT(list_is_empty(&illg->ig_arpent)); + ASSERT(list_is_empty(&illg->ig_if)); + ASSERT(list_is_empty(&illg->ig_actif)); + ASSERT(illg->ig_nactif == 0); + + /* + * Destroy `illg'. + */ + illg->ig_ipmp_ill->ill_grp = NULL; + illg->ig_ipmp_ill = NULL; + list_destroy(&illg->ig_if); + list_destroy(&illg->ig_actif); + list_destroy(&illg->ig_arpent); + kmem_free(illg, sizeof (ipmp_illgrp_t)); +} + +/* + * Add `ipif' to the pool of usable data addresses on `illg' and attempt to + * bind it to an underlying ill, while keeping an even address distribution. + * If the bind is successful, return a pointer to the bound ill. + */ +ill_t * +ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *minill; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(ipmp_ipif_is_dataaddr(ipif)); + + /* + * IPMP data address mappings are internally managed by IP itself, so + * delete any existing ARP entries associated with the address. + */ + if (!ipif->ipif_isv6) { + entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); + if (entp != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + } + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_none); + + return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); +} + +/* + * Delete `ipif' from the pool of usable data addresses on `illg'. If it's + * bound, unbind it from the underlying ill while keeping an even address + * distribution. 
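As a concrete (hypothetical) illustration of how ipmp_illgrp_add_ipif() and ipmp_illgrp_del_ipif() keep the distribution even: with three active ills holding {3, 2, 2} bound data addresses, a newly added address is bound to one of the 2-address ills (the minimum), giving {3, 3, 2}. Deleting an address that was bound to a 2-address ill instead gives {3, 2, 1}; the fullest ill then exceeds the emptiest by more than one, so a single address is moved from the 3-address ill to the 1-address ill, settling at {2, 2, 2}.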
+ */ +void +ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) +{ + ill_t *maxill, *boundill = ipif->ipif_bound_ill; + + ASSERT(IAM_WRITER_IPIF(ipif)); + + if (boundill != NULL) { + (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); + + maxill = ipmp_illgrp_max_ill(illg); + if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); + } + } +} + +/* + * Return the active ill with the greatest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt > bestill->ill_bound_cnt) { + bestill = ill; + } + } + return (bestill); +} + +/* + * Return the active ill with the fewest number of data addresses in `illg'. + */ +static ill_t * +ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill, *bestill = NULL; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + if (bestill == NULL || + ill->ill_bound_cnt < bestill->ill_bound_cnt) { + if (ill->ill_bound_cnt == 0) + return (ill); /* can't get better */ + bestill = ill; + } + } + return (bestill); +} + +/* + * Return a pointer to IPMP meta-interface for `illg' (which must exist). + * Since ig_ipmp_ill never changes for a given illg, no locks are needed. + */ +ill_t * +ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) +{ + return (illg->ig_ipmp_ill); +} + +/* + * Return a pointer to the next available underlying ill in `illg', or NULL if + * one doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if ((ill = illg->ig_next_ill) != NULL) { + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + } + rw_exit(&ipst->ips_ipmp_lock); + + return (ill); +} + +/* + * Return a held pointer to the next available underlying ill in `illg', or + * NULL if one doesn't exist. Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) +{ + ill_t *ill; + uint_t i; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + for (i = 0; i < illg->ig_nactif; i++) { + ill = illg->ig_next_ill; + illg->ig_next_ill = list_next(&illg->ig_actif, ill); + if (illg->ig_next_ill == NULL) + illg->ig_next_ill = list_head(&illg->ig_actif); + + if (ILL_CAN_LOOKUP(ill)) { + ill_refhold(ill); + rw_exit(&ipst->ips_ipmp_lock); + return (ill); + } + } + rw_exit(&ipst->ips_ipmp_lock); + + return (NULL); +} + +/* + * Return a pointer to the nominated multicast ill in `illg', or NULL if one + * doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) +{ + /* + * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but + * this function can get called after that point, handle NULL. + */ + if (illg == NULL) + return (NULL); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + return (illg->ig_cast_ill); +} + +/* + * Return a held pointer to the nominated multicast ill in `illg', or NULL if + * one doesn't exist. 
Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) +{ + ill_t *castill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + castill = illg->ig_cast_ill; + if (castill != NULL && ILL_CAN_LOOKUP(castill)) { + ill_refhold(castill); + rw_exit(&ipst->ips_ipmp_lock); + return (castill); + } + rw_exit(&ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, + * any existing nomination is removed. Caller must be inside the IPSQ. + */ +static void +ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) +{ + ill_t *ocastill = illg->ig_cast_ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Disable old nominated ill (if any). + */ + if (ocastill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, + illg, ill_t *, ocastill); + ASSERT(ocastill->ill_nom_cast); + ocastill->ill_nom_cast = B_FALSE; + /* + * If the IPMP meta-interface is down, we never did the join, + * so we must not try to leave. + */ + if (ipmp_ill->ill_dl_up) + ill_leave_multicast(ipmp_ill); + } + + /* + * Set new nomination. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + illg->ig_cast_ill = castill; + rw_exit(&ipst->ips_ipmp_lock); + + if (ocastill != NULL) { + /* + * Delete any IREs tied to the old nomination. We must do + * this after the new castill is set and has reached global + * visibility since the datapath has not been quiesced. + */ + ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, + ill_stq_cache_delete, ocastill, ocastill); + } + + /* + * Enable new nominated ill (if any). + */ + if (castill != NULL) { + DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, + illg, ill_t *, castill); + ASSERT(!castill->ill_nom_cast); + castill->ill_nom_cast = B_TRUE; + /* + * If the IPMP meta-interface is down, the attempt to recover + * will silently fail but ill_need_recover_multicast will be + * erroneously cleared -- so check first. + */ + if (ipmp_ill->ill_dl_up) + ill_recover_multicast(ipmp_ill); + } + + /* + * For IPv4, refresh our broadcast IREs. This needs to be done even + * if there's no new nomination since ill_refresh_bcast() still must + * update the IPMP meta-interface's broadcast IREs to point back at + * the IPMP meta-interface itself. + */ + if (!ipmp_ill->ill_isv6) + ill_refresh_bcast(ipmp_ill); +} + +/* + * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an + * entry for the same IP address already exists, destroy it first. Return the + * created IPMP ARP entry, or NULL on failure. 
+ */ +ipmp_arpent_t * +ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) +{ + uchar_t *addrp; + area_t *area = (area_t *)mp->b_rptr; + ipmp_arpent_t *entp, *oentp; + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); + + if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) + return (NULL); + + if ((mp = copyb(mp)) == NULL) { + kmem_free(entp, sizeof (ipmp_arpent_t)); + return (NULL); + } + + DB_TYPE(mp) = M_PROTO; + entp->ia_area_mp = mp; + entp->ia_proxyarp = proxyarp; + addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, + sizeof (ipaddr_t)); + bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); + + if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) + ipmp_illgrp_destroy_arpent(illg, oentp); + + list_insert_head(&illg->ig_arpent, entp); + return (entp); +} + +/* + * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. + */ +void +ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + list_remove(&illg->ig_arpent, entp); + freeb(entp->ia_area_mp); + kmem_free(entp, sizeof (ipmp_arpent_t)); +} + +/* + * Mark that ARP has been notified about the IP address on `entp'; `illg' is + * taken as a debugging aid for DTrace FBT probes. + */ +/* ARGSUSED */ +void +ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) +{ + entp->ia_notified = B_TRUE; +} + +/* + * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is + * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. + */ +ipmp_arpent_t * +ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) +{ + ipmp_arpent_t *entp = list_head(&illg->ig_arpent); + + ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); + + if (addrp == NULL) + return (entp); + + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) + if (entp->ia_ipaddr == *addrp) + break; + return (entp); +} + +/* + * Refresh ARP entries on `illg' to be distributed across its active + * interfaces. Entries that cannot be refreshed (e.g., because there are no + * active interfaces) are marked so that subsequent calls can try again. + */ +void +ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) +{ + ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; + uint_t paddrlen = ipmp_ill->ill_phys_addr_length; + area_t *area; + mblk_t *area_mp; + uchar_t *physaddr; + ipmp_arpent_t *entp; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + ASSERT(!ipmp_ill->ill_isv6); + + ill = list_head(&illg->ig_actif); + entp = list_head(&illg->ig_arpent); + for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { + if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { + entp->ia_notified = B_FALSE; + continue; + } + + area = (area_t *)entp->ia_area_mp->b_rptr; + ASSERT(paddrlen == ill->ill_phys_addr_length); + ASSERT(paddrlen == area->area_hw_addr_length); + physaddr = mi_offset_paramc(entp->ia_area_mp, + area->area_hw_addr_offset, paddrlen); + + /* + * If this is a proxy ARP entry, we can skip notifying ARP if + * the entry is already up-to-date. If it has changed, we + * update the entry's hardware address before notifying ARP. 
+ */ + if (entp->ia_proxyarp) { + if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && + entp->ia_notified) + continue; + bcopy(ill->ill_phys_addr, physaddr, paddrlen); + } + + if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { + entp->ia_notified = B_FALSE; + continue; + } + + putnext(ipmp_ill->ill_rq, area_mp); + ipmp_illgrp_mark_arpent(illg, entp); + + if ((ill = list_next(&illg->ig_actif, ill)) == NULL) + ill = list_head(&illg->ig_actif); + } +} + +/* + * Return an interface in `illg' with the specified `physaddr', or NULL if one + * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. + */ +ill_t * +ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); + + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + if (ill->ill_phys_addr_length == paddrlen && + bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) + return (ill); + } + return (NULL); +} + +/* + * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. + * Caller must be inside the IPSQ unless this is initialization. + */ +static void +ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) +{ + ill_t *ill = illg->ig_ipmp_ill; + mblk_t *mp; + + ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); + + /* + * If allocation fails, we have bigger problems than MTU. + */ + if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { + illg->ig_mtu = mtu; + put(ill->ill_rq, mp); + } +} + +/* + * Recalculate the IPMP group MTU for `illg', and update its associated IPMP + * ill MTU if necessary. + */ +void +ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) +{ + ill_t *ill; + ill_t *ipmp_ill = illg->ig_ipmp_ill; + uint_t mtu = 0; + + ASSERT(IAM_WRITER_ILL(ipmp_ill)); + + /* + * Since ill_max_mtu can only change under ill_lock, we hold ill_lock + * for each ill as we iterate through the list. Any changes to the + * ill_max_mtu will also trigger an update, so even if we missed it + * this time around, the update will catch it. + */ + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + mutex_enter(&ill->ill_lock); + if (mtu == 0 || ill->ill_max_mtu < mtu) + mtu = ill->ill_max_mtu; + mutex_exit(&ill->ill_lock); + } + + /* + * MTU must be at least the minimum MTU. + */ + mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); + + if (illg->ig_mtu != mtu) + ipmp_illgrp_set_mtu(illg, mtu); +} + +/* + * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently + * allow the same link to be established more than once. + */ +void +ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) +{ + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); + grp->gr_v6 = illg; + } else { + ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); + grp->gr_v4 = illg; + } +} + +/* + * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp + * cannot be unlinked (e.g., because there are still interfaces using it). 
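ipmp_illgrp_unlink_grp() pairs with ipmp_illgrp_destroy() above: the illgrp must first be unlinked from its group, which fails with EBUSY while interfaces of that address family remain, and only then torn down. A hypothetical caller sketch, with the locking taken from the ASSERTs in this file and error handling elided:

	ipmp_illgrp_t	*illg = ipmp_ill->ill_grp;	/* ipmp_ill: the IPMP meta-interface ill */
	ip_stack_t	*ipst = ipmp_ill->ill_ipst;
	int		err;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	err = ipmp_illgrp_unlink_grp(illg);
	rw_exit(&ipst->ips_ipmp_lock);

	if (err == 0)
		ipmp_illgrp_destroy(illg);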
+ */ +int +ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) +{ + ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); + + if (illg->ig_ipmp_ill->ill_isv6) { + if (grp->gr_nv6 + grp->gr_pendv6 != 0) + return (EBUSY); + grp->gr_v6 = NULL; + } else { + if (grp->gr_nv4 + grp->gr_pendv4 != 0) + return (EBUSY); + grp->gr_v4 = NULL; + } + return (0); +} + +/* + * Place `ill' into `illg', and rebalance the data addresses on `illg' + * to be spread evenly across the ills now in it. Also, adjust the IPMP + * ill as necessary to account for `ill' (e.g., MTU). + */ +void +ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) +{ + ill_t *ipmp_ill; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + + /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ + ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(ill->ill_grp == NULL); + + ipmp_ill = illg->ig_ipmp_ill; + + /* + * Account for `ill' joining the illgrp. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if (ill->ill_isv6) + ill->ill_phyint->phyint_grp->gr_nv6++; + else + ill->ill_phyint->phyint_grp->gr_nv4++; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Ensure the ILLF_ROUTER flag remains consistent across the group. + */ + mutex_enter(&ill->ill_lock); + if (ipmp_ill->ill_flags & ILLF_ROUTER) + ill->ill_flags |= ILLF_ROUTER; + else + ill->ill_flags &= ~ILLF_ROUTER; + mutex_exit(&ill->ill_lock); + + /* + * Blow away all multicast memberships that currently exist on `ill'. + * This may seem odd, but it's consistent with the application view + * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). + */ + if (ill->ill_isv6) { + reset_conn_ill(ill); + reset_mrt_ill(ill); + } else { + ipif = ill->ill_ipif; + for (; ipif != NULL; ipif = ipif->ipif_next) { + reset_conn_ipif(ipif); + reset_mrt_vif_ipif(ipif); + } + } + ip_purge_allmulti(ill); + + /* + * Borrow the first ill's ill_phys_addr_length value for the illgrp's + * physical address length. All other ills must have the same value, + * since they are required to all be the same mactype. Also update + * the IPMP ill's MTU and CoS marking, if necessary. + */ + if (list_is_empty(&illg->ig_if)) { + ASSERT(ipmp_ill->ill_phys_addr_length == 0); + /* + * NOTE: we leave ill_phys_addr NULL since the IPMP group + * doesn't have a physical address. This means that code must + * not assume that ill_phys_addr is non-NULL just because + * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. 
+ */ + ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; + ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; + ipmp_ill->ill_type = ill->ill_type; + + if (ill->ill_flags & ILLF_COS_ENABLED) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags |= ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + } else { + ASSERT(ipmp_ill->ill_phys_addr_length == + ill->ill_phys_addr_length); + ASSERT(ipmp_ill->ill_type == ill->ill_type); + + if (!(ill->ill_flags & ILLF_COS_ENABLED)) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + if (illg->ig_mtu > ill->ill_max_mtu) + ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + } + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + list_insert_tail(&illg->ig_if, ill); + ill->ill_grp = illg; + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Hide the IREs on `ill' so that we don't accidentally find them when + * sending data traffic. + */ + ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); + + /* + * Merge any broadcast IREs, if need be. + */ + if (!ill->ill_isv6) + ill_refresh_bcast(ill); + + ipmp_ill_refresh_active(ill); +} + +/* + * Remove `ill' from its illgrp, and rebalance the data addresses in that + * illgrp to be spread evenly across the remaining ills. Also, adjust the + * IPMP ill as necessary now that `ill' is removed (e.g., MTU). + */ +void +ipmp_ill_leave_illgrp(ill_t *ill) +{ + ill_t *ipmp_ill; + ipif_t *ipif; + ipmp_arpent_t *entp; + ipmp_illgrp_t *illg = ill->ill_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IS_UNDER_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(illg != NULL); + + ipmp_ill = illg->ig_ipmp_ill; + + /* + * Cancel IPMP-specific ill timeouts. + */ + (void) untimeout(ill->ill_refresh_tid); + + /* + * Expose any previously-hidden IREs on `ill'. + */ + ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); + + /* + * Ensure the multicast state for each ipif on `ill' is down so that + * our ipif_multicast_up() (once `ill' leaves the group) will rejoin + * all eligible groups. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ipif_multicast_down(ipif); + + /* + * Account for `ill' leaving the illgrp. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + if (ill->ill_isv6) + ill->ill_phyint->phyint_grp->gr_nv6--; + else + ill->ill_phyint->phyint_grp->gr_nv4--; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Pull `ill' out of the interface lists. + */ + if (list_link_active(&ill->ill_actnode)) + ipmp_ill_deactivate(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + list_remove(&illg->ig_if, ill); + ill->ill_grp = NULL; + rw_exit(&ipst->ips_ill_g_lock); + + /* + * Recreate any broadcast IREs that had been shared, if need be. + */ + if (!ill->ill_isv6) + ill_refresh_bcast(ill); + + /* + * Re-establish multicast memberships that were previously being + * handled by the IPMP meta-interface. + */ + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ipif_multicast_up(ipif); + + /* + * Refresh the group MTU based on the new interface list. + */ + ipmp_illgrp_refresh_mtu(illg); + + if (list_is_empty(&illg->ig_if)) { + /* + * No ills left in the illgrp; we no longer have a physical + * address length, nor can we support ARP, CoS, or anything + * else that depends on knowing the link layer type. 
+ */ + while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) + ipmp_illgrp_destroy_arpent(illg, entp); + + ipmp_ill->ill_phys_addr_length = 0; + ipmp_ill->ill_nd_lla_len = 0; + ipmp_ill->ill_type = IFT_OTHER; + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } else { + /* + * If `ill' didn't support CoS, see if it can now be enabled. + */ + if (!(ill->ill_flags & ILLF_COS_ENABLED)) { + ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); + + ill = list_head(&illg->ig_if); + do { + if (!(ill->ill_flags & ILLF_COS_ENABLED)) + break; + } while ((ill = list_next(&illg->ig_if, ill)) != NULL); + + if (ill == NULL) { + mutex_enter(&ipmp_ill->ill_lock); + ipmp_ill->ill_flags |= ILLF_COS_ENABLED; + mutex_exit(&ipmp_ill->ill_lock); + } + } + } +} + +/* + * Check if `ill' should be active, and activate or deactivate if need be. + * Return B_FALSE if a refresh was necessary but could not be performed. + */ +static boolean_t +ipmp_ill_try_refresh_active(ill_t *ill) +{ + boolean_t refreshed = B_TRUE; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + if (ipmp_ill_is_active(ill)) { + if (!list_link_active(&ill->ill_actnode)) + refreshed = ipmp_ill_activate(ill); + } else { + if (list_link_active(&ill->ill_actnode)) + ipmp_ill_deactivate(ill); + } + + return (refreshed); +} + +/* + * Check if `ill' should be active, and activate or deactivate if need be. + * If the refresh fails, schedule a timer to try again later. + */ +void +ipmp_ill_refresh_active(ill_t *ill) +{ + if (!ipmp_ill_try_refresh_active(ill)) + ipmp_ill_refresh_active_timer_start(ill); +} + +/* + * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. + */ +static void +ipmp_ill_refresh_active_timer(void *ill_arg) +{ + ill_t *ill = ill_arg; + boolean_t refreshed = B_FALSE; + + /* + * Clear ill_refresh_tid to indicate that no timeout is pending + * (another thread could schedule a new timeout while we're still + * running, but that's harmless). If the ill is going away, bail. + */ + mutex_enter(&ill->ill_lock); + ill->ill_refresh_tid = 0; + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + return; + } + mutex_exit(&ill->ill_lock); + + if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { + refreshed = ipmp_ill_try_refresh_active(ill); + ipsq_exit(ill->ill_phyint->phyint_ipsq); + } + + /* + * If the refresh failed, schedule another attempt. + */ + if (!refreshed) + ipmp_ill_refresh_active_timer_start(ill); +} + +/* + * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. + */ +static void +ipmp_ill_refresh_active_timer_start(ill_t *ill) +{ + mutex_enter(&ill->ill_lock); + + /* + * If the ill is going away or a refresh is already scheduled, bail. + */ + if (ill->ill_refresh_tid != 0 || + (ill->ill_state_flags & ILL_CONDEMNED)) { + mutex_exit(&ill->ill_lock); + return; + } + + ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, + SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); + + mutex_exit(&ill->ill_lock); +} + +/* + * Activate `ill' so it will be used to send and receive data traffic. Return + * B_FALSE if `ill' cannot be activated. Note that we allocate any messages + * needed to deactivate `ill' here as well so that deactivation cannot fail. 
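The "deactivation cannot fail" guarantee promised here rests on a general idiom: allocate everything that teardown will need while setup is still allowed to fail. A condensed sketch of that shape, with obj_t, TEARDOWN_LEN and obj_teardown_mp as illustrative names only:

static int
setup_with_prealloc(obj_t *obj)
{
	mblk_t *mp;

	/* Fail here, where the caller can cope, not at teardown time */
	if ((mp = allocb(TEARDOWN_LEN, BPRI_MED)) == NULL)
		return (ENOMEM);
	obj->obj_teardown_mp = mp;	/* consumed later by teardown */
	return (0);
}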
+ */ +static boolean_t +ipmp_ill_activate(ill_t *ill) +{ + ipif_t *ipif; + mblk_t *actmp = NULL, *deactmp = NULL; + mblk_t *linkupmp = NULL, *linkdownmp = NULL; + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + const char *grifname = grp->gr_ifname; + ipmp_illgrp_t *illg = ill->ill_grp; + ill_t *maxill; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + /* + * If this will be the first active interface in the group, allocate + * the link-up and link-down messages. + */ + if (grp->gr_nactif == 0) { + linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); + linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); + if (linkupmp == NULL || linkdownmp == NULL) + goto fail; + } + + /* + * For IPv4, allocate the activate/deactivate messages, and tell ARP. + */ + if (!ill->ill_isv6) { + actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); + deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); + if (actmp == NULL || deactmp == NULL) + goto fail; + + ASSERT(ill->ill_ardeact_mp == NULL); + ill->ill_ardeact_mp = deactmp; + putnext(illg->ig_ipmp_ill->ill_rq, actmp); + } + + if (list_is_empty(&illg->ig_actif)) { + /* + * Now that we have an active ill, nominate it for multicast + * and broadcast duties. Do this before ipmp_ill_bind_ipif() + * since that may need to send multicast packets (e.g., IPv6 + * neighbor discovery probes). + */ + ipmp_illgrp_set_cast(illg, ill); + + /* + * This is the first active ill in the illgrp -- add 'em all. + * We can access/walk ig_ipmp_ill's ipif list since we're + * writer on its IPSQ as well. + */ + ipif = illg->ig_ipmp_ill->ill_ipif; + for (; ipif != NULL; ipif = ipif->ipif_next) + if (ipmp_ipif_is_up_dataaddr(ipif)) + ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); + } else { + /* + * Redistribute the addresses by moving them from the ill with + * the most addresses until the ill being activated is at the + * same level as the rest of the ills. + */ + for (;;) { + maxill = ipmp_illgrp_max_ill(illg); + ASSERT(maxill != NULL); + if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) + break; + ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); + ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); + } + + /* + * TODO: explore whether it's advantageous to flush IRE_CACHE + * bindings to force existing connections to be redistributed + * to the new ill. + */ + } + + /* + * Put the interface in the active list. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + list_insert_tail(&illg->ig_actif, ill); + illg->ig_nactif++; + illg->ig_next_ill = ill; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * Refresh ARP entries to use `ill', if need be. + */ + if (!ill->ill_isv6) + ipmp_illgrp_refresh_arpent(illg); + + /* + * Finally, mark the group link up, if necessary. + */ + if (grp->gr_nactif++ == 0) { + ASSERT(grp->gr_linkdownmp == NULL); + grp->gr_linkdownmp = linkdownmp; + put(illg->ig_ipmp_ill->ill_rq, linkupmp); + } + return (B_TRUE); +fail: + freemsg(actmp); + freemsg(deactmp); + freemsg(linkupmp); + freemsg(linkdownmp); + return (B_FALSE); +} + +/* + * Deactivate `ill' so it will not be used to send or receive data traffic. 
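The redistribution loop in ipmp_ill_activate() above is easiest to follow with concrete numbers. The stand-alone model below reproduces its exit condition on plain counters (rebalance_new_member and the counts are illustrative, not part of the changeset): starting from members holding {4, 3} addresses and a newly activated member holding 0, it converges with the new member holding 2 and {2, 3} left on the originals.

static uint_t
rebalance_new_member(uint_t counts[], uint_t n, uint_t newcnt)
{
	for (;;) {
		uint_t i, max = 0;

		for (i = 1; i < n; i++)		/* find most-loaded member */
			if (counts[i] > counts[max])
				max = i;
		if (newcnt + 1 >= counts[max])	/* same test as the loop above */
			break;
		counts[max]--;			/* "move" one address over */
		newcnt++;
	}
	return (newcnt);
}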
+ */ +static void +ipmp_ill_deactivate(ill_t *ill) +{ + ill_t *minill; + ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; + mblk_t *mp; + ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; + ipmp_illgrp_t *illg = ill->ill_grp; + ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + /* + * Delete IRE_CACHE entries tied to this ill before they become stale. + */ + ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, + ill_stq_cache_delete, ill, ill); + + /* + * Pull the interface out of the active list. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + list_remove(&illg->ig_actif, ill); + illg->ig_nactif--; + illg->ig_next_ill = list_head(&illg->ig_actif); + rw_exit(&ipst->ips_ipmp_lock); + + /* + * If the ill that's being deactivated had been nominated for + * multicast/broadcast, nominate a new one. + */ + if (ill == illg->ig_cast_ill) + ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); + + /* + * Unbind all of the ipifs bound to this ill, and save 'em in a list; + * we'll rebind them after we tell the resolver the ill is no longer + * active. We must do things in this order or the resolver could + * accidentally rebind to the ill we're trying to remove if multiple + * ills in the group have the same hardware address (which is + * unsupported, but shouldn't lead to a wedged machine). + */ + while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { + ipif->ipif_bound_next = ubheadipif; + ubheadipif = ipif; + } + + if (!ill->ill_isv6) { + /* + * Tell ARP `ill' is no longer active in the group. + */ + mp = ill->ill_ardeact_mp; + ill->ill_ardeact_mp = NULL; + ASSERT(mp != NULL); + putnext(illg->ig_ipmp_ill->ill_rq, mp); + + /* + * Refresh any ARP entries that had been using `ill'. + */ + ipmp_illgrp_refresh_arpent(illg); + } + + /* + * Rebind each ipif from the deactivated ill to the active ill with + * the fewest ipifs. If there are no active ills, the ipifs will + * remain unbound. + */ + for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { + ubnextipif = ipif->ipif_bound_next; + ipif->ipif_bound_next = NULL; + + if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) + ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); + } + + /* + * Finally, mark the group link down, if necessary. + */ + if (--grp->gr_nactif == 0) { + mp = grp->gr_linkdownmp; + grp->gr_linkdownmp = NULL; + ASSERT(mp != NULL); + put(illg->ig_ipmp_ill->ill_rq, mp); + } +} + +/* + * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) + * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. + */ +static void +ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) +{ + ipif_t *ipif; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); + + /* + * If `ill' is truly down, there are no messages to generate since: + * + * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface + * and its addresses by bringing them down. But that's already + * true, so there's nothing to hide. + * + * 2. If cmd == RTM_ADD, then we're supposed to generate messages + * indicating that any previously-hidden up addresses are again + * back up (along with the interface). But they aren't, so + * there's nothing to expose. 
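From user space these RTM_ADD/RTM_DELETE notifications arrive on an ordinary routing socket: a listener that is not IPMP-aware sees an interface's addresses withdrawn when its phyint joins a group and re-announced when it leaves (ipmp_phyint_join_grp() and ipmp_phyint_leave_grp() below drive this). A minimal listener, with nothing IPMP-specific in it, would look roughly like:

#include <sys/types.h>
#include <sys/socket.h>
#include <net/route.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[2048];
	ssize_t n;
	int s = socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC);

	if (s == -1)
		return (1);
	while ((n = read(s, buf, sizeof (buf))) > 0) {
		struct rt_msghdr *rtm = (struct rt_msghdr *)buf;

		(void) printf("routing message type %d, len %d\n",
		    rtm->rtm_type, rtm->rtm_msglen);
	}
	return (0);
}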
+ */ + if (ill->ill_ipif_up_count == 0) + return; + + if (cmd == RTM_ADD) + ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + if (ipif->ipif_flags & IPIF_UP) + ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); + + if (cmd == RTM_DELETE) + ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); +} + +/* + * Bind the address named by `ipif' to the underlying ill named by `ill'. + * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' + * will indicate to the resolver whether this is an initial bringup of + * `ipif', or just a rebind to another ill. + */ +static void +ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) +{ + int err = 0; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); + ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); + ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); + ASSERT(ipif->ipif_bound_ill == NULL); + ASSERT(ipif->ipif_bound_next == NULL); + + ipif->ipif_bound_next = ill->ill_bound_ipif; + ill->ill_bound_ipif = ipif; + ill->ill_bound_cnt++; + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipif->ipif_bound_ill = ill; + rw_exit(&ipst->ips_ipmp_lock); + + /* + * If necessary, tell ARP/NDP about the new mapping. Note that + * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. + */ + if (act != Res_act_none) { + if (ill->ill_isv6) { + VERIFY(ipif_resolver_up(ipif, act) == 0); + err = ipif_ndp_up(ipif, act == Res_act_initial); + } else { + err = ipif_resolver_up(ipif, act); + } + + /* + * Since ipif_ndp_up() never returns EINPROGRESS and + * ipif_resolver_up() only returns EINPROGRESS when the + * associated ill is not up, we should never be here with + * EINPROGRESS. We rely on this to simplify the design. + */ + ASSERT(err != EINPROGRESS); + } + /* TODO: retry binding on failure? when? */ + ipif->ipif_bound = (err == 0); +} + +/* + * Unbind the address named by `ipif' from the underlying ill named by `ill'. + * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. + * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is + * B_TRUE, notify the resolver about the change. + */ +static ipif_t * +ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) +{ + ill_t *ipmp_ill; + ipif_t *previpif; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(IS_UNDER_IPMP(ill)); + + ipmp_ill = ill->ill_grp->ig_ipmp_ill; + + /* + * If necessary, find an ipif to unbind. + */ + if (ipif == NULL) { + if ((ipif = ill->ill_bound_ipif) == NULL) { + ASSERT(ill->ill_bound_cnt == 0); + return (NULL); + } + } + + ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(IS_IPMP(ipif->ipif_ill)); + ASSERT(ipif->ipif_bound_ill == ill); + ASSERT(ill->ill_bound_cnt > 0); + + /* + * Unbind it. + */ + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + ipif->ipif_bound_ill = NULL; + rw_exit(&ipst->ips_ipmp_lock); + ill->ill_bound_cnt--; + + if (ill->ill_bound_ipif == ipif) { + ill->ill_bound_ipif = ipif->ipif_bound_next; + } else { + previpif = ill->ill_bound_ipif; + while (previpif->ipif_bound_next != ipif) + previpif = previpif->ipif_bound_next; + + previpif->ipif_bound_next = ipif->ipif_bound_next; + } + ipif->ipif_bound_next = NULL; + + /* + * If requested, notify the resolvers (provided we're bound). 
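The removal just above from the ill_bound_ipif chain is the usual embedded singly-linked-list unlink, shown generically below (node_t and n_next are illustrative names): the head node is handled separately from an interior node.

typedef struct node {
	struct node *n_next;
} node_t;

static void
unlink_node(node_t **headp, node_t *np)
{
	node_t *prev;

	if (*headp == np) {
		*headp = np->n_next;
	} else {
		for (prev = *headp; prev->n_next != np; prev = prev->n_next)
			;
		prev->n_next = np->n_next;
	}
	np->n_next = NULL;
}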
+ */ + if (notifyres && ipif->ipif_bound) { + if (ill->ill_isv6) { + ipif_ndp_down(ipif); + } else { + ASSERT(ipif->ipif_arp_del_mp != NULL); + putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); + ipif->ipif_arp_del_mp = NULL; + } + } + ipif->ipif_bound = B_FALSE; + + return (ipif); +} + +/* + * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if + * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this + * to determine whether an ill should be considered active, other consumers + * may race and learn about an ill that should be deactivated/activated before + * IPMP has performed the activation/deactivation. This should be safe though + * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that + * would've been cleaned up by ipmp_ill_deactivate(). + */ +boolean_t +ipmp_ill_is_active(ill_t *ill) +{ + phyint_t *phyi = ill->ill_phyint; + + ASSERT(IS_UNDER_IPMP(ill)); + ASSERT(IAM_WRITER_ILL(ill) || + (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); + + /* + * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to + * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the + * link flapping logic to be just in in.mpathd and allows us to ignore + * changes to PHYI_RUNNING. + */ + return (!(ill->ill_ipif_up_count == 0 || + (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); +} + +/* + * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet + * IREs with a source address on `ill_arg'. + */ +static void +ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) +{ + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(!IS_IPMP(ill)); + + if (ire->ire_ipif->ipif_ill != ill) + return; + + switch (ire->ire_type) { + case IRE_HOST: + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_CACHE: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); + ire->ire_marks |= IRE_MARK_TESTHIDDEN; + break; + default: + break; + } +} + +/* + * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source + * address on `ill_arg'. + */ +static void +ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) +{ + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(!IS_IPMP(ill)); + + if (ire->ire_ipif->ipif_ill == ill) { + DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); + ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; + } +} + +/* + * Return a held pointer to the IPMP ill for underlying interface `ill', or + * NULL if one doesn't exist. (Unfortunately, this function needs to take an + * underlying ill rather than an ipmp_illgrp_t because an underlying ill's + * ill_grp pointer may become stale when not under an IPSQ and not holding + * ipmp_lock.) Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_ill_hold_ipmp_ill(ill_t *ill) +{ + ip_stack_t *ipst = ill->ill_ipst; + ipmp_illgrp_t *illg; + + ASSERT(!IS_IPMP(ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + illg = ill->ill_grp; + if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) { + ill_refhold(illg->ig_ipmp_ill); + rw_exit(&ipst->ips_ipmp_lock); + return (illg->ig_ipmp_ill); + } + /* + * Assume `ill' was removed from the illgrp in the meantime. + */ + rw_exit(&ill->ill_ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Return the interface index for the IPMP ill tied to underlying interface + * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. 
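Since ipmp_ill_hold_ipmp_ill() above returns a refheld ill, every successful call must be paired with an ill_refrele(). A typical caller would be shaped like this (example_use is an illustrative name):

static void
example_use(ill_t *under_ill)
{
	ill_t *ipmp_ill;

	if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(under_ill)) == NULL)
		return;		/* not (or no longer) in an IPMP group */

	/* ... use ipmp_ill, e.g. read its ifindex or flags ... */

	ill_refrele(ipmp_ill);
}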
+ */ +uint_t +ipmp_ill_get_ipmp_ifindex(const ill_t *ill) +{ + uint_t ifindex = 0; + ip_stack_t *ipst = ill->ill_ipst; + ipmp_grp_t *grp; + + ASSERT(!IS_IPMP(ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + if ((grp = ill->ill_phyint->phyint_grp) != NULL) + ifindex = grp->gr_phyint->phyint_ifindex; + rw_exit(&ipst->ips_ipmp_lock); + return (ifindex); +} + +/* + * Place phyint `phyi' into IPMP group `grp'. + */ +void +ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) +{ + ill_t *ill; + ipsq_t *ipsq = phyi->phyint_ipsq; + ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + + ASSERT(IAM_WRITER_IPSQ(ipsq)); + ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); + + /* + * Send routing socket messages indicating that the phyint's ills + * and ipifs vanished. + */ + if (phyi->phyint_illv4 != NULL) { + ill = phyi->phyint_illv4; + ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); + } + + if (phyi->phyint_illv6 != NULL) { + ill = phyi->phyint_illv6; + ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); + } + + /* + * Snapshot the phyint's initial kstats as a baseline. + */ + ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0); + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + + phyi->phyint_grp = grp; + if (++grp->gr_nif == 1) + grp->gr_mactype = ill->ill_mactype; + else + ASSERT(grp->gr_mactype == ill->ill_mactype); + + /* + * Now that we're in the group, request a switch to the group's xop + * when we ipsq_exit(). All future operations will be exclusive on + * the group xop until ipmp_phyint_leave_grp() is called. + */ + ASSERT(ipsq->ipsq_swxop == NULL); + ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop); + ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop; + + rw_exit(&ipst->ips_ipmp_lock); +} + +/* + * Remove phyint `phyi' from its current IPMP group. + */ +void +ipmp_phyint_leave_grp(phyint_t *phyi) +{ + uint_t i; + ipsq_t *ipsq = phyi->phyint_ipsq; + ip_stack_t *ipst = PHYINT_TO_IPST(phyi); + uint64_t phyi_kstats[IPMP_KSTAT_MAX]; + + ASSERT(IAM_WRITER_IPSQ(ipsq)); + + /* + * If any of the phyint's ills are still in an illgrp, kick 'em out. + */ + if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4)) + ipmp_ill_leave_illgrp(phyi->phyint_illv4); + if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6)) + ipmp_ill_leave_illgrp(phyi->phyint_illv6); + + /* + * Send routing socket messages indicating that the phyint's ills + * and ipifs have reappeared. + */ + if (phyi->phyint_illv4 != NULL) + ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD); + if (phyi->phyint_illv6 != NULL) + ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD); + + /* + * Calculate the phyint's cumulative kstats while it was in the group, + * and add that to the group's baseline. + */ + ipmp_phyint_get_kstats(phyi, phyi_kstats); + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + phyi_kstats[i] -= phyi->phyint_kstats0[i]; + atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]); + } + + rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); + + phyi->phyint_grp->gr_nif--; + phyi->phyint_grp = NULL; + + /* + * As our final act in leaving the group, request a switch back to our + * IPSQ's own xop when we ipsq_exit(). + */ + ASSERT(ipsq->ipsq_swxop == NULL); + ipsq->ipsq_swxop = &ipsq->ipsq_ownxop; + + rw_exit(&ipst->ips_ipmp_lock); +} + +/* + * Store the IPMP-related kstats for `phyi' into the array named by `kstats'. + * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements. 
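Concrete numbers make the baseline arithmetic in ipmp_phyint_join_grp() and ipmp_phyint_leave_grp() above easier to follow; the figures below are illustrative.

/*
 * join:   phyint_kstats0[ipackets] = 1000   (snapshot taken at join)
 * leave:  current ipackets         = 1800
 *         delta = 1800 - 1000      = 800    (traffic while grouped)
 *         gr_kstats0[ipackets]    += 800    (credited to the group)
 *
 * The group's cumulative counters therefore survive members joining and
 * leaving, since each member contributes only its in-group delta.
 */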
+ */ +static void +ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[]) +{ + uint_t i, j; + const char *name; + kstat_t *ksp; + kstat_named_t *kn; + + bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX); + + /* + * NOTE: ALL_ZONES here assumes that there's at most one link + * with a given name on a given system (safe for now). + */ + ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES); + if (ksp == NULL) + return; + + KSTAT_ENTER(ksp); + + if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { + /* + * Bring kstats up-to-date before recording. + */ + (void) KSTAT_UPDATE(ksp, KSTAT_READ); + + kn = KSTAT_NAMED_PTR(ksp); + for (i = 0; i < IPMP_KSTAT_MAX; i++) { + name = ipmp_kstats[i].name; + kstats[i] = 0; + for (j = 0; j < ksp->ks_ndata; j++) { + if (strcmp(kn[j].name, name) != 0) + continue; + + switch (kn[j].data_type) { + case KSTAT_DATA_INT32: + case KSTAT_DATA_UINT32: + kstats[i] = kn[j].value.ui32; + break; +#ifdef _LP64 + case KSTAT_DATA_LONG: + case KSTAT_DATA_ULONG: + kstats[i] = kn[j].value.ul; + break; +#endif + case KSTAT_DATA_INT64: + case KSTAT_DATA_UINT64: + kstats[i] = kn[j].value.ui64; + break; + } + break; + } + } + } + + KSTAT_EXIT(ksp); + kstat_rele(ksp); +} + +/* + * Refresh the active state of all ills on `phyi'. + */ +void +ipmp_phyint_refresh_active(phyint_t *phyi) +{ + if (phyi->phyint_illv4 != NULL) + ipmp_ill_refresh_active(phyi->phyint_illv4); + if (phyi->phyint_illv6 != NULL) + ipmp_ill_refresh_active(phyi->phyint_illv6); +} + +/* + * Return a held pointer to the underlying ill bound to `ipif', or NULL if one + * doesn't exist. Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_ipif_hold_bound_ill(const ipif_t *ipif) +{ + ill_t *boundill; + ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + + ASSERT(IS_IPMP(ipif->ipif_ill)); + + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + boundill = ipif->ipif_bound_ill; + if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) { + ill_refhold(boundill); + rw_exit(&ipst->ips_ipmp_lock); + return (boundill); + } + rw_exit(&ipst->ips_ipmp_lock); + return (NULL); +} + +/* + * Return a pointer to the underlying ill bound to `ipif', or NULL if one + * doesn't exist. Caller must be inside the IPSQ. + */ +ill_t * +ipmp_ipif_bound_ill(const ipif_t *ipif) +{ + ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); + ASSERT(IS_IPMP(ipif->ipif_ill)); + + return (ipif->ipif_bound_ill); +} + +/* + * Check if `ipif' is a "stub" (placeholder address not being used). + */ +boolean_t +ipmp_ipif_is_stubaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_UP) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr == INADDR_ANY); +} + +/* + * Check if `ipif' is an IPMP data address. + */ +boolean_t +ipmp_ipif_is_dataaddr(const ipif_t *ipif) +{ + if (ipif->ipif_flags & IPIF_NOFAILOVER) + return (B_FALSE); + if (ipif->ipif_ill->ill_isv6) + return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); + else + return (ipif->ipif_lcl_addr != INADDR_ANY); +} + +/* + * Check if `ipif' is an IPIF_UP IPMP data address. 
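A few example addresses show how the stubaddr/dataaddr predicates above classify things (addresses and flags are illustrative; 192.0.2.0/24 is a documentation prefix).

/*
 *   address             IPIF_UP   IPIF_NOFAILOVER   stubaddr?   dataaddr?
 *   0.0.0.0 (down)       no         no                yes         no
 *   192.0.2.1 (test)     yes        yes               no          no
 *   192.0.2.2 (data)     yes        no                no          yes
 */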
+ */ +static boolean_t +ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) +{ + return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); +} diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index 4999f28d1e..2751b19993 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -561,7 +561,6 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case SO_TYPE: *i1 = SOCK_RAW; break; - /* * The following three items are available here, * but are only meaningful to IP. @@ -597,6 +596,15 @@ rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (-1); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + mutex_enter(&connp->conn_lock); + *i1 = connp->conn_rtaware; + mutex_exit(&connp->conn_lock); + break; + } + break; default: return (-1); } @@ -701,6 +709,20 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, return (EINVAL); } break; + case SOL_ROUTE: + switch (name) { + case RT_AWARE: + if (!checkonly) { + mutex_enter(&connp->conn_lock); + connp->conn_rtaware = *i1; + mutex_exit(&connp->conn_lock); + } + break; /* goto sizeof (int) option return */ + default: + *outlenp = 0; + return (EINVAL); + } + break; default: *outlenp = 0; return (EINVAL); diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index bac0eabdc4..7397b53b9e 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,7 @@ opdes_t rts_opt_arr[] = { { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; /* diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index f785d8a3f6..8a3aa86d60 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
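The RT_AWARE handling added to rts.c and rts_opt_data.c above is driven from user space. A sketch of how an IPMP-aware daemon (in.mpathd, say) might mark its routing socket so it keeps receiving messages for interfaces that are under IPMP; the RT_AWARE and RTAW_UNDER_IPMP definitions are assumed here to be exported via <net/route.h>:

#include <sys/types.h>
#include <sys/socket.h>
#include <net/route.h>

static int
set_rt_aware(int rtsock)
{
	int aware = RTAW_UNDER_IPMP;

	/* Stored by the kernel in conn_rtaware (see rts_do_opt_set() above) */
	return (setsockopt(rtsock, SOL_ROUTE, RT_AWARE, &aware,
	    sizeof (aware)));
}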
*/ @@ -3989,7 +3989,7 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) ipsec_out_t *io; boolean_t v4; mblk_t *mp; - boolean_t secure, attach_if; + boolean_t secure; uint_t ifindex; ipsec_selector_t sel; ipsec_action_t *reflect_action = NULL; @@ -4012,7 +4012,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) } else if (!ii->ipsec_in_loopback) reflect_action = ipsec_in_to_out_action(ii); secure = ii->ipsec_in_secure; - attach_if = ii->ipsec_in_attach_if; ifindex = ii->ipsec_in_ill_index; zoneid = ii->ipsec_in_zoneid; ASSERT(zoneid != ALL_ZONES); @@ -4057,7 +4056,6 @@ ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h) io->ipsec_out_proc_begin = B_FALSE; io->ipsec_out_secure = secure; io->ipsec_out_v4 = v4; - io->ipsec_out_attach_if = attach_if; io->ipsec_out_ill_index = ifindex; io->ipsec_out_zoneid = zoneid; io->ipsec_out_ns = ns; /* No netstack_hold */ @@ -4549,7 +4547,6 @@ ipsec_out_to_in(mblk_t *ipsec_mp) ii->ipsec_in_secure = B_TRUE; ii->ipsec_in_v4 = v4; ii->ipsec_in_icmp_loopback = icmp_loopback; - ii->ipsec_in_attach_if = B_FALSE; } /* diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index d463c3f6ee..ad331d5706 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,10 +133,8 @@ typedef struct ip6_info ip6i_t; #define IP6I_RAW_CHECKSUM 0x10 /* Compute checksum and stuff in ip6i_checksum_off */ #define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */ -#define IP6I_ATTACH_IF 0x40 /* Bind to no failover address or BOUND_PIF. */ -#define IP6I_DROP_IFDELAYED 0x80 - /* Drop the packet if delayed in ndp resolver */ -#define IP6I_ND_DELAYED 0x100 /* Packet was delayed in ndp resolver */ +#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */ + /* 0x80 - 0x100 available */ #define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */ #define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */ @@ -340,7 +338,7 @@ extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t, extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t, boolean_t, boolean_t, zoneid_t, ip_stack_t *); extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *, - icmp6_t *, ill_t *, boolean_t, zoneid_t); + icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t); extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t); extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *); extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *); @@ -382,7 +380,7 @@ extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, const in6_addr_t *, mblk_t *); extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, - in6_addr_t, int, zoneid_t); + const in6_addr_t *, const in6_addr_t *, int, zoneid_t); extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *); extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index c5982de059..094800197e 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -80,7 +80,7 @@ extern "C" { */ #define IFF_PHYINT_FLAGS (IFF_LOOPBACK|IFF_RUNNING|IFF_PROMISC| \ IFF_ALLMULTI|IFF_INTELLIGENT|IFF_MULTI_BCAST|IFF_FAILED|IFF_STANDBY| \ - IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL) + IFF_INACTIVE|IFF_OFFLINE|IFF_VIRTUAL|IFF_IPMP) #define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \ IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \ @@ -91,11 +91,6 @@ extern "C" { IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \ IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) -#define IPIF_REPL_CHECK(to_ipif, failback_cmd) \ - (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \ - !(to_ipif)->ipif_isv6 && !((to_ipif)->ipif_flags & IPIF_UP) && \ - (to_ipif)->ipif_lcl_addr == INADDR_ANY)) - #define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */ #define PHYI_RUNNING IFF_RUNNING /* resources allocated */ #define PHYI_PROMISC IFF_PROMISC /* receive all packets */ @@ -107,6 +102,7 @@ extern "C" { #define PHYI_INACTIVE IFF_INACTIVE /* Standby active or not ? */ #define PHYI_OFFLINE IFF_OFFLINE /* NIC has been offlined */ #define PHYI_VIRTUAL IFF_VIRTUAL /* Will not send or recv pkts */ +#define PHYI_IPMP IFF_IPMP /* IPMP meta-interface */ #define ILLF_DEBUG IFF_DEBUG /* turn on debugging */ #define ILLF_NOTRAILERS IFF_NOTRAILERS /* avoid use of trailers */ @@ -137,11 +133,6 @@ extern "C" { #define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ -/* Source selection values for ipif_select_source_v6 */ -#define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */ -#define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ -#define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ - #ifdef DEBUG #define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill) #else @@ -151,24 +142,23 @@ extern "C" { /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ - Res_act_move, /* address move (IPMP, new DL addr) */ - Res_act_defend /* address defense */ + Res_act_rebind, /* IPMP address rebind (new hwaddr) */ + Res_act_defend, /* address defense */ + Res_act_none /* do nothing */ }; -extern ill_t *illgrp_scheduler(ill_t *); -extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t); -extern mblk_t *ipif_area_alloc(ipif_t *); +extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t); +extern mblk_t *ipif_area_alloc(ipif_t *, uint_t); extern mblk_t *ipif_ared_alloc(ipif_t *); extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t); -extern void ill_dlpi_done(ill_t *, t_uscalar_t); +extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *); extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); -extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *); -extern ill_t *ill_group_lookup_on_name(char *, boolean_t, ip_stack_t *); /* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */ extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *); @@ -180,6 +170,7 @@ extern ill_t *ill_lookup_on_name(char *, boolean_t, extern uint_t ill_get_next_ifindex(uint_t, 
boolean_t, ip_stack_t *); extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *); extern void ill_ipif_cache_delete(ire_t *, char *); +extern void ill_stq_cache_delete(ire_t *, char *); extern void ill_delete(ill_t *); extern void ill_delete_tail(ill_t *); extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *); @@ -193,9 +184,9 @@ extern void ill_frag_prune(ill_t *, uint_t); extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int); extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); -extern int ill_nominate_mcast_rcv(ill_group_t *); -extern boolean_t ill_setdefaulttoken(ill_t *); +extern void ill_refresh_bcast(ill_t *); extern void ill_restart_dad(ill_t *, boolean_t); +extern boolean_t ill_setdefaulttoken(ill_t *); extern int ill_set_phys_addr(ill_t *, mblk_t *); extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t); @@ -222,11 +213,9 @@ extern void ill_capability_reset(ill_t *, boolean_t); extern void ill_taskq_dispatch(ip_stack_t *); extern void ill_mtu_change(ire_t *, char *); -extern void ill_group_cleanup(ill_t *); -extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); -extern boolean_t ill_is_probeonly(ill_t *); -extern boolean_t ill_hook_event_create(ill_t *, lif_if_t, nic_event_t, - nic_event_data_t, size_t); +extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); +extern uint_t ill_appaddr_cnt(const ill_t *); +extern uint_t ill_ptpaddr_cnt(const ill_t *); extern void ip_loopback_cleanup(ip_stack_t *); extern void ipif_get_name(const ipif_t *, char *, int); @@ -239,6 +228,8 @@ extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *, + ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); @@ -251,31 +242,30 @@ extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t); extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t); -extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, - ipif_t **); -extern boolean_t ipif_lookup_zoneid_group(ill_t *, zoneid_t, int, - ipif_t **); +extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **); extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t); extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t); extern void ipif_refhold(ipif_t *); extern void ipif_refhold_locked(ipif_t *); -extern void ipif_refrele(ipif_t *); +extern void ipif_refrele(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ipif_resolver_down(ipif_t *); extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); extern void ipif_down_tail(ipif_t *); +extern void ipif_multicast_down(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); -extern int ipif_ndp_up(ipif_t *); +extern int ipif_ndp_up(ipif_t *, boolean_t); extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **); extern int ipif_up_done(ipif_t *); extern int ipif_up_done_v6(ipif_t *); extern void ipif_up_notify(ipif_t *); -extern void 
ipif_update_other_ipifs_v6(ipif_t *, ill_group_t *); +extern void ipif_update_other_ipifs_v6(ipif_t *); extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *); extern void ill_update_source_selection(ill_t *); -extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, uint_t, +extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t, uint32_t, zoneid_t); extern boolean_t ipif_cant_setlinklocal(ipif_t *); extern int ipif_setlinklocal(ipif_t *); @@ -284,11 +274,8 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); -extern void ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); - -extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *, - boolean_t); +extern int ipif_up(ipif_t *, queue_t *, mblk_t *); extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *); @@ -451,13 +438,13 @@ extern int ip_sioctl_tmyaddr(ipif_t *, sin_t *, queue_t *, mblk_t *, extern int ip_sioctl_tunparam(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); +extern int ip_sioctl_get_binding(ipif_t *, sin_t *, queue_t *, + mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_groupname(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_slifoindex(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_get_oindex(ipif_t *, sin_t *, queue_t *, +extern int ip_sioctl_groupinfo(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifzone(ipif_t *, sin_t *, queue_t *, @@ -473,15 +460,11 @@ extern int ip_sioctl_slifusesrc(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_sioctl_set_ipmpfailback(ipif_t *, sin_t *, queue_t *, - mblk_t *, ip_ioctl_cmd_t *, void *); extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *); -extern void ip_sioctl_iocack(queue_t *, mblk_t *); +extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *); extern ip_ioctl_cmd_t *ip_sioctl_lookup(int); -extern int ip_sioctl_move(ipif_t *, sin_t *, queue_t *, mblk_t *, - ip_ioctl_cmd_t *, void *); extern void conn_delete_ire(conn_t *, caddr_t); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index dae62ab499..369ba60005 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -394,11 +394,9 @@ typedef struct ip_lso_info_s { #define CONN_IS_LSO_MD_FASTPATH(connp) \ ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ - (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ - (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ -/* Definitons for fragmenting IP packets using MDT. 
*/ +/* Definitions for fragmenting IP packets using MDT. */ /* * Smaller and private version of pdescinfo_t used specifically for IP, diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index 7accbbcfa3..0a9f8add85 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -86,31 +86,17 @@ extern "C" { /* return the ire. No recursive */ /* lookup should be done. */ #define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */ -#define MATCH_IRE_MARK_HIDDEN 0x0400 /* Match IRE ire_marks with */ - /* IRE_MARK_HIDDEN. */ +#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */ + /* - * MATCH_IRE_ILL is used whenever we want to specifically match an IRE - * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given - * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies - * that the packet will not be load balanced. This is normally used - * by in.mpathd to send out failure detection probes. - * - * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which - * interface (ill) the packet should be sent out. This implies that the - * packets will be subjected to load balancing and it might go out on - * any interface in the group. When there is only interface in the group, - * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses - * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where - * we want to disable load balancing. - * * MATCH_IRE_PARENT is used whenever we unconditionally want to get the * parent IRE (sire) while recursively searching IREs for an offsubnet * destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE * is found to help resolving IRE_OFFSUBNET in lookup routines, the * IRE_OFFSUBNET sire, if any, is returned to the caller. */ -#define MATCH_IRE_ILL_GROUP 0x0800 /* Match IRE on ill or the ill_group. */ -#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill only */ +/* UNUSED 0x0800 */ +#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */ #define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */ /* even if ire is not matched. 
*/ @@ -305,7 +291,7 @@ extern ire_t *ire_ihandle_lookup_onlink(ire_t *); extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *); extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *); -extern boolean_t ire_local_same_ill_group(ire_t *, ire_t *); +extern boolean_t ire_local_same_lan(ire_t *, ire_t *); extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *, const struct ts_label_s *, ip_stack_t *); @@ -354,7 +340,7 @@ extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *); extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *); extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *); -extern void ire_arpresolve(ire_t *, ill_t *); +extern void ire_arpresolve(ire_t *); extern void ire_freemblk(ire_t *); extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t, int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int, diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h index a3f4282cc7..7dee133967 100644 --- a/usr/src/uts/common/inet/ip_multi.h +++ b/usr/src/uts/common/inet/ip_multi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -49,6 +49,15 @@ typedef enum { } ilg_stat_t; /* + * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread(). + */ +typedef enum { + IP_MRT_STOP = 0x1, /* request to stop thread */ + IP_MRT_DONE = 0x2, /* indication that thread is stopped */ + IP_MRT_RUN = 0x4 /* request to restart timers */ +} ip_mrt_flags_t; + +/* * Extern functions */ extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *); @@ -78,9 +87,7 @@ extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *); extern void ilm_free(ipif_t *); extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t); extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *, - zoneid_t); -extern ilm_t *ilm_lookup_ill_index_v6(ill_t *, const in6_addr_t *, - int, zoneid_t); + boolean_t, zoneid_t); extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t); extern int ilm_numentries_v6(ill_t *, const in6_addr_t *); @@ -92,10 +99,10 @@ extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *); extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t, mcast_record_t, slist_t *); -extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, ilg_stat_t, mcast_record_t, slist_t *); extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t); -extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, int, +extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, zoneid_t, boolean_t, boolean_t); extern int ill_join_allmulti(ill_t *); extern void ill_leave_allmulti(ill_t *); @@ -140,9 +147,11 @@ extern void reset_conn_ipif(ipif_t *); extern void reset_conn_ill(ill_t *); extern void reset_mrt_ill(ill_t *); extern void reset_mrt_vif_ipif(ipif_t *); -extern void igmp_start_timers(unsigned, ip_stack_t *); -extern void mld_start_timers(unsigned, ip_stack_t *); +extern void mcast_restart_timers_thread(ip_stack_t *); extern void ilm_inactive(ilm_t *); +extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *); +extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *); +extern void ilm_walker_finish(ilm_walker_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h index 4dbb56a884..5eda155c0e 100644 --- a/usr/src/uts/common/inet/ip_ndp.h 
+++ b/usr/src/uts/common/inet/ip_ndp.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IP_NDP_H #define _INET_IP_NDP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/mutex.h> #include <sys/stream.h> #include <netinet/in.h> @@ -318,7 +316,8 @@ extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); extern void ndp_inactive(nce_t *); extern void ndp_input(ill_t *, mblk_t *, mblk_t *); extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *); -extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t); +extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *, + boolean_t); extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, mblk_t *); @@ -346,7 +345,7 @@ extern void nce_fastpath(nce_t *); extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); -extern int ndp_lookup_then_add_v6(ill_t *, uchar_t *, +extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, nce_t **); extern int ndp_lookup_then_add_v4(ill_t *, diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index 70b33e0278..61bc451995 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,19 +37,28 @@ extern "C" { */ #define TSOL_RTSA_REQUEST_MAX 1 /* one per route destination */ +/* + * Flags for RTS queuing operations. 
+ */ +#define RTSQ_UNDER_IPMP 0x01 /* send only on RTAW_UNDER_IPMP queues */ +#define RTSQ_NORMAL 0x02 /* send only on normal queues */ +#define RTSQ_ALL (RTSQ_UNDER_IPMP|RTSQ_NORMAL) /* send on all queues */ +#define RTSQ_DEFAULT 0x04 /* use standard filtering */ + #ifdef _KERNEL extern void ip_rts_change(int, ipaddr_t, ipaddr_t, - ipaddr_t, ipaddr_t, ipaddr_t, int, int, - int, ip_stack_t *); + ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *); extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int, ip_stack_t *); -extern void ip_rts_ifmsg(const ipif_t *); +extern void ip_rts_ifmsg(const ipif_t *, uint_t); -extern void ip_rts_newaddrmsg(int, int, const ipif_t *); +extern void ip_rts_xifmsg(const ipif_t *, uint64_t, uint64_t, uint_t); + +extern void ip_rts_newaddrmsg(int, int, const ipif_t *, uint_t); extern int ip_rts_request(queue_t *, mblk_t *, cred_t *); @@ -70,9 +79,11 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *, extern size_t rts_header_msg_size(int); -extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *); +extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t, + ip_stack_t *); extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 3c53e1a3d3..750378f587 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ extern "C" { #include <sys/netstack.h> #include <netinet/igmp_var.h> +#include <sys/modhash.h> #ifdef _KERNEL #include <sys/list.h> @@ -172,9 +173,6 @@ struct ip_stack { krwlock_t ips_ill_g_usesrc_lock; - struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */ - struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */ - /* Taskq dispatcher for capability operations */ kmutex_t ips_capab_taskq_lock; kcondvar_t ips_capab_taskq_cv; @@ -204,7 +202,6 @@ struct ip_stack { int ips_igmp_timer_scheduled_last; int ips_igmp_deferred_next; timeout_id_t ips_igmp_timeout_id; - kthread_t *ips_igmp_timer_thread; boolean_t ips_igmp_timer_setter_active; /* Following protected by mld_timer_lock */ @@ -212,7 +209,6 @@ struct ip_stack { int ips_mld_timer_scheduled_last; int ips_mld_deferred_next; timeout_id_t ips_mld_timeout_id; - kthread_t *ips_mld_timer_thread; boolean_t ips_mld_timer_setter_active; /* Protected by igmp_slowtimeout_lock */ @@ -269,8 +265,6 @@ struct ip_stack { int ips_ip_g_forward; int ips_ipv6_forward; - int ips_ipmp_hook_emulation; /* ndd variable */ - time_t ips_ip_g_frag_timeout; clock_t ips_ip_g_frag_timo_ms; @@ -280,8 +274,6 @@ struct ip_stack { clock_t ips_icmp_pkt_err_last; /* Number of packets sent in burst */ uint_t ips_icmp_pkt_err_sent; - /* Used by icmp_send_redirect_v6 for picking random src. 
*/ - uint_t ips_icmp_redirect_v6_src_index; /* Protected by ip_mi_lock */ void *ips_ip_g_head; /* Instance Data List Head */ @@ -356,8 +348,6 @@ struct ip_stack { kstat_t *ips_loopback_ksp; - uint_t ips_ipif_src_random; - struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */ uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */ int ips_conn_drain_list_index; /* Next drain_list */ @@ -375,15 +365,6 @@ struct ip_stack { uint64_t ips_ipif_g_seqid; union phyint_list_u *ips_phyint_g_list; /* start of phyint list */ - /* - * Reflects value of FAILBACK variable in IPMP config file - * /etc/default/mpathd. Default value is B_TRUE. - * Set to B_FALSE if user disabled failback by configuring - * "FAILBACK=no" in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this - * information to kernel. - */ - boolean_t ips_ipmp_enable_failback; - /* ip_neti.c */ hook_family_t ips_ipv4root; hook_family_t ips_ipv6root; @@ -427,12 +408,25 @@ struct ip_stack { kcondvar_t ips_ipobs_cb_cv; struct __ldi_ident *ips_ldi_ident; + +/* ipmp.c */ + krwlock_t ips_ipmp_lock; + mod_hash_t *ips_ipmp_grp_hash; + +/* igmp.c */ + /* multicast restart timers thread logic */ + kmutex_t ips_mrt_lock; + uint_t ips_mrt_flags; + kcondvar_t ips_mrt_cv; + kcondvar_t ips_mrt_done_cv; + kthread_t *ips_mrt_thread; }; typedef struct ip_stack ip_stack_t; /* Finding an ip_stack_t */ #define CONNQ_TO_IPST(_q) (Q_TO_CONN(_q)->conn_netstack->netstack_ip) #define ILLQ_TO_IPST(_q) (((ill_t *)(_q)->q_ptr)->ill_ipst) +#define PHYINT_TO_IPST(phyi) ((phyi)->phyint_ipsq->ipsq_ipst) #else /* _KERNEL */ typedef int ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 5fb86a5262..d80123a977 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -249,7 +249,6 @@ struct conn_s { squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ - ill_t *conn_nofailover_ill; /* Failover ill */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ ipsec_latch_t *conn_latch; /* latched state */ ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */ @@ -295,7 +294,6 @@ struct conn_s { uint_t conn_proto; /* SO_PROTOTYPE state */ ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */ - ill_t *conn_outgoing_pill; /* IP{,V6}_BOUND_PIF */ ill_t *conn_oper_pending_ill; /* pending shared ioctl */ ilg_t *conn_ilg; /* Group memberships */ @@ -307,9 +305,6 @@ struct conn_s { struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */ ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */ - int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */ - int conn_orig_multicast_ifindex; - /* IPv6 MC IF before MOVE */ struct conn_s *conn_drain_next; /* Next conn in drain list */ struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ @@ -322,7 +317,7 @@ struct conn_s { uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */ #define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) cred_t *conn_peercred; /* Peer credentials, if any */ - + int conn_rtaware; /* RT_AWARE sockopt value */ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c index 577205f25a..e94af50424 100644 --- a/usr/src/uts/common/inet/ipnet/ipnet.c +++ b/usr/src/uts/common/inet/ipnet/ipnet.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 
Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -229,16 +229,19 @@ ipnet_if_init(void) int _init(void) { - int ret; + int ret; + boolean_t netstack_registered = B_FALSE; if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1) return (ENODEV); ipnet_minor_space = id_space_create("ipnet_minor_space", IPNET_MINOR_MIN, MAXMIN32); - netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + /* * We call ddi_taskq_create() with nthread == 1 to ensure in-order - * delivery of packets to clients. + * delivery of packets to clients. Note that we need to create the + * taskqs before calling netstack_register() since ipnet_stack_init() + * registers callbacks that use 'em. */ ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0); ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue", @@ -247,6 +250,10 @@ _init(void) ret = ENOMEM; goto done; } + + netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); + netstack_registered = B_TRUE; + if ((ret = ipnet_if_init()) == 0) ret = mod_install(&modlinkage); done: @@ -255,7 +262,8 @@ done: ddi_taskq_destroy(ipnet_taskq); if (ipnet_nicevent_taskq != NULL) ddi_taskq_destroy(ipnet_nicevent_taskq); - netstack_unregister(NS_IPNET); + if (netstack_registered) + netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); } return (ret); @@ -268,9 +276,10 @@ _fini(void) if ((err = mod_remove(&modlinkage)) != 0) return (err); + + netstack_unregister(NS_IPNET); ddi_taskq_destroy(ipnet_nicevent_taskq); ddi_taskq_destroy(ipnet_taskq); - netstack_unregister(NS_IPNET); id_space_destroy(ipnet_minor_space); return (0); } @@ -987,6 +996,7 @@ static boolean_t ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, ipnet_addrp_t *dst) { + boolean_t obsif; uint64_t ifindex = ipnet->ipnet_if->if_index; ipnet_addrtype_t srctype, dsttype; @@ -994,6 +1004,13 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, dsttype = ipnet_get_addrtype(ipnet, dst); /* + * If the packet's ifindex matches ours, or the packet's group ifindex + * matches ours, it's on the interface we're observing. (Thus, + * observing on the group ifindex matches all ifindexes in the group.) + */ + obsif = (ihd->ihd_ifindex == ifindex || ihd->ihd_grifindex == ifindex); + + /* * Do not allow an ipnet stream to see packets that are not from or to * its zone. The exception is when zones are using the shared stack * model. In this case, streams in the global zone have visibility @@ -1025,7 +1042,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * have our source address (this allows us to see packets we send). */ if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) { - if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR) + if (srctype == IPNETADDR_MYADDR || obsif) return (B_TRUE); } @@ -1033,7 +1050,7 @@ ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src, * We accept multicast and broadcast packets transmitted or received * on the interface we're observing. 
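The obsif test above widens what "the interface we're observing" matches when that interface is an IPMP group: observing the group's ipnet device now sees traffic on every interface in the group, while observing an underlying interface still matches only itself. The ifindex values below are illustrative.

/*
 *   observer ifindex     ihd_ifindex   ihd_grifindex   obsif
 *   5  (ipmp0)             7              5             true
 *   7  (under-ill)         7              5             true
 *   9  (unrelated NIC)     7              5             false
 */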
*/ - if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex) + if (dsttype == IPNETADDR_MBCAST && obsif) return (B_TRUE); return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h index b014bdade0..0348e10b91 100644 --- a/usr/src/uts/common/inet/ipsec_info.h +++ b/usr/src/uts/common/inet/ipsec_info.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IPSEC_INFO_H #define _INET_IPSEC_INFO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -114,12 +112,11 @@ typedef struct ipsec_in_s { ipsec_in_decaps : 1, /* Was this packet decapsulated from */ /* a matching inner packet? */ - ipsec_in_attach_if : 1, /* Don't load spread this packet */ ipsec_in_accelerated : 1, /* hardware accelerated packet */ ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */ /* all should trust this. */ - ipsec_in_pad_bits : 24; + ipsec_in_pad_bits : 25; int ipsec_in_ill_index; /* interface on which ipha_dst was */ /* configured when pkt was recv'd */ @@ -197,12 +194,11 @@ typedef struct ipsec_out_s { ipsec_out_reserved : 1, ipsec_out_v4 : 1, - ipsec_out_attach_if : 1, ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */ ipsec_out_reachable : 1, /* NDP reachability info */ ipsec_out_failed: 1, - ipsec_out_se_done: 1, + ipsec_out_esp_done: 1, ipsec_out_ah_done: 1, ipsec_out_need_policy: 1, @@ -225,7 +221,7 @@ typedef struct ipsec_out_s { */ ipsec_out_icmp_loopback: 1, ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */ - ipsec_out_pad_bits : 12; + ipsec_out_pad_bits : 13; cred_t *ipsec_out_cred; uint32_t ipsec_out_capab_ill_index; diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 5abfc06581..a467abaee9 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -17,9 +17,8 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -27,8 +26,6 @@ #ifndef _INET_MIB2_H #define _INET_MIB2_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <netinet/in.h> /* For in6_addr_t */ #include <sys/tsol/label.h> /* For brange_t */ #include <sys/tsol/label_macro.h> /* For brange_t */ @@ -65,9 +62,14 @@ extern "C" { * #define OPTLEN(x) ((((x) + sizeof(long) - 1) / sizeof(long)) * sizeof(long)) * #define OPTVAL(opt) ((char *)(opt + 1)) * - * For get requests (T_NEGOTIATE), any MIB2_xxx value can be used (only + * For get requests (T_CURRENT), any MIB2_xxx value can be used (only * "get all" is supported, so all modules get a copy of the request to - * return everything it knows. Recommend: Use MIB2_IP + * return everything it knows. In general, we use MIB2_IP. There is + * one exception: in general, IP will not report information related to + * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table). + * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause + * all information to be reported. This special value should only be + * used by IPMP-aware low-level utilities (e.g. in.mpathd). * * IMPORTANT: some fields are grouped in a different structure than * suggested by MIB-II, e.g., checksum error counts. 
The original MIB-2 @@ -79,7 +81,6 @@ extern "C" { #define IPPROTO_MAX 256 #endif - #define MIB2_SYSTEM (IPPROTO_MAX+1) #define MIB2_INTERFACES (IPPROTO_MAX+2) #define MIB2_AT (IPPROTO_MAX+3) @@ -108,12 +109,13 @@ extern "C" { #define EXPER_IGMP (EXPER+1) #define EXPER_DVMRP (EXPER+2) #define EXPER_RAWIP (EXPER+3) +#define EXPER_IP_AND_TESTHIDDEN (EXPER+4) /* * Define range of levels for experimental use */ #define EXPER_RANGE_START (EXPER+1) -#define EXPER_RANGE_END (EXPER+3) +#define EXPER_RANGE_END (EXPER+4) #define BUMP_MIB(s, x) { \ extern void __dtrace_probe___mib_##x(int, void *); \ diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c index 1761396031..94cc8e8883 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.c +++ b/usr/src/uts/common/inet/sctp/sctp_addr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h index 16ab99abab..7b20d3fd2b 100644 --- a/usr/src/uts/common/inet/sctp_ip.h +++ b/usr/src/uts/common/inet/sctp_ip.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_SCTP_IP_H #define _INET_SCTP_IP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 488f8ee021..68e0883222 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -31,7 +31,6 @@ #include <sys/strsubr.h> #include <sys/stropts.h> #include <sys/strlog.h> -#include <sys/strsun.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/timod.h> @@ -4683,18 +4682,10 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, /* ifindex must be already set */ ASSERT(ifindex != 0); - if (ltcp->tcp_bound_if != 0) { - /* - * Set newtcp's bound_if equal to - * listener's value. 
If ifindex is - * not the same as ltcp->tcp_bound_if, - * it must be a packet for the ipmp group - * of interfaces - */ + if (ltcp->tcp_bound_if != 0) tcp->tcp_bound_if = ltcp->tcp_bound_if; - } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) tcp->tcp_bound_if = ifindex; - } tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; tcp->tcp_recvifindex = 0; @@ -10716,9 +10707,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, ipp->ipp_fields |= IPPF_USE_MIN_MTU; ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: - /* Handled at the IP level */ - return (-EINVAL); case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -18895,7 +18883,6 @@ tcp_zcopy_check(tcp_t *tcp) connp->conn_dontroute == 0 && !connp->conn_nexthop_set && connp->conn_outgoing_ill == NULL && - connp->conn_nofailover_ill == NULL && do_tcpzcopy == 1) { /* * the checks above closely resemble the fast path checks @@ -19139,7 +19126,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) ipaddr_t dst; ire_t *ire; ill_t *ill; - conn_t *connp = tcp->tcp_connp; mblk_t *ire_fp_mp; tcp_stack_t *tcps = tcp->tcp_tcps; @@ -19164,14 +19150,6 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) } ill = ire_to_ill(ire); - if (connp->conn_outgoing_ill != NULL) { - ill_t *conn_outgoing_ill = NULL; - /* - * Choose a good ill in the group to send the packets on. - */ - ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); - ill = ire_to_ill(ire); - } ASSERT(ill != NULL); if (!tcp->tcp_ire_ill_check_done) { diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 15b5d04d61..8c8eee3b58 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <inet/common.h> #include <inet/optcom.h> #include <inet/ip.h> +#include <inet/ip_if.h> #include <inet/ip_impl.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index d977c27e53..e2314f8104 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -151,9 +151,6 @@ opdes_t tcp_opt_arr[] = { { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (in_addr_t), -1 /* not initialized */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 7c9433caa0..1178315cb5 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -80,6 +80,7 @@ #include <inet/ipp_common.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> +#include <sys/ethernet.h> /* * The ipsec_info.h header file is here since it has the definition for the @@ -2141,7 +2142,6 @@ udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: - case IP_DONTFAILOVER_IF: /* cannot "get" the value for these */ return (-1); case IP_BOUND_IF: @@ -3152,9 +3152,7 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, ipp->ipp_use_min_mtu = *i1; break; - case IPV6_BOUND_PIF: case IPV6_SEC_OPT: - case IPV6_DONTFAILOVER_IF: case IPV6_SRC_PREFERENCES: case IPV6_V6ONLY: /* Handled at the IP level */ @@ -5351,7 +5349,6 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) || connp->conn_dontroute || - connp->conn_nofailover_ill != NULL || connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 || optinfo.ip_opt_ill_index != 0 || ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || @@ -5419,8 +5416,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; ASSERT(ipif != NULL); - if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL || - stq_ill->ill_group != ipif->ipif_ill->ill_group)) + if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill)) retry_caching = B_TRUE; } @@ -5444,7 +5440,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) ASSERT(ipif != NULL); ire = ire_ctable_lookup(dst, 0, 0, ipif, connp->conn_zoneid, MBLK_GETLABEL(mp), - MATCH_IRE_ILL_GROUP, ipst); + MATCH_IRE_ILL, ipst); } else { ASSERT(ipif == NULL); ire = ire_cache_lookup(dst, connp->conn_zoneid, @@ -5622,12 +5618,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) } if (CLASSD(dst)) { - boolean_t ilm_exists; - - ILM_WALKER_HOLD(ill); - ilm_exists = (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL); - ILM_WALKER_RELE(ill); - if (ilm_exists) { + if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) { ip_multicast_loopback(q, ill, mp, connp->conn_multicast_loop ? 0 : IP_FF_NO_MCAST_LOOP, zoneid); diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 0ec5a2c45e..65729b82f1 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -132,9 +132,6 @@ opdes_t udp_opt_arr[] = { { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IP_DONTFAILOVER_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (struct in_addr), 0 /* not initialized */ }, - { IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), 0 }, @@ -191,12 +188,6 @@ opdes_t udp_opt_arr[] = { { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_DONTFAILOVER_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, - sizeof (int), 0 /* no ifindex */ }, - { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/vni/vni.c b/usr/src/uts/common/inet/vni/vni.c deleted file mode 100644 index a370a7b4be..0000000000 --- a/usr/src/uts/common/inet/vni/vni.c +++ /dev/null @@ -1,359 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - -#include "vni_impl.h" -#include <sys/conf.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/dlpi.h> -#include <sys/stat.h> -#include <sys/ethernet.h> -#include <sys/strsun.h> -#include <sys/stropts.h> - -static int vniopen(queue_t *, dev_t *, int, int, cred_t *); -static int vniclose(queue_t *, int, cred_t *); -static int vniwput(queue_t *, mblk_t *); -static int vniattach(dev_info_t *, ddi_attach_cmd_t); -static int vnidetach(dev_info_t *, ddi_detach_cmd_t); - -static struct module_info minfo = { - VNIIDNUM, /* mi_idnum */ - VNINAME, /* mi_idname */ - VNIMINPSZ, /* mi_minpsz */ - VNIMAXPSZ, /* mi_maxpsz */ - VNIHIWAT, /* mi_hiwat */ - VNILOWAT /* mi_lowat */ -}; - -static struct qinit vnirinit = { - NULL, /* qi_putp */ - NULL, /* qi_srvp */ - vniopen, /* qi_qopen */ - vniclose, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct qinit vniwinit = { - vniwput, /* qi_putp */ - NULL, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &minfo, /* qi_minfo */ - NULL /* qi_mstat */ -}; - -static struct streamtab vni_info = { - &vnirinit, /* st_rdinit */ - &vniwinit, /* st_wrinit */ - NULL, /* st_muxrinit */ - NULL /* st_muxwrinit */ -}; - -DDI_DEFINE_STREAM_OPS(vni_ops, nulldev, nulldev, vniattach, \ - vnidetach, nodev, nodev, VNIFLAGS, &vni_info, ddi_quiesce_not_supported); - -static struct modldrv modldrv = { - &mod_driverops, - "Virtual network interface", - &vni_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -static vni_str_t *vni_strlist_head; - -/* - * DL_INFO_ACK template for VNI pseudo interface. - */ -static dl_info_ack_t dlvni_infoack = { - DL_INFO_ACK, /* dl_primitive */ - 0, /* dl_max_sdu */ - 0, /* dl_min_sdu */ - 0, /* dl_addr_length */ - SUNW_DL_VNI, /* dl_mac_type */ - 0, /* dl_reserved */ - 0, /* dl_current_state */ - 0, /* dl_sap_length */ - DL_CLDLS, /* dl_service_mode */ - 0, /* dl_qos_length */ - 0, /* dl_qos_offset */ - 0, /* dl_range_length */ - 0, /* dl_range_offset */ - DL_STYLE2, /* dl_provider_style */ - 0, /* dl_addr_offset */ - DL_VERSION_2, /* dl_version */ - 0, /* dl_brdcst_addr_length */ - 0, /* dl_brdcst_addr_offset */ - 0 /* dl_growth */ -}; - -int -_init(void) -{ - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static int -vniattach(dev_info_t *devi, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - cmn_err(CE_NOTE, "vniattach failure: cmd != DDI_ATTACH\n"); - return (DDI_FAILURE); - } - - if (ddi_create_minor_node(devi, VNINAME, S_IFCHR, - ddi_get_instance(devi), DDI_PSEUDO, CLONE_DEV) == - DDI_FAILURE) { - ddi_remove_minor_node(devi, NULL); - cmn_err(CE_NOTE, "vniattach failure: ddi_create_minor_node\n"); - return (DDI_FAILURE); - } - - return (DDI_SUCCESS); -} - -static int -vnidetach(dev_info_t *devi, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - ddi_remove_minor_node(devi, NULL); - return (DDI_SUCCESS); -} - -/* ARGSUSED */ -static int -vniopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) -{ - vni_str_t *stp, *prevstp; - minor_t minordev = 0; - - if (sflag != CLONEOPEN) - return (EINVAL); - - prevstp = NULL; - - for (stp = vni_strlist_head; stp != NULL; stp = stp->st_next) { - if (minordev < stp->st_minor) - break; - minordev++; - prevstp = stp; - } - - stp = kmem_zalloc(sizeof (vni_str_t), KM_SLEEP); - - *devp 
= makedevice(getmajor(*devp), minordev); - - stp->st_minor = minordev; - stp->st_state = DL_UNATTACHED; - stp->st_next = NULL; - - q->q_ptr = stp; - WR(q)->q_ptr = stp; - - if (prevstp != NULL) { - stp->st_next = prevstp->st_next; - prevstp->st_next = stp; - } else { - stp->st_next = vni_strlist_head; - vni_strlist_head = stp; - } - - qprocson(q); - return (0); -} - -/* ARGSUSED */ -static int -vniclose(queue_t *q, int flag, cred_t *credp) -{ - vni_str_t *stp, **prevstpp; - - qprocsoff(q); - stp = (vni_str_t *)q->q_ptr; - stp->st_state = DL_UNATTACHED; - - /* Unlink the per-stream entry from the list and free it */ - stp = vni_strlist_head; - prevstpp = &vni_strlist_head; - - for (; stp != NULL; stp = stp->st_next) { - if (stp == (vni_str_t *)q->q_ptr) - break; - prevstpp = &stp->st_next; - } - - ASSERT(stp != NULL); - - *prevstpp = stp->st_next; - - kmem_free(stp, sizeof (vni_str_t)); - - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - -static int -vniwput(queue_t *q, mblk_t *mp) -{ - union DL_primitives *dlp; - vni_str_t *stp; - dl_info_ack_t *dlip; - t_scalar_t prim; - - stp = q->q_ptr; - - switch ((mp)->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - if (MBLKL(mp) < sizeof (t_scalar_t)) { - dlerrorack(q, mp, DL_PRIM_INVAL, DL_UNSUPPORTED, 0); - return (0); - } - dlp = (void *)mp->b_rptr; - prim = dlp->dl_primitive; - switch (prim) { - case DL_ATTACH_REQ: - if (MBLKL(mp) < DL_ATTACH_REQ_SIZE) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNATTACHED) { - dlerrorack(q, mp, DL_ATTACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_ppa = dlp->attach_req.dl_ppa; - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_ATTACH_REQ); - break; - case DL_BIND_REQ: - if (MBLKL(mp) < DL_BIND_REQ_SIZE) { - dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_BIND_REQ, DL_OUTSTATE, 0); - return (0); - } - stp->st_state = DL_IDLE; - dlbindack(q, mp, dlp->bind_req.dl_sap, NULL, 0, 0, 0); - break; - case DL_INFO_REQ: - if (MBLKL(mp) < DL_INFO_REQ_SIZE) { - dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0); - return (0); - } - if ((mp = mexchange(q, mp, sizeof (dl_info_ack_t), - M_PCPROTO, DL_INFO_ACK)) == NULL) { - return (0); - } - dlip = (void *)mp->b_rptr; - *dlip = dlvni_infoack; - dlip->dl_current_state = stp->st_state; - qreply(q, mp); - break; - case DL_PHYS_ADDR_REQ: - if (MBLKL(mp) < DL_PHYS_ADDR_REQ_SIZE) { - dlerrorack(q, mp, DL_PHYS_ADDR_REQ, DL_BADPRIM, - 0); - return (0); - } - dlphysaddrack(q, mp, NULL, 0); - break; - case DL_UNBIND_REQ: - if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_IDLE) { - dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, - 0); - return (0); - } - /* Nothing to flush. But DLPI spec says to; so do it */ - flushq(q, FLUSHALL); - flushq(RD(q), FLUSHALL); - stp->st_state = DL_UNBOUND; - dlokack(q, mp, DL_UNBIND_REQ); - break; - case DL_DETACH_REQ: - if (MBLKL(mp) < DL_DETACH_REQ_SIZE) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_BADPRIM, 0); - return (0); - } - if (stp->st_state != DL_UNBOUND) { - dlerrorack(q, mp, DL_DETACH_REQ, DL_OUTSTATE, - 0); - return (0); - } - stp->st_state = DL_UNATTACHED; - dlokack(q, mp, DL_DETACH_REQ); - break; - default: - dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); - } - break; - case M_IOCTL: - /* - * No ioctl's currently supported. 
Need to have the NAK since - * ifconfig calls SIOCGTUNPARAM during the end of plumb - */ - miocnak(q, mp, 0, EINVAL); - break; - case M_FLUSH: - /* Really nothing to flush since no msgs enqueued */ - if (*mp->b_rptr & FLUSHR) { - qreply(q, mp); - } else { - freemsg(mp); - } - break; - default: - freemsg(mp); - break; - } - return (0); -} diff --git a/usr/src/uts/common/inet/vni/vni_impl.h b/usr/src/uts/common/inet/vni/vni_impl.h deleted file mode 100644 index ffba1b08bf..0000000000 --- a/usr/src/uts/common/inet/vni/vni_impl.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _INET_VNI_IMPL_H -#define _INET_VNI_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/modctl.h> -#include <sys/stream.h> - -typedef struct vni_str { - struct vni_str *st_next; /* next in list */ - t_uscalar_t st_state; /* DLPI state */ - minor_t st_minor; /* corresponding minor */ - uint32_t st_ppa; /* physical point of attachment */ -} vni_str_t; - -#define DL_MAXPRIM DL_GET_STATISTICS_ACK -#define VNIIDNUM 0x2a84 -#define VNINAME "vni" -#define VNIFLAGS (D_MP|D_MTPERMOD) -#define VNIHIWAT 1024 -#define VNILOWAT 512 -#define VNIMINPSZ 0 -#define VNIMAXPSZ INFPSZ - -#ifdef __cplusplus -} -#endif - -#endif /* _INET_VNI_IMPL_H */ |